# Exploratory analysis
Getting to grips with the data and identifying potentially valuable features.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the labelled training split and the unlabelled test split from the
# local data/ directory, then preview the first rows of the training frame.
training = pd.read_csv(filepath_or_buffer="data/train.csv")
test = pd.read_csv(filepath_or_buffer="data/test.csv")
training.head(n=5)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [3]:
# First impressions worth following up:
# - Host and guest popularity could interact: rescale each to 0-1 and add a
#   combined factor.
# - The criterion variable, listening time, surely depends on the total episode
#   length. A potentially useful approach is to predict not the listening time
#   but the share of the episode that was listened to, and then convert that
#   back using each new episode's length to make the final predictions.

# Render floats with two decimals in every table displayed from here on.
pd.set_option("display.float_format", '{:.2f}'.format)

In [4]:
# Summary statistics for every training column: numeric ones get
# mean/std/quantiles, categorical ones get unique/top/freq.
training.describe(include="all")

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
count,750000.0,750000,750000,662907.0,750000,750000.0,750000,750000,603970.0,749999.0,750000,750000.0
unique,,48,100,,10,,7,4,,,3,
top,,Tech Talks,Episode 71,,Sports,,Sunday,Night,,,Neutral,
freq,,22847,10515,,87606,,115946,196849,,,251291,
mean,374999.5,,,64.5,,59.86,,,52.24,1.35,,45.44
std,216506.5,,,32.97,,22.87,,,28.45,1.15,,27.14
min,0.0,,,0.0,,1.3,,,0.0,0.0,,0.0
25%,187499.75,,,35.73,,39.41,,,28.38,0.0,,23.18
50%,374999.5,,,63.84,,60.05,,,53.58,1.0,,43.38
75%,562499.25,,,94.07,,79.53,,,76.6,2.0,,64.81


In [5]:
# Same summary for the test split, to compare its distributions with training.
# NOTE(review): in the recorded output, Episode_Length_minutes has mean 419.30
# and std 166854.50 against a median of 63.97, and Number_of_Ads has std 4.27;
# this suggests extreme values in the test split — check before using them.
test.describe(include="all")

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
count,250000.0,250000,250000,221264.0,250000,250000.0,250000,250000,201168.0,250000.0,250000
unique,,48,100,,10,,7,4,,,3
top,,Tech Talks,Episode 71,,Sports,,Sunday,Night,,,Neutral
freq,,7553,3492,,28903,,38906,65440,,,83671
mean,874999.5,,,419.3,,59.72,,,52.19,1.36,
std,72168.93,,,166854.5,,22.88,,,28.45,4.27,
min,750000.0,,,2.47,,2.49,,,0.0,0.0,
25%,812499.75,,,35.78,,39.25,,,28.32,0.0,
50%,874999.5,,,63.97,,59.9,,,53.36,1.0,
75%,937499.25,,,94.15,,79.39,,,76.56,2.0,


In [6]:
# All variables have an OK-ish rate of completeness, none with lots of NAs.
# In the recorded summary the sparsest training columns are
# Guest_Popularity_percentage (603,970 of 750,000 present) and
# Episode_Length_minutes (662,907 of 750,000 present).

# Let's look at the titles: how many follow the "Episode X" format, and could
# that be a useful feature?
# The check is case-insensitive and uses the .str accessor so that a missing
# title counts as "does not match" instead of raising on `.lower()`.
# `starts_episode` stays a plain list of bools, one entry per training row.
starts_episode = (
    training["Episode_Title"]
    .str.lower()
    .str.startswith("episode", na=False)
    .tolist()
)

pd.Series(starts_episode).value_counts()
# ALL of them! Super useful!

True    750000
Name: count, dtype: int64

In [7]:
# Extract the numeric part of "Episode X" as a new feature.
# .str.extract takes the first run of digits in each title. A title with no
# digits (or a missing title) becomes NaN rather than raising an error.
# pd.to_numeric yields an int64 column when every title parses, and float64
# with NaN otherwise.
episodes_no = pd.to_numeric(
    training["Episode_Title"].str.extract(r"(\d+)", expand=False)
)

training["episode_no"] = episodes_no
training.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.42,98
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01,26
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.93,16
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.28,45
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61,86


In [8]:
# Host x Guest popularity factor
#
# Both popularity columns are divided by 100 and overwritten in place, so
# running this cell a second time would divide them by 100 again. The combo
# column is created only in this cell, so its presence means the rescaling has
# already been applied to this frame and must be skipped.
if "Host_Guest_combo_percentage" not in training.columns:
    training["Host_Popularity_percentage"] = training["Host_Popularity_percentage"] / 100
    training["Guest_Popularity_percentage"] = training["Guest_Popularity_percentage"] / 100

# Product of the two rescaled popularities; NaN whenever either one is missing.
training["Host_Guest_combo_percentage"] = (
    training["Host_Popularity_percentage"] * training["Guest_Popularity_percentage"]
)

# Even a different metric, like the harmonic mean, that leans towards the lower
# rating could be fun.

training.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no,Host_Guest_combo_percentage
0,0,Mystery Matters,Episode 98,,True Crime,0.75,Thursday,Night,,0.0,Positive,31.42,98,
1,1,Joke Junction,Episode 26,119.8,Comedy,0.67,Saturday,Afternoon,0.76,2.0,Negative,88.01,26,0.51
2,2,Study Sessions,Episode 16,73.9,Education,0.7,Tuesday,Evening,0.09,0.0,Negative,44.93,16,0.06
3,3,Digital Digest,Episode 45,67.17,Technology,0.57,Monday,Morning,0.79,2.0,Positive,46.28,45,0.45
4,4,Mind & Body,Episode 86,110.51,Health,0.8,Monday,Afternoon,0.59,3.0,Neutral,75.61,86,0.47


In [9]:
# Weirdness in the data: some rows report more listening time than the episode
# is long. Tables show two decimals, so a few of these rows look like exact
# ties (e.g. 17.94 vs 17.94) although the stored listening time is larger.
training[training["Listening_Time_minutes"].gt(training["Episode_Length_minutes"])]

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no,Host_Guest_combo_percentage
113,113,Current Affairs,Episode 44,17.94,News,0.97,Saturday,Morning,0.13,1.00,Neutral,17.94,44,0.13
407,407,Sound Waves,Episode 56,11.26,Music,0.44,Wednesday,Night,0.66,0.00,Positive,11.27,56,0.29
864,864,Life Lessons,Episode 29,28.92,Lifestyle,0.50,Wednesday,Morning,0.20,2.00,Neutral,28.92,29,0.10
948,948,Educational Nuggets,Episode 59,96.11,Education,0.86,Tuesday,Evening,0.91,2.00,Neutral,96.11,59,0.78
1316,1316,Market Masters,Episode 71,64.08,Business,0.31,Sunday,Night,0.28,0.00,Positive,68.64,71,0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749074,749074,Gadget Geek,Episode 90,56.05,Technology,0.54,Monday,Morning,0.68,0.00,Positive,60.12,90,0.37
749554,749554,Educational Nuggets,Episode 37,43.08,Education,0.39,Wednesday,Afternoon,0.59,0.00,Negative,44.56,37,0.23
749622,749622,Lifestyle Lounge,Episode 8,52.69,Lifestyle,0.91,Thursday,Evening,,2.00,Positive,52.70,8,
749649,749649,News Roundup,Episode 23,45.39,News,0.96,Saturday,Morning,,1.00,Neutral,46.61,23,


In [10]:
# Not a small number of rows. We could cap those listening times at the episode
# length, but to begin with they are left as is. Worth exploring later whether
# capping improves predictability — is a listened share above 100% feasible?

# There is also one row with an episode length of 0. Set it to 0.01 for now so
# that dividing by the episode length stays finite.
training["Episode_Length_minutes"] = training["Episode_Length_minutes"].mask(
    training["Episode_Length_minutes"].eq(0), 0.01
)



In [11]:
# Sanity check: no zero-length episodes should remain (expect an empty frame).
training[training["Episode_Length_minutes"].eq(0)]

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no,Host_Guest_combo_percentage


In [12]:
# Share of the total episode length that was listened to.
# Despite the column name this is a fraction (e.g. 0.73), not a 0-100 value.
# It is NaN wherever Episode_Length_minutes is missing.
training["Listened_percent"] = training["Listening_Time_minutes"].div(
    training["Episode_Length_minutes"]
)
training.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no,Host_Guest_combo_percentage,Listened_percent
0,0,Mystery Matters,Episode 98,,True Crime,0.75,Thursday,Night,,0.0,Positive,31.42,98,,
1,1,Joke Junction,Episode 26,119.8,Comedy,0.67,Saturday,Afternoon,0.76,2.0,Negative,88.01,26,0.51,0.73
2,2,Study Sessions,Episode 16,73.9,Education,0.7,Tuesday,Evening,0.09,0.0,Negative,44.93,16,0.06,0.61
3,3,Digital Digest,Episode 45,67.17,Technology,0.57,Monday,Morning,0.79,2.0,Positive,46.28,45,0.45,0.69
4,4,Mind & Body,Episode 86,110.51,Health,0.8,Monday,Afternoon,0.59,3.0,Neutral,75.61,86,0.47,0.68


In [13]:
# Check which categorical variables have the most impact on listening.
# The listened share is the more indicative measure here.
# Finding: genre doesn't seem to have a big impact.
(
    training
    .groupby(["Genre"], dropna=True)[["Listened_percent", "Listening_Time_minutes"]]
    .agg("mean")
)

Unnamed: 0_level_0,Listened_percent,Listening_Time_minutes
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,0.68,45.54
Comedy,0.68,44.43
Education,0.68,45.74
Health,0.68,45.74
Lifestyle,0.69,45.52
Music,0.68,46.58
News,0.67,44.41
Sports,0.67,44.94
Technology,0.69,45.63
True Crime,0.69,46.04


In [19]:
# Count how many rows we have for each podcast: with only a few episodes, any
# aggregate statistic would be skewed.
# Seeing such a large number of rows per podcast makes me rethink the structure
# of the data — are rows individual listeners rather than individual episodes?
(
    training
    .groupby(["Podcast_Name"], dropna=True)[["Listening_Time_minutes"]]
    .count()
    .sort_values(by="Listening_Time_minutes", ascending=False)
)


Unnamed: 0_level_0,Listening_Time_minutes
Podcast_Name,Unnamed: 1_level_1
Tech Talks,22847
Sports Weekly,20053
Funny Folks,19635
Tech Trends,19549
Fitness First,19488
Business Insights,19480
Style Guide,19364
Game Day,19272
Melody Mix,18889
Criminal Minds,17735


In [26]:
# Inspect a single show, ordered by title, to see what its rows look like.
#
# "The same show" has LOTS of episodes with the same name but varying length,
# publication time etc. I think this is an artifact of synthetic data
# generation, so we SHOULD treat each row as its own episode and not worry
# about how this would realistically occur.
#
# This also means that "duplicated" rows (where everything except the ID is the
# same) should NOT be excluded. They represent two episodes with the same
# features, which should give more weight to parameters that correctly predict
# their listening time.
training[training["Podcast_Name"].eq("News Roundup")].sort_values(by="Episode_Title")

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no,Host_Guest_combo_percentage,Listened_percent
628471,628471,News Roundup,Episode 1,94.30,News,0.96,Tuesday,Night,,1.00,Negative,82.82,1,,0.88
243638,243638,News Roundup,Episode 1,,News,0.41,Tuesday,Evening,0.29,0.00,Positive,77.33,1,0.12,
7125,7125,News Roundup,Episode 1,52.34,News,0.78,Friday,Morning,0.43,0.00,Positive,34.44,1,0.34,0.66
237646,237646,News Roundup,Episode 1,,News,0.37,Wednesday,Night,0.31,2.00,Negative,58.14,1,0.12,
640272,640272,News Roundup,Episode 1,89.16,News,0.71,Tuesday,Night,0.76,2.00,Positive,56.93,1,0.54,0.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321116,321116,News Roundup,Episode 99,92.29,News,0.30,Saturday,Night,0.28,1.00,Positive,83.34,99,0.08,0.90
678303,678303,News Roundup,Episode 99,14.63,News,0.99,Monday,Night,0.80,1.00,Negative,3.70,99,0.79,0.25
271159,271159,News Roundup,Episode 99,28.26,News,0.52,Thursday,Afternoon,0.83,3.00,Negative,13.34,99,0.43,0.47
296283,296283,News Roundup,Episode 99,72.17,News,0.56,Thursday,Evening,0.56,0.00,Neutral,64.93,99,0.31,0.90


In [17]:
# Individual podcasts, ranked by mean listened share.
#
# Catching the "top performers" in a yes/no variable could be useful, to
# account for some third factor not explained by the other variables.
# The share is more useful than raw minutes here, since some podcasts have
# shorter episodes on average.
# TODO: the original cell ended with an unfinished note ("account"); complete
# or remove it.
(
    training
    .groupby(["Podcast_Name"], dropna=True)[["Listened_percent", "Listening_Time_minutes"]]
    .mean()
    .sort_values(by="Listened_percent", ascending=False)
)

Unnamed: 0_level_0,Listened_percent,Listening_Time_minutes
Podcast_Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Fashion Forward,0.73,45.46
Tech Talks,0.71,46.16
True Crime Stories,0.7,46.26
Gadget Geek,0.7,47.03
Crime Chronicles,0.69,47.68
Detective Diaries,0.69,46.76
Money Matters,0.69,47.88
Funny Folks,0.69,45.09
Tune Time,0.69,46.9
Athlete's Arena,0.69,46.63
