# Exploratory analysis
Getting to grips with the data and identifying potentially valuable features.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read both competition splits from the local data/ folder (training first,
# then test) and preview the first training rows.
training, test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)
training.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [3]:
# First impressions worth following up:
# - Host and guest popularity could interact, so transform each into the 0-1
#   range and add their product as an extra factor.
# - The criterion variable (listening time) surely depends on the total episode
#   length. A potentially useful approach is to predict the percentage listened
#   rather than the listening time itself, and then transform that prediction
#   back using each new episode's length to make the final predictions.

# Render floats with two decimals in every DataFrame displayed from here on.
pd.set_option("display.float_format", "{:.2f}".format)

In [4]:
# Summary statistics for every training column, numeric and categorical alike.
# In the recorded output the non-null counts of Episode_Length_minutes and
# Guest_Popularity_percentage fall short of the 750,000 rows, and Number_of_Ads
# is short by one, so those columns contain missing values.
training.describe(include="all")

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
count,750000.0,750000,750000,662907.0,750000,750000.0,750000,750000,603970.0,749999.0,750000,750000.0
unique,,48,100,,10,,7,4,,,3,
top,,Tech Talks,Episode 71,,Sports,,Sunday,Night,,,Neutral,
freq,,22847,10515,,87606,,115946,196849,,,251291,
mean,374999.5,,,64.5,,59.86,,,52.24,1.35,,45.44
std,216506.5,,,32.97,,22.87,,,28.45,1.15,,27.14
min,0.0,,,0.0,,1.3,,,0.0,0.0,,0.0
25%,187499.75,,,35.73,,39.41,,,28.38,0.0,,23.18
50%,374999.5,,,63.84,,60.05,,,53.58,1.0,,43.38
75%,562499.25,,,94.07,,79.53,,,76.6,2.0,,64.81


In [5]:
# Same summary for the test split, to compare its distributions with training.
# NOTE(review): the recorded output shows Episode_Length_minutes with mean 419.3
# and std 166854.5 although its 75th percentile is 94.15, which points to at
# least one extreme value in the test split - worth checking before modelling.
test.describe(include="all")

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
count,250000.0,250000,250000,221264.0,250000,250000.0,250000,250000,201168.0,250000.0,250000
unique,,48,100,,10,,7,4,,,3
top,,Tech Talks,Episode 71,,Sports,,Sunday,Night,,,Neutral
freq,,7553,3492,,28903,,38906,65440,,,83671
mean,874999.5,,,419.3,,59.72,,,52.19,1.36,
std,72168.93,,,166854.5,,22.88,,,28.45,4.27,
min,750000.0,,,2.47,,2.49,,,0.0,0.0,
25%,812499.75,,,35.78,,39.25,,,28.32,0.0,
50%,874999.5,,,63.97,,59.9,,,53.36,1.0,
75%,937499.25,,,94.15,,79.39,,,76.56,2.0,


In [6]:
# Completeness looks OK-ish across the board: no variable has lots of NAs.

# Next, the titles: count how many follow the "Episode X" format, to judge
# whether the title could be turned into a useful feature.
# Titles are lower-cased first, so the check is case-insensitive.
starts_episode = [
    re.search("^episode", title.lower()) is not None
    for title in training["Episode_Title"]
]

pd.Series(starts_episode).value_counts()
# ALL of them follow the format - super useful!

True    750000
Name: count, dtype: int64

In [7]:
# Turn the numeric part of "Episode X" into an integer feature.


def _first_number(title):
    """Return the first run of digits found in ``title`` as an int.

    Raises IndexError when the title contains no digits at all.
    """
    return int(re.findall(r"\d+", title)[0])


episodes_no = list(map(_first_number, training["Episode_Title"]))

training["episode_no"] = episodes_no
training.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.42,98
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01,26
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.93,16
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.28,45
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61,86


In [8]:
# Host x Guest popularity factor
#
# Both popularity columns are rescaled from percentages to fractions (divided
# by 100) and their product is added as an interaction feature.
#
# The rescale overwrites the columns in place, so it must happen only once per
# loaded frame: without the guard, every re-run of this cell would divide by 100
# again and silently shrink both columns and the interaction feature.
# In the code shown here the interaction column is created only by this cell,
# so its absence marks a frame that has not been rescaled yet. Reloading
# `training` from disk drops the column and therefore re-enables the rescale.
if "Host_Guest_combo_percentage" not in training.columns:
    training["Host_Popularity_percentage"] = training["Host_Popularity_percentage"] / 100
    training["Guest_Popularity_percentage"] = training["Guest_Popularity_percentage"] / 100

# A missing value in either input propagates to the product as NaN.
training["Host_Guest_combo_percentage"] = (
    training["Host_Popularity_percentage"] * training["Guest_Popularity_percentage"]
)

# Even a different metric, like harmonic mean, that leans towards the lower rating could be fun

training.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,episode_no,Host_Guest_combo_percentage
0,0,Mystery Matters,Episode 98,,True Crime,0.75,Thursday,Night,,0.0,Positive,31.42,98,
1,1,Joke Junction,Episode 26,119.8,Comedy,0.67,Saturday,Afternoon,0.76,2.0,Negative,88.01,26,0.51
2,2,Study Sessions,Episode 16,73.9,Education,0.7,Tuesday,Evening,0.09,0.0,Negative,44.93,16,0.06
3,3,Digital Digest,Episode 45,67.17,Technology,0.57,Monday,Morning,0.79,2.0,Positive,46.28,45,0.45
4,4,Mind & Body,Episode 86,110.51,Health,0.8,Monday,Afternoon,0.59,3.0,Neutral,75.61,86,0.47


In [9]:
# Got to split our training to have a validation sample to estimate our model's success