In [1]:
import pandas as pd

In [2]:
# Load the training data, index it by `id`, and drop the prediction target
# (`Listening_Time_minutes`) so that only the feature columns remain.
train_df = (
    pd.read_csv("./inputs/train.csv")
    .set_index("id")
    .drop(columns="Listening_Time_minutes")
)
train_df

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive
1,Joke Junction,Episode 26,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative
2,Study Sessions,Episode 16,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral
...,...,...,...,...,...,...,...,...,...,...
749995,Learning Lab,Episode 25,75.66,Education,69.36,Saturday,Morning,,0.0,Negative
749996,Business Briefs,Episode 21,75.75,Business,35.21,Saturday,Night,,2.0,Neutral
749997,Lifestyle Lounge,Episode 51,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative
749998,Style Guide,Episode 47,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative


In [3]:
# Distinct-value count per column (a missing value counts as one distinct value)
for column, n_unique in train_df.nunique(dropna=False).items():
    print(f"The column {column} has {n_unique} unique values.")

The column Podcast_Name has 48 unique values.
The column Episode_Title has 100 unique values.
The column Episode_Length_minutes has 12269 unique values.
The column Genre has 10 unique values.
The column Host_Popularity_percentage has 8038 unique values.
The column Publication_Day has 7 unique values.
The column Publication_Time has 4 unique values.
The column Guest_Popularity_percentage has 10020 unique values.
The column Number_of_Ads has 13 unique values.
The column Episode_Sentiment has 3 unique values.


From this, we can see that no column is made up of mostly distinct values (the way a column of price values would be).
However, unique-value counts are not very meaningful for the numerical columns, namely:
- Episode_Length_minutes
- Host_Popularity_percentage
- Guest_Popularity_percentage
- Number_of_Ads
- Listening_Time_minutes (Will not be included since it is the target for prediction)

The rest of the columns can be assumed to be categorical columns.

In [4]:
non_categories = ["Episode_Length_minutes",
                  "Host_Popularity_percentage",
                  "Guest_Popularity_percentage",
                  "Number_of_Ads"]
categories = [column for column in train_df if column not in non_categories]
display(non_categories)
display(categories)
%store non_categories
%store categories

['Episode_Length_minutes',
 'Host_Popularity_percentage',
 'Guest_Popularity_percentage',
 'Number_of_Ads']

['Podcast_Name',
 'Episode_Title',
 'Genre',
 'Publication_Day',
 'Publication_Time',
 'Episode_Sentiment']

Stored 'non_categories' (list)
Stored 'categories' (list)


In [5]:
# Missing-value count for every column
for column, n_missing in train_df.isna().sum().items():
    print(f"Column {column} has {n_missing} NaN values.")

Column Podcast_Name has 0 NaN values.
Column Episode_Title has 0 NaN values.
Column Episode_Length_minutes has 87093 NaN values.
Column Genre has 0 NaN values.
Column Host_Popularity_percentage has 0 NaN values.
Column Publication_Day has 0 NaN values.
Column Publication_Time has 0 NaN values.
Column Guest_Popularity_percentage has 146030 NaN values.
Column Number_of_Ads has 1 NaN values.
Column Episode_Sentiment has 0 NaN values.


In [6]:
# Names of the numerical columns that contain at least one NaN
has_nan = train_df[non_categories].isna().any()
for column in has_nan[has_nan].index:
    print(column)

Episode_Length_minutes
Guest_Popularity_percentage
Number_of_Ads


In [7]:
# Names of the categorical columns that contain at least one NaN
for column in categories:
    if train_df[column].isna().any():
        print(column)

From here on, the cells below are just me trying to get an understanding of the data.

In [8]:
# Re-display the feature frame for reference
train_df

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive
1,Joke Junction,Episode 26,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative
2,Study Sessions,Episode 16,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral
...,...,...,...,...,...,...,...,...,...,...
749995,Learning Lab,Episode 25,75.66,Education,69.36,Saturday,Morning,,0.0,Negative
749996,Business Briefs,Episode 21,75.75,Business,35.21,Saturday,Night,,2.0,Neutral
749997,Lifestyle Lounge,Episode 51,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative
749998,Style Guide,Episode 47,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative


In [9]:
# NOTE(review): `arr` is not used in this cell; it is kept in case a later cell
# relies on it — confirm and remove if not.
arr = []

# Keep only the rows that have no missing value in any column.
# `dropna()` returns a new DataFrame, so `train_df` itself is left untouched
# and no defensive copy of it is needed first.
train_df_no_na = train_df.dropna()

# Mean guest popularity for each publication day (complete rows only)
train_df_no_na.groupby(["Publication_Day"])["Guest_Popularity_percentage"].mean().reset_index()

Unnamed: 0,Publication_Day,Guest_Popularity_percentage
0,Friday,52.669482
1,Monday,51.743177
2,Saturday,52.353007
3,Sunday,52.185807
4,Thursday,52.015044
5,Tuesday,52.246766
6,Wednesday,52.44516


In [10]:
# Mean host popularity for each publication day (complete rows only)
train_df_no_na.groupby("Publication_Day", as_index=False)["Host_Popularity_percentage"].mean()

Unnamed: 0,Publication_Day,Host_Popularity_percentage
0,Friday,60.006828
1,Monday,59.58621
2,Saturday,60.147383
3,Sunday,59.396682
4,Thursday,59.389569
5,Tuesday,59.777252
6,Wednesday,59.841053


In [11]:
# Mean guest popularity for each publication time slot (complete rows only)
train_df_no_na.groupby("Publication_Time", as_index=False)["Guest_Popularity_percentage"].mean()

Unnamed: 0,Publication_Time,Guest_Popularity_percentage
0,Afternoon,52.58228
1,Evening,52.188156
2,Morning,52.195452
3,Night,52.000025


In [12]:
# Mean host popularity for each publication time slot (complete rows only)
train_df_no_na.groupby("Publication_Time", as_index=False)["Host_Popularity_percentage"].mean()

Unnamed: 0,Publication_Time,Host_Popularity_percentage
0,Afternoon,59.644296
1,Evening,59.961219
2,Morning,59.678692
3,Night,59.624257


In [13]:
# Mean host popularity for each (publication day, time slot) pair, computed on the full frame
day_time_keys = ["Publication_Day", "Publication_Time"]
train_df.groupby(day_time_keys, as_index=False)["Host_Popularity_percentage"].mean()

Unnamed: 0,Publication_Day,Publication_Time,Host_Popularity_percentage
0,Friday,Afternoon,60.055941
1,Friday,Evening,60.270606
2,Friday,Morning,60.039549
3,Friday,Night,60.156807
4,Monday,Afternoon,59.714208
5,Monday,Evening,59.909541
6,Monday,Morning,59.652435
7,Monday,Night,59.707558
8,Saturday,Afternoon,60.152333
9,Saturday,Evening,60.429721
