In [3]:
# import libraries  and packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# About the features of the dataset

Podcast_Name (Type: string)
Description: Names of popular podcasts.
Example Values: "Tech Talk", "Health Hour", "Comedy Central"

Episode_Title (Type: string)
Description: Titles of the podcast episodes.
Example Values: "The Future of AI", "Meditation Tips", "Stand-Up Special"

Episode_Length_minutes (Type: float, minutes)
Description: Length of the episode in minutes.
Example Values: 5.0, 10.0, 30.0, 45.0, 60.0, 90.0

Genre (Type: string)
Description: Genre of the podcast episode.
Possible Values: "Technology", "Education", "Comedy", "Health", "True Crime", "Business", "Sports", "Lifestyle", "News", "Music"

Host_Popularity_percentage (Type: float, scale 0-100)
Description: A score indicating the popularity of the host.
Example Values: 50.0, 75.0, 90.0

Publication_Day (Type: string)
Description: Day of the week the episode was published.
Possible Values: "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"

Publication_Time (Type: string)
Description: Time of the day the episode was published.
Possible Values: "Morning", "Afternoon", "Evening", "Night"

Guest_Popularity_percentage (Type: float, scale 0-100)
Description: A score indicating the popularity of the guest (if any).
Example Values: 20.0, 50.0, 85.0

Number_of_Ads (Type: int; loaded by pandas as float64)
Description: Number of advertisements within the episode.
Example Values: 0, 1, 2, 3

Episode_Sentiment (Type: string)
Description: Sentiment of the episode's content.
Possible Values: "Positive", "Neutral", "Negative"

Listening_Time_minutes (Type: float, minutes)
Description: The actual average listening duration (target variable).
Example Values: 4.5, 8.0, 30.0, 60.0





In [4]:
# Load train.csv from the playground-series-s5e4 data directory into a DataFrame
df = pd.read_csv("../Data/playground-series-s5e4/train.csv")


In [5]:
# view the first 5 rows of the train dataset
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97


In [7]:
# (number of rows, number of columns) of the loaded frame
df.shape

(750000, 12)

In [8]:
# Inspect the dtype pandas inferred for each column
df.dtypes

id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes         float64
Genre                           object
Host_Popularity_percentage     float64
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage    float64
Number_of_Ads                  float64
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object

In [9]:
# Count the missing (NaN) values in each column
df.isnull().sum()

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [None]:
# Count fully duplicated rows (every column, including id, must match an earlier row)
df.duplicated().sum()

In [10]:
# Impute missing values in "Episode_Length_minutes", "Guest_Popularity_percentage"
# and "Number_of_Ads" with the median of the same podcast (grouped by "Podcast_Name").
#
# The per-podcast median is computed with the built-in "median" aggregation instead
# of a Python lambda per group, and the three identical statements are folded into
# one loop. If a podcast has no observed value at all for a column, its median is
# NaN and would leave the gap unfilled, so the column-wide median is applied as a
# fallback.
for col in ["Episode_Length_minutes", "Guest_Popularity_percentage", "Number_of_Ads"]:
    # Series aligned with df's index holding each row's podcast-level median
    podcast_median = df.groupby("Podcast_Name")[col].transform("median")
    # The column-wide median is taken from the not-yet-overwritten column
    df[col] = df[col].fillna(podcast_median).fillna(df[col].median())


In [11]:

# Re-count missing values per column to confirm the imputation above left no NaN
df.isnull().sum()

id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64

## Univariate Analysis

In [13]:
# Tally episodes per genre (most frequent first) and render the tally as a table
df_Genre = df.value_counts(subset="Genre")
df_Genre.to_frame()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
Sports,87606
Technology,86256
True Crime,85059
Lifestyle,82461
Comedy,81453
Business,80521
Health,71416
News,63385
Music,62743
Education,49100
