# Data cleaning 

In [1]:
# Modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load DataFrames
Podcast_Train_df = pd.read_csv('Data/train.csv')
Podcast_Test_df = pd.read_csv('Data/test.csv')

In [3]:
# to make the preprocessing at the same time
Podcast_Train_df['is_train'] = 1
Podcast_Test_df['is_train'] = 0

In [4]:
# join the both DataFrames
Podcast_df = pd.concat([Podcast_Train_df,Podcast_Test_df])

In [5]:
# 1x10^6 rows
Podcast_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 13 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1000000 non-null  int64  
 1   Podcast_Name                 1000000 non-null  object 
 2   Episode_Title                1000000 non-null  object 
 3   Episode_Length_minutes       884171 non-null   float64
 4   Genre                        1000000 non-null  object 
 5   Host_Popularity_percentage   1000000 non-null  float64
 6   Publication_Day              1000000 non-null  object 
 7   Publication_Time             1000000 non-null  object 
 8   Guest_Popularity_percentage  805138 non-null   float64
 9   Number_of_Ads                999999 non-null   float64
 10  Episode_Sentiment            1000000 non-null  object 
 11  Listening_Time_minutes       750000 non-null   float64
 12  is_train                     1000000 non-null  i

## Cleaning process 
* Episode_Title: Change it  with the number
* Episode_Length_minutes: maximum time 121 minutes NaN values average?
* Host_Popularity_percentage: maximum 100%
* Guest_Popularity_percentage: maximum 100%
* Number_of_Ads: only allowed [0, 1, 2, 3], other values removed
* 

In [6]:
Podcast_df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,is_train
count,1000000.0,884171.0,1000000.0,805138.0,999999.0,750000.0,1000000.0
mean,499999.5,153.292,59.824048,52.225542,1.350604,45.437406,0.75
std,288675.278932,83469.0,22.874903,28.449679,2.358272,27.138306,0.433013
min,0.0,0.0,1.3,0.0,0.0,0.0,0.0
25%,249999.75,35.74,39.37,28.37,0.0,23.17835,0.75
50%,499999.5,63.87,60.02,53.54,1.0,43.37946,1.0
75%,749999.25,94.08,79.49,76.59,2.0,64.81158,1.0
max,999999.0,78486260.0,119.46,119.91,2063.0,119.97,1.0


In [7]:
# Ads NaN ->0
Podcast_df["Number_of_Ads"] = Podcast_df["Number_of_Ads"].fillna(0)

In [8]:
# Define the list of correct values
correct_values = [0.00, 1.00, 2.00, 3.00]

# Create a boolean mask to identify erroneous values
# Values that are NOT in the correct list 
erroneous_mask = ~Podcast_df["Number_of_Ads"].isin(correct_values) 
Podcast_df.loc[erroneous_mask, "Number_of_Ads"] = 1.00

In [9]:
# Change the Host_Popularity_percentage maximun by 100
threshold_host = 100.00
Podcast_df.loc[Podcast_df['Host_Popularity_percentage']>threshold_host, 'Host_Popularity_percentage']= threshold_host

In [10]:
# Change the Guest_Popularity_percentage maximun by 100
threshold_Guest = 100.00
Podcast_df.loc[Podcast_df['Guest_Popularity_percentage']>threshold_Guest, 'Guest_Popularity_percentage']= threshold_Guest

In [12]:
# Change the Guest_Popularity_percentage maximun by 100
threshold_min = 121.00
Podcast_df.loc[Podcast_df['Episode_Length_minutes']>threshold_min, 'Episode_Length_minutes']= threshold_min

In [13]:
Podcast_df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,is_train
count,1000000.0,884171.0,1000000.0,805138.0,1000000.0,750000.0,1000000.0
mean,499999.5,64.515301,59.823641,52.225207,1.347722,45.437406,0.75
std,288675.278932,32.964416,22.874055,28.449038,1.111388,27.138306,0.433013
min,0.0,0.0,1.3,0.0,0.0,0.0,0.0
25%,249999.75,35.74,39.37,28.37,0.0,23.17835,0.75
50%,499999.5,63.87,60.02,53.54,1.0,43.37946,1.0
75%,749999.25,94.08,79.49,76.59,2.0,64.81158,1.0
max,999999.0,121.0,100.0,100.0,3.0,119.97,1.0
