In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBRegressor
%matplotlib inline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
train_df=pd.read_csv("./playground-series-s5e4/train.csv")
test_df=pd.read_csv("./playground-series-s5e4/test.csv")

In [5]:
train_df.isna().mean()*100

id                              0.000000
Podcast_Name                    0.000000
Episode_Title                   0.000000
Episode_Length_minutes         11.612400
Genre                           0.000000
Host_Popularity_percentage      0.000000
Publication_Day                 0.000000
Publication_Time                0.000000
Guest_Popularity_percentage    19.470667
Number_of_Ads                   0.000133
Episode_Sentiment               0.000000
Listening_Time_minutes          0.000000
dtype: float64

In [6]:
train_df['Episode_Length_minutes']=train_df['Episode_Length_minutes'].fillna(train_df['Episode_Length_minutes'].mean())
train_df['Guest_Popularity_percentage']=train_df['Guest_Popularity_percentage'].fillna(train_df['Guest_Popularity_percentage'].mean())

In [7]:
train_df.dropna(inplace=True)

In [8]:
train_df.isna().mean()*100

id                             0.0
Podcast_Name                   0.0
Episode_Title                  0.0
Episode_Length_minutes         0.0
Genre                          0.0
Host_Popularity_percentage     0.0
Publication_Day                0.0
Publication_Time               0.0
Guest_Popularity_percentage    0.0
Number_of_Ads                  0.0
Episode_Sentiment              0.0
Listening_Time_minutes         0.0
dtype: float64

In [9]:
test_df.isna().mean()*100

id                              0.0000
Podcast_Name                    0.0000
Episode_Title                   0.0000
Episode_Length_minutes         11.4944
Genre                           0.0000
Host_Popularity_percentage      0.0000
Publication_Day                 0.0000
Publication_Time                0.0000
Guest_Popularity_percentage    19.5328
Number_of_Ads                   0.0000
Episode_Sentiment               0.0000
dtype: float64

In [10]:
test_df['Episode_Length_minutes']=test_df['Episode_Length_minutes'].fillna(test_df['Episode_Length_minutes'].mean())
test_df['Guest_Popularity_percentage']=test_df['Guest_Popularity_percentage'].fillna(test_df['Guest_Popularity_percentage'].mean())

In [11]:
test_df.isna().mean()*100

id                             0.0
Podcast_Name                   0.0
Episode_Title                  0.0
Episode_Length_minutes         0.0
Genre                          0.0
Host_Popularity_percentage     0.0
Publication_Day                0.0
Publication_Time               0.0
Guest_Popularity_percentage    0.0
Number_of_Ads                  0.0
Episode_Sentiment              0.0
dtype: float64

### Please refer to my Podcast Listening time prediciton for EDA and other preprocessing steps

In [12]:
test_ids=test_df['id']

In [13]:
object_columns=[]
for i in train_df.select_dtypes(include=['object']):
    object_columns.append(i)

object_columns

['Podcast_Name',
 'Episode_Title',
 'Genre',
 'Publication_Day',
 'Publication_Time',
 'Episode_Sentiment']

In [14]:
le=LabelEncoder()
le_Podcast_Name=LabelEncoder()
le_Episode_Title=LabelEncoder()
le_Genre=LabelEncoder()
le_Publication_Day=LabelEncoder()
le_Publication_Time=LabelEncoder()
le_Episode_Sentiment=LabelEncoder()

In [15]:
#Training Data

train_df['Podcast_Name']=le_Podcast_Name.fit_transform(train_df['Podcast_Name'])
train_df['Episode_Title']=le_Episode_Title.fit_transform(train_df['Episode_Title'])
train_df['Genre']=le_Genre.fit_transform(train_df['Genre'])
train_df['Publication_Day']=le_Publication_Day.fit_transform(train_df['Publication_Day'])
train_df['Publication_Time']=le_Publication_Time.fit_transform(train_df['Publication_Time'])
train_df['Episode_Sentiment']=le_Episode_Sentiment.fit_transform(train_df['Episode_Sentiment'])

#Testing data

test_df['Podcast_Name']=le_Podcast_Name.fit_transform(test_df['Podcast_Name'])
test_df['Episode_Title']=le_Episode_Title.fit_transform(test_df['Episode_Title'])
test_df['Genre']=le_Genre.fit_transform(test_df['Genre'])
test_df['Publication_Day']=le_Publication_Day.fit_transform(test_df['Publication_Day'])
test_df['Publication_Time']=le_Publication_Time.fit_transform(test_df['Publication_Time'])
test_df['Episode_Sentiment']=le_Episode_Sentiment.fit_transform(test_df['Episode_Sentiment'])

In [16]:
train_df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,34,98,64.504738,9,74.81,4,3,52.236449,0.0,2,31.41998
1,1,24,19,119.8,1,66.95,2,0,75.95,2.0,0,88.01241
2,2,40,8,73.9,2,69.97,5,1,8.97,0.0,0,44.92531
3,3,10,40,67.17,8,57.22,1,2,78.7,2.0,2,46.27824
4,4,31,85,110.51,3,80.07,1,0,58.68,3.0,1,75.61031


In [17]:
test_df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,750000,11,71,78.96,2,38.11,2,1,53.33,1.0,1
1,750001,36,16,27.87,5,71.29,3,2,52.192796,0.0,1
2,750002,24,3,69.1,1,67.89,0,1,97.51,0.0,2
3,750003,4,71,115.39,1,23.4,3,2,51.75,2.0,2
4,750004,27,46,72.32,4,58.1,6,2,11.3,2.0,1


In [18]:
X=train_df.drop(columns=['Listening_Time_minutes'])
y=train_df['Listening_Time_minutes']

In [20]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
scaler=StandardScaler()

X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)
test_df_scaled=scaler.transform(test_df)

In [28]:
linear_model=LinearRegression()

In [29]:
linear_model.fit(X_train, y_train)

linear_model_pred=linear_model.predict(X_test)

In [31]:
print("Root mean sqaured error:", np.sqrt(mean_squared_error(y_test, linear_model_pred)))

Root mean sqaured error: 46.35834886918247


In [32]:
linear_model.fit(X_train_scaled, y_train)

linear_model_pred1=linear_model.predict(X_test_scaled)

In [33]:
print("Root mean sqaured error:", np.sqrt(mean_squared_error(y_test, linear_model_pred1)))

Root mean sqaured error: 13.32971666544375


### Result of Linear Regression

In [34]:
dt_model=DecisionTreeRegressor()

dt_model.fit(X_train_scaled, y_train)

In [35]:
dt_model_pred=dt_model.predict(X_test_scaled)

In [36]:
print("Root mean sqaured error:", np.sqrt(mean_squared_error(y_test, dt_model_pred)))

Root mean sqaured error: 18.28370861408516


In [41]:
params = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_leaf_nodes': [None, 20, 50, 100]
}

randomized_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=params,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2
)

In [42]:
randomized_search.fit(X_train_scaled, y_train)
best_dt = randomized_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [43]:
dt_model_pred1=best_dt.predict(X_test_scaled)

In [45]:
print("Root mean sqaured error:", np.sqrt(mean_squared_error(y_test, dt_model_pred1)))

Root mean sqaured error: 13.17792003882689


### Result of DT Model after hyperparameter tuning

In [46]:
rf_model=RandomForestRegressor()

rf_model.fit(X_train, y_train)

In [47]:
rf_model_pred=rf_model.predict(X_test)

In [48]:
print("Root mean squared error:", np.sqrt(mean_squared_error(y_test, rf_model_pred)))

Root mean squared error: 12.79758379861451


In [49]:
rf_model.fit(X_train_scaled, y_train)

In [50]:
rf_model_pred1=rf_model.predict(X_test_scaled)

In [51]:
print("Root mean squared error:", np.sqrt(mean_squared_error(y_test, rf_model_pred1)))

Root mean squared error: 12.789055391071807


In [53]:
prediction11=rf_model.predict(test_df_scaled)

In [54]:
submission11=pd.DataFrame({'id': test_ids, 'Listening_Time_minutes': prediction11})
submission11=submission11.to_csv('submission11.csv', index=False)
print("Submission11 file created")

Submission11 file created


In [62]:
xgb_model=XGBRegressor()

params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

grid=RandomizedSearchCV(estimator=xgb_model, param_distributions=params, 
                    scoring='neg_root_mean_squared_error', 
                    cv=3, verbose=1)

In [63]:
grid.fit(X_train_scaled, y_train)

best_xgb = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 1, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.3, 'colsample_bytree': 1}


In [64]:
xgb_pred=best_xgb.predict(X_test_scaled)

In [65]:
print("Root mean squared error:", np.sqrt(mean_squared_error(y_test, xgb_pred)))

Root mean squared error: 12.97816750565349


In [66]:
prediction12=best_xgb.predict(test_df_scaled)

In [67]:
submission12=pd.DataFrame({'id': test_ids, 'Listening_Time_minutes': prediction12})
submission12=submission12.to_csv('submission12.csv', index=False)
print("Submission12 file created")

Submission12 file created
