In [222]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the competition data: `df` holds the labelled training rows and
# `test_1` the unlabelled rows that will be scored for the submission.
df = pd.read_csv('train.csv')
test_1 = pd.read_csv('test.csv')

# Keep the preview as the cell's final expression. A bare expression in the
# middle of a cell is evaluated and discarded, so the preview never rendered
# when it sat between the two reads.
df.head()

In [223]:
# Print column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [224]:
# Percentage of missing values per column, largest first.
missing_counts = df.isnull().sum()
missing_pct = missing_counts / len(df) * 100
missing_pct.sort_values(ascending=False)

Guest_Popularity_percentage    19.470667
Episode_Length_minutes         11.612400
Number_of_Ads                   0.000133
id                              0.000000
Episode_Title                   0.000000
Podcast_Name                    0.000000
Genre                           0.000000
Host_Popularity_percentage      0.000000
Publication_Time                0.000000
Publication_Day                 0.000000
Episode_Sentiment               0.000000
Listening_Time_minutes          0.000000
dtype: float64

In [225]:
# Combine podcast name and episode title into a single string key.
df['Podcast_Episode'] = df['Podcast_Name'] + ' - ' + df['Episode_Title']

# Drop the row identifier. errors='ignore' keeps this cell re-runnable:
# without it, a second execution raises KeyError because 'id' is already gone.
df = df.drop(columns='id', errors='ignore')

# Modelling frame: the sparsely populated guest-popularity column and the two
# text columns (now folded into 'Podcast_Episode') are removed.
# NOTE(review): despite its name, `df_test` is derived from the *training*
# data; the name is kept because later cells read it.
df_test = df.drop(
    columns=['Guest_Popularity_percentage', 'Podcast_Name', 'Episode_Title']
)

Train/validation split

In [211]:
# Separate the regression target from the feature columns.
target_col = 'Listening_Time_minutes'
X = df_test.drop(columns=[target_col])
y = df_test[target_col]

In [212]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [213]:
# String-valued features that are converted to pandas `category` dtype.
categorical_cols = [
    'Podcast_Episode', 'Publication_Time', 'Genre',
    'Publication_Day', 'Episode_Sentiment'
]

# Cast all listed columns in one pass per split; every other column keeps its
# existing dtype.
category_dtypes = {col: 'category' for col in categorical_cols}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)


In [214]:
# Median-impute the numeric columns that contain missing values.
# The fill value is computed on the training split only and reused for the
# hold-out split. Imputing X_test with its own median would make the
# preprocessing depend on the data being evaluated and differ from what the
# model saw during fitting.
for col in ('Number_of_Ads', 'Episode_Length_minutes'):
    train_median = X_train[col].median()
    X_train[col] = X_train[col].fillna(train_median)
    X_test[col] = X_test[col].fillna(train_median)

In [226]:
# Confirm dtypes and that no nulls remain in the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 600000 entries, 453635 to 121958
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   Episode_Length_minutes      600000 non-null  float64 
 1   Genre                       600000 non-null  category
 2   Host_Popularity_percentage  600000 non-null  float64 
 3   Publication_Day             600000 non-null  category
 4   Publication_Time            600000 non-null  category
 5   Number_of_Ads               600000 non-null  float64 
 6   Episode_Sentiment           600000 non-null  category
 7   Podcast_Episode             600000 non-null  category
dtypes: category(5), float64(3)
memory usage: 21.9 MB


In [216]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False makes scikit-learn negate the metric, so the search
# maximises -RMSE (i.e. minimises RMSE).
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

# Base estimator; the seed keeps fits reproducible.
model = LGBMRegressor(random_state=42)

# 3 values for each of 4 hyperparameters -> 81 candidate combinations.
param_grid = dict(
    n_estimators=[25, 50, 100],
    max_depth=[5, 6, 7],
    learning_rate=[0.01, 0.1, 0.5],
    num_leaves=[50, 60, 70],
)

# Exhaustive search with 3-fold cross-validation, using all available cores.
grid = GridSearchCV(
    model,
    param_grid,
    scoring=rmse_scorer,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5222
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 8
[LightGBM] [Info] Start training from score 45.447808


In [227]:
# The scorer is negated inside the search, so flip the sign to get RMSE back.
best_cv_rmse = -grid.best_score_
best_cv_params = grid.best_params_

print("rmse_scorer:", best_cv_rmse)
print("Best Parameters:", best_cv_params)

rmse_scorer: 13.174135724053448
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'num_leaves': 70}


In [228]:
# Rebuild a regressor from the winning hyperparameters and fit it on the whole
# training split.
best_params = dict(grid.best_params_)
best_model = LGBMRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5222
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 8
[LightGBM] [Info] Start training from score 45.447808


In [229]:
# Score the fitted model on the hold-out split and display its RMSE.
y_pred = best_model.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(holdout_mse)
rmse


np.float64(13.089633832019228)

In [233]:
# Read the unlabelled test set from disk and keep its ids for the submission
# file, since the 'id' column is dropped from the features later.
test_1 = pd.read_csv('test.csv')
test_ids = test_1.loc[:, 'id']

test_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           250000 non-null  int64  
 1   Podcast_Name                 250000 non-null  object 
 2   Episode_Title                250000 non-null  object 
 3   Episode_Length_minutes       221264 non-null  float64
 4   Genre                        250000 non-null  object 
 5   Host_Popularity_percentage   250000 non-null  float64
 6   Publication_Day              250000 non-null  object 
 7   Publication_Time             250000 non-null  object 
 8   Guest_Popularity_percentage  201168 non-null  float64
 9   Number_of_Ads                250000 non-null  float64
 10  Episode_Sentiment            250000 non-null  object 
dtypes: float64(4), int64(1), object(6)
memory usage: 21.0+ MB


In [234]:
# Apply the same feature construction used for the training frame: add the
# combined podcast/episode key, then drop the identifier, the guest-popularity
# column and the two source text columns.
podcast_episode = test_1['Podcast_Name'] + ' - ' + test_1['Episode_Title']
test_1 = (
    test_1
    .assign(Podcast_Episode=podcast_episode)
    .drop(columns=['id', 'Guest_Popularity_percentage', 'Podcast_Name', 'Episode_Title'])
)

test_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Episode_Length_minutes      221264 non-null  float64
 1   Genre                       250000 non-null  object 
 2   Host_Popularity_percentage  250000 non-null  float64
 3   Publication_Day             250000 non-null  object 
 4   Publication_Time            250000 non-null  object 
 5   Number_of_Ads               250000 non-null  float64
 6   Episode_Sentiment           250000 non-null  object 
 7   Podcast_Episode             250000 non-null  object 
dtypes: float64(3), object(5)
memory usage: 15.3+ MB


In [235]:
# String-valued features that are converted to pandas `category` dtype,
# matching the columns cast on the training data.
categorical_cols = [
    'Podcast_Episode', 'Publication_Time', 'Genre',
    'Publication_Day', 'Episode_Sentiment'
]

test_1 = test_1.astype({col: 'category' for col in categorical_cols})

# Fill missing numeric values with the *training* medians so the test set is
# preprocessed exactly like the data the model was fitted on. Using the test
# set's own medians would apply a different transformation at inference time.
# X_train has already been median-filled, which leaves its median unchanged.
for col in ('Number_of_Ads', 'Episode_Length_minutes'):
    test_1[col] = test_1[col].fillna(X_train[col].median())

In [236]:
# Predict listening time for the competition test rows.
# NOTE(review): this rebinds `y_pred`, which earlier held the hold-out
# predictions; the submission cell reads this new value.
y_pred = best_model.predict(test_1)

In [237]:
# Pair each saved test id with its predicted listening time and write the
# result to disk without the pandas index column.
submission_columns = {'id': test_ids, 'Listening_Time_minutes': y_pred}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False)