This notebook trains a CatBoostRegressor (CBR) model to predict `Listening_Time_minutes`.

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from catboost import CatBoostRegressor

In [7]:
# Read the training set and use the `id` column as the row index.
raw_frame = pd.read_csv('./inputs/train.csv')
train_df = raw_frame.set_index("id")
# Split off the regression target: `pop` returns the column and removes it
# from the feature frame in one step.
target = train_df.pop("Listening_Time_minutes")
# Episode_Title is not dropped here, so it remains part of the features.
train_df

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive
1,Joke Junction,Episode 26,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative
2,Study Sessions,Episode 16,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral
...,...,...,...,...,...,...,...,...,...,...
749995,Learning Lab,Episode 25,75.66,Education,69.36,Saturday,Morning,,0.0,Negative
749996,Business Briefs,Episode 21,75.75,Business,35.21,Saturday,Night,,2.0,Neutral
749997,Lifestyle Lounge,Episode 51,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative
749998,Style Guide,Episode 47,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative


In [3]:
# Encode the categorical columns
%store -r categories
# categories = categories[:1] + categories[2:]
display(categories)
encoder = LabelEncoder()
for column in categories:
    train_df[column] = encoder.fit_transform(train_df[column])

train_df[categories]

['Podcast_Name',
 'Episode_Title',
 'Genre',
 'Publication_Day',
 'Publication_Time',
 'Episode_Sentiment']

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Genre,Publication_Day,Publication_Time,Episode_Sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,34,98,9,4,3,2
1,24,19,1,2,0,0
2,40,8,2,5,1,0
3,10,40,8,1,2,2
4,31,85,3,1,0,1
...,...,...,...,...,...,...
749995,26,18,2,2,2,0
749996,2,14,0,2,3,1
749997,28,47,4,4,2,0
749998,41,42,4,4,2,0


In [4]:
# Standardise every feature column with scikit-learn's StandardScaler.
# Fitting and transforming are done as two explicit steps on the same frame.
# Missing values are left as NaN in the resulting array (see the output below).
scaler = StandardScaler()
scaler.fit(train_df)
train_df_scaled = scaler.transform(train_df)
train_df_scaled

array([[ 0.74158935,  1.69278629,         nan, ...,         nan,
        -1.1717663 ,  1.22882316],
       [ 0.03425408, -1.10995773,  1.6771601 , ...,  0.83348105,
         0.56565749, -1.22384261],
       [ 1.16599051, -1.50021322,  0.2849676 , ..., -1.52072397,
        -1.1717663 , -1.22384261],
       ...,
       [ 0.31718819, -0.1165801 , -1.01683855, ...,  1.14770311,
        -1.1717663 , -1.22384261],
       [ 1.23672404, -0.29396896,  1.34897877, ...,  1.44224234,
        -1.1717663 , -1.22384261],
       [ 1.02452346,  1.72826406, -1.22551577, ..., -0.54537029,
        -1.1717663 ,  0.00249027]])

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df_scaled,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Baseline: a CatBoostRegressor with library defaults, fitted on the training
# split and scored by RMSE on the held-out split.
model = CatBoostRegressor()
model.fit(X=X_train, y=y_train)
y_pred = model.predict(data=X_test)
score = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(score)

Learning rate set to 0.112494
0:	learn: 24.9669820	total: 342ms	remaining: 5m 41s
1:	learn: 23.1021179	total: 411ms	remaining: 3m 25s
2:	learn: 21.4960557	total: 467ms	remaining: 2m 35s
3:	learn: 20.1208197	total: 522ms	remaining: 2m 10s
4:	learn: 18.9532453	total: 578ms	remaining: 1m 55s
5:	learn: 17.9774567	total: 638ms	remaining: 1m 45s
6:	learn: 17.1489694	total: 694ms	remaining: 1m 38s
7:	learn: 16.4650521	total: 748ms	remaining: 1m 32s
8:	learn: 15.8885954	total: 805ms	remaining: 1m 28s
9:	learn: 15.4098046	total: 865ms	remaining: 1m 25s
10:	learn: 15.0205813	total: 924ms	remaining: 1m 23s
11:	learn: 14.6996759	total: 988ms	remaining: 1m 21s
12:	learn: 14.4305693	total: 1.04s	remaining: 1m 19s
13:	learn: 14.2159974	total: 1.1s	remaining: 1m 17s
14:	learn: 14.0404095	total: 1.16s	remaining: 1m 16s
15:	learn: 13.8935467	total: 1.21s	remaining: 1m 14s
16:	learn: 13.7762655	total: 1.29s	remaining: 1m 14s
17:	learn: 13.6822310	total: 1.36s	remaining: 1m 14s
18:	learn: 13.6041210	total

In [7]:
# Cross-validated search over CatBoost settings with GridSearchCV.
# Only `random_state` is varied; the other hyper-parameters are pinned to the
# values the baseline model reported (1000 iterations, depth 6, and the
# automatically chosen learning rate printed by the baseline fit).
parameter = {
    "random_state": [21, 34, 42, 50, 1],
}
base_model = CatBoostRegressor(n_estimators=1000, learning_rate=0.11249399930238724, max_depth=6)
grid = GridSearchCV(estimator=base_model, param_grid=parameter, scoring='neg_root_mean_squared_error', n_jobs=-1,
                    verbose=3)
grid.fit(X_train, y_train)
predictions = grid.predict(X_test)
# Score the grid-searched model's own predictions. The previous version passed
# `y_pred` (the baseline model's predictions), so the printed RMSE never
# reflected the search result.
score = root_mean_squared_error(y_test, predictions)
print(score)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: 

In [61]:
# Show the parameter setting selected by the grid search.
# NOTE(review): this needs `grid.fit(...)` to have completed. The grid-search
# run recorded in this notebook ended with a KeyboardInterrupt and this cell's
# execution count is out of sequence, so the output shown here presumably comes
# from an earlier run -- confirm by re-running top to bottom.
grid.best_params_

{'random_state': 1}

In [84]:
# Show the best cross-validation score found by the grid search.
# The search uses scoring='neg_root_mean_squared_error', so the value is a
# negated RMSE: a result of -13.18 corresponds to an RMSE of about 13.18.
# NOTE(review): like `best_params_`, this is only available after a completed fit.
grid.best_score_

-13.176652827020103

In [8]:
# Inspect the parameters of the baseline `model`. It was constructed without
# arguments, so the values listed are the ones CatBoost filled in itself.
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'lear