# Initial direct model (xgb) GridSearchCV

In [19]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import GridSearchCV 
from sklearn import preprocessing 
import sklearn as sk
from sklearn.svm import SVR
import xgboost as xgb

In [2]:
# Load DataFrames
Podcast_Train_df = pd.read_csv('Data/train.csv')
Podcast_Test_df = pd.read_csv('Data/test.csv')

In [3]:
# to make the preprocessing at the same time
Podcast_Train_df['is_train'] = 1
Podcast_Test_df['is_train'] = 0

In [5]:
Podcast_df = pd.concat([Podcast_Train_df,Podcast_Test_df])

### The features 
* 'Episode_Length_minutes' has NaN values (average this time)
* 'Guest_Popularity_percentage' has NaN values "no guest" (=0 this time)
* 'Number_of_Ads' has one NaN, but also error values. (goes only from 0 to 3 'int'. NaN by 0, missing values by 1)
* xgb accepts categorical features. 

In [6]:
Podcast_df["Number_of_Ads"] = Podcast_df["Number_of_Ads"].fillna(0)

In [7]:
# Define the list of correct values
correct_values = [0.00, 1.00, 2.00, 3.00]

# Create a boolean mask to identify erroneous values
# Values that are NOT in the correct list 
erroneous_mask = ~Podcast_df["Number_of_Ads"].isin(correct_values) 
Podcast_df.loc[erroneous_mask, "Number_of_Ads"] = 1.00


In [8]:
# Change episode number to a number
Podcast_df["Episode_Number"] = Podcast_df["Episode_Title"].str.extract(r"(\d+)").astype(float)
Podcast_df['Guest_Popularity_percentage'] = Podcast_df['Guest_Popularity_percentage'].fillna(0) 

In [9]:
Podcast_df["Episode_Length_minutes"] = Podcast_df["Episode_Length_minutes"].fillna(Podcast_df["Episode_Length_minutes"].median())

In [10]:
Podcast_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 14 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1000000 non-null  int64  
 1   Podcast_Name                 1000000 non-null  object 
 2   Episode_Title                1000000 non-null  object 
 3   Episode_Length_minutes       1000000 non-null  float64
 4   Genre                        1000000 non-null  object 
 5   Host_Popularity_percentage   1000000 non-null  float64
 6   Publication_Day              1000000 non-null  object 
 7   Publication_Time             1000000 non-null  object 
 8   Guest_Popularity_percentage  1000000 non-null  float64
 9   Number_of_Ads                1000000 non-null  float64
 10  Episode_Sentiment            1000000 non-null  object 
 11  Listening_Time_minutes       750000 non-null   float64
 12  is_train                     1000000 non-null  i

In [11]:
# encode categorical
cat_cols =["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
for col in cat_cols:
    Podcast_df[col] = Podcast_df[col].astype('category')

In [12]:
# Drop unused
Podcast_df.drop(columns=["Episode_Title"], inplace=True)

In [13]:
# Split back Podcast_df
Podcast_Train_df = Podcast_df[Podcast_df['is_train']== 1].drop(columns=["is_train"])
Podcast_Test_df = Podcast_df[Podcast_df['is_train']== 0].drop(columns=["is_train", "Listening_Time_minutes"])

In [14]:
# Model introduction.
X = Podcast_Train_df.drop(columns=["Listening_Time_minutes", "id"])
y = Podcast_Train_df["Listening_Time_minutes"]

In [27]:
xgb.XGBRegressor().get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [28]:
params = {'base_score': [0.5],
         'booster': ['gbtree'],
         'colsample_bylevel': [1],
         'colsample_bynode': [0.3,0.5],
         'colsample_bytree': [0.4,0.5],
         'gamma': [0.1, 0.01],
         'learning_rate': [0.05,0.04,0.03],
         'max_depth':[2,3,4,5,6, 7],
         'min_child_weight': [1],
         'n_estimators': [400,500,600, 800],
         'n_jobs': [-1],
         'reg_alpha': [0.1, 0.01],
         'reg_lambda': [0.1, 0.01],
         'scale_pos_weight': [1],
         'subsample': [0.7, 0.9]}

In [34]:
 
scoring = ["neg_mean_absolute_error"]
model = xgb.XGBRegressor()
grid_solver = GridSearchCV(estimator = model,
                   param_grid = params,
                   scoring = scoring,
                   cv = 5,
                   n_jobs=-1,
                   refit="neg_mean_absolute_error",
                   verbose = 2)

In [26]:
# SVR().get_params()

In [35]:
model_result = grid_solver.fit(X,y)

Fitting 5 folds for each of 4608 candidates, totalling 23040 fits
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2, min_child_weight=1, n_estimators=400, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, subsample=0.7; total time=   0.3s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2, min_child_weight=1, n_estimators=400, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.01, scale_pos_weight=1, subsample=0.7; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2, min_child_weight=1, n_estimators=400, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.01, scale_pos_weight=1, subsample=0.9; total time=   0.1s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0



[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2, min_child_weight=1, n_estimators=400, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, subsample=0.7; total time=   0.4s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2, min_child_weight=1, n_estimators=500, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, subsample=0.7; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2, min_child_weight=1, n_estimators=500, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, subsample=0.9; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.3, colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=2

ValueError: 
All the 23040 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
23040 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/sklearn.py", line 1081, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ~~~~~~~~~~~~~~~~~~~~~~~~~^
        missing=self.missing,
        ^^^^^^^^^^^^^^^^^^^^^
    ...<14 lines>...
        feature_types=self.feature_types,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/sklearn.py", line 596, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
        data=X,
    ...<9 lines>...
        ref=None,
    )
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/sklearn.py", line 1003, in _create_dmatrix
    return QuantileDMatrix(
        **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
    )
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 1573, in __init__
    self._init(
    ~~~~~~~~~~^
        data,
        ^^^^^
    ...<11 lines>...
        enable_categorical=enable_categorical,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 1632, in _init
    it.reraise()
    ~~~~~~~~~~^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 569, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 550, in _handle_exception
    return fn()
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 637, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
                                          ~~~~~~~~~^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/data.py", line 1402, in next
    input_data(**self.kwargs)
    ~~~~~~~~~~^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/core.py", line 617, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ~~~~~~~~~~~~~~~~^
        data,
        ^^^^^
    ...<2 lines>...
        self._enable_categorical,
        ^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/data.py", line 1447, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ~~~~~~~~~~~~~~~~~~~~^
        data, enable_categorical, feature_names, feature_types
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/data.py", line 603, in _transform_pandas_df
    pandas_check_dtypes(data, enable_categorical)
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/data.py", line 569, in pandas_check_dtypes
    _invalid_dataframe_dtype(data)
    ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/data.py", line 356, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Podcast_Name: category, Genre: category, Publication_Day: category, Publication_Time: category, Episode_Sentiment: category


[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_depth=7, min_child_weight=1, n_estimators=800, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, subsample=0.7; total time=   0.0s

[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_depth=5, min_child_weight=1, n_estimators=800, n_jobs=-1, reg_alpha=0.01, reg_lambda=0.1, scale_pos_weight=1, subsample=0.7; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_depth=5, min_child_weight=1, n_estimators=800, n_jobs=-1, reg_alpha=0.01, reg_lambda=0.1, scale_pos_weight=1, subsample=0.9; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_

In [25]:

y_hat = model.predict(X)

In [26]:
root_mean_squared_error(y, y_hat)

np.float64(12.080740407209166)

In [28]:
# test submission
X_test = Podcast_Test_df.drop(columns = ['id'])

In [29]:
prediction = model.predict(X_test)

In [31]:
Podcast_Test_df['prediction']=prediction

In [32]:
Podcast_Test_df

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Episode_Number,prediction
0,750000,Educational Nuggets,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,73.0,53.324600
1,750001,Sound Waves,27.87,Music,71.29,Sunday,Morning,0.00,0.0,Neutral,23.0,17.179300
2,750002,Joke Junction,69.10,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,11.0,49.167881
3,750003,Comedy Corner,115.39,Comedy,23.40,Sunday,Morning,51.75,2.0,Positive,73.0,81.085724
4,750004,Life Lessons,72.32,Lifestyle,58.10,Wednesday,Morning,11.30,2.0,Neutral,50.0,47.390911
...,...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,Mind & Body,21.05,Health,65.77,Saturday,Evening,96.40,3.0,Negative,100.0,12.241870
249996,999996,Joke Junction,85.50,Comedy,41.47,Saturday,Night,30.52,2.0,Negative,85.0,58.668625
249997,999997,Joke Junction,12.11,Comedy,25.92,Thursday,Evening,73.69,1.0,Neutral,63.0,7.928202
249998,999998,Market Masters,113.46,Business,43.47,Friday,Night,93.59,3.0,Positive,46.0,76.799416


In [34]:
Submission = Podcast_Test_df[['id','prediction']]

In [35]:
Submission

Unnamed: 0,id,prediction
0,750000,53.324600
1,750001,17.179300
2,750002,49.167881
3,750003,81.085724
4,750004,47.390911
...,...,...
249995,999995,12.241870
249996,999996,58.668625
249997,999997,7.928202
249998,999998,76.799416


In [36]:
Submission.to_csv('Data/Submission.xgb.csv', index=False)