# Initial direct model (xgb) GridSearchCV

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import GridSearchCV 
from sklearn import preprocessing 
import sklearn as sk
from sklearn.svm import SVR
import xgboost as xgb

In [2]:
# Load DataFrames
Podcast_Train_df = pd.read_csv('Data/train.csv')
Podcast_Test_df = pd.read_csv('Data/test.csv')

In [3]:
# to make the preprocessing at the same time
Podcast_Train_df['is_train'] = 1
Podcast_Test_df['is_train'] = 0

In [4]:
Podcast_df = pd.concat([Podcast_Train_df,Podcast_Test_df])

### The features 
* 'Episode_Length_minutes' has NaN values (average this time)
* 'Guest_Popularity_percentage' has NaN values "no guest" (=0 this time)
* 'Number_of_Ads' has one NaN, but also error values. (goes only from 0 to 3 'int'. NaN by 0, missing values by 1)
* xgb accepts categorical features. 

In [5]:
Podcast_df["Number_of_Ads"] = Podcast_df["Number_of_Ads"].fillna(0)

In [6]:
# Define the list of correct values
correct_values = [0.00, 1.00, 2.00, 3.00]

# Create a boolean mask to identify erroneous values
# Values that are NOT in the correct list 
erroneous_mask = ~Podcast_df["Number_of_Ads"].isin(correct_values) 
Podcast_df.loc[erroneous_mask, "Number_of_Ads"] = 1.00


In [7]:
# Change episode number to a number
Podcast_df["Episode_Number"] = Podcast_df["Episode_Title"].str.extract(r"(\d+)").astype(float)
Podcast_df['Guest_Popularity_percentage'] = Podcast_df['Guest_Popularity_percentage'].fillna(0) 

In [8]:
Podcast_df["Episode_Length_minutes"] = Podcast_df["Episode_Length_minutes"].fillna(Podcast_df["Episode_Length_minutes"].median())

In [9]:
Podcast_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 14 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1000000 non-null  int64  
 1   Podcast_Name                 1000000 non-null  object 
 2   Episode_Title                1000000 non-null  object 
 3   Episode_Length_minutes       1000000 non-null  float64
 4   Genre                        1000000 non-null  object 
 5   Host_Popularity_percentage   1000000 non-null  float64
 6   Publication_Day              1000000 non-null  object 
 7   Publication_Time             1000000 non-null  object 
 8   Guest_Popularity_percentage  1000000 non-null  float64
 9   Number_of_Ads                1000000 non-null  float64
 10  Episode_Sentiment            1000000 non-null  object 
 11  Listening_Time_minutes       750000 non-null   float64
 12  is_train                     1000000 non-null  i

In [10]:
# encode categorical
cat_cols =["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
for col in cat_cols:
    Podcast_df[col] = Podcast_df[col].astype('category')

In [11]:
# Drop unused
Podcast_df.drop(columns=["Episode_Title"], inplace=True)

In [35]:
Podcast_df

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,is_train,Episode_Number
0,0,Mystery Matters,63.87,True Crime,74.81,Thursday,Night,0.00,0.0,Positive,31.41998,1,98.0
1,1,Joke Junction,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,1,26.0
2,2,Study Sessions,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,1,16.0
3,3,Digital Digest,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive,46.27824,1,45.0
4,4,Mind & Body,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031,1,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,Mind & Body,21.05,Health,65.77,Saturday,Evening,96.40,3.0,Negative,,0,100.0
249996,999996,Joke Junction,85.50,Comedy,41.47,Saturday,Night,30.52,2.0,Negative,,0,85.0
249997,999997,Joke Junction,12.11,Comedy,25.92,Thursday,Evening,73.69,1.0,Neutral,,0,63.0
249998,999998,Market Masters,113.46,Business,43.47,Friday,Night,93.59,3.0,Positive,,0,46.0


In [12]:
# Split back Podcast_df
Podcast_Train_df = Podcast_df[Podcast_df['is_train']== 1].drop(columns=["is_train"])
Podcast_Test_df = Podcast_df[Podcast_df['is_train']== 0].drop(columns=["is_train", "Listening_Time_minutes"])

In [13]:
# Model introduction.
X = Podcast_Train_df.drop(columns=["Listening_Time_minutes", "id"])
y = Podcast_Train_df["Listening_Time_minutes"]

In [27]:
model = xgb.XGBRegressor()

In [28]:
model.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [32]:
params = {'base_score': [0.5],
          'enable_categorical': True,
         'booster': ['gbtree'],
         'colsample_bylevel': [1],
         'colsample_bynode': [0.3,0.5],
         'colsample_bytree': [0.4,0.5],
         'gamma': [0.1, 0.01],
         'learning_rate': [0.05,0.04,0.03],
         'max_depth':[2,3,4,5,6, 7],
         'min_child_weight': [1],
         'n_estimators': [400,500,600, 800],
         'n_jobs': [-1],
         'reg_alpha': [0.1, 0.01],
         'reg_lambda': [0.1, 0.01],
         'scale_pos_weight': [1],
         'subsample': [0.7, 0.9]}

In [33]:
scoring = ["neg_mean_absolute_error"]
grid_solver = GridSearchCV(estimator = model,
                   param_grid = params,
                   scoring = scoring,
                   cv = 5,
                   n_jobs=-1,
                   refit="neg_mean_absolute_error",
                   verbose = 2)

In [None]:
# SVR().get_params()

In [34]:
model_result = grid_solver.fit(X,y)

TypeError: Parameter grid for parameter 'enable_categorical' needs to be a list or a numpy array, but got True (of type bool) instead. Single values need to be wrapped in a list with one element.

[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_depth=4, min_child_weight=1, n_estimators=500, n_jobs=-1, reg_alpha=0.01, reg_lambda=0.1, scale_pos_weight=1, subsample=0.9; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_depth=4, min_child_weight=1, n_estimators=500, n_jobs=-1, reg_alpha=0.01, reg_lambda=0.1, scale_pos_weight=1, subsample=0.9; total time=   0.1s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_depth=4, min_child_weight=1, n_estimators=800, n_jobs=-1, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, subsample=0.7; total time=   0.0s
[CV] END base_score=0.5, booster=gbtree, colsample_bylevel=1, colsample_bynode=0.5, colsample_bytree=0.5, gamma=0.01, learning_rate=0.03, max_d

In [31]:
model_result.best_estimator_


NameError: name 'model_result' is not defined

In [19]:
grid_solver.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:

y_hat = model.predict(X)

In [None]:
root_mean_squared_error(y, y_hat)

In [None]:
# test submission
X_test = Podcast_Test_df.drop(columns = ['id'])

In [None]:
prediction = model.predict(X_test)

In [None]:
Podcast_Test_df['prediction']=prediction

In [None]:
Podcast_Test_df

In [None]:
Submission = Podcast_Test_df[['id','prediction']]

In [None]:
Submission

In [None]:
Submission.to_csv('Data/Submission.xgb.csv', index=False)