In [23]:
import pandas as pd
import numpy as np

import plotly.express as px

In [24]:
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [25]:
df_name = 'imdb_encoded_with_topics'
imdb_df = pd.read_csv(f'data/{df_name}.csv')

## Data Preparation

In [26]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director', 'action', 'adult',
       'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'film-noir', 'game-show', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
       'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
       'unknown', 'war', 'western', 'duration', 'imdb_rating', 'votes',
       'release_start', 'release_month', 'tv_series', 'title', 'synopsis',
       'actors', 'synopsis_lemmatized', 'topic'],
      dtype='object')

In [27]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161602 entries, 0 to 161601
Data columns (total 45 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   actor1               161602 non-null  int64  
 1   actor2               161602 non-null  int64  
 2   actor3               161602 non-null  int64  
 3   actor4               161602 non-null  int64  
 4   director             161602 non-null  object 
 5   action               161602 non-null  int64  
 6   adult                161602 non-null  int64  
 7   adventure            161602 non-null  int64  
 8   animation            161602 non-null  int64  
 9   biography            161602 non-null  int64  
 10  comedy               161602 non-null  int64  
 11  crime                161602 non-null  int64  
 12  documentary          161602 non-null  int64  
 13  drama                161602 non-null  int64  
 14  family               161602 non-null  int64  
 15  fantasy          

In [28]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director', 'action', 'adult',
       'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'film-noir', 'game-show', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
       'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
       'unknown', 'war', 'western', 'duration', 'imdb_rating', 'votes',
       'release_start', 'release_month', 'tv_series', 'title', 'synopsis',
       'actors', 'synopsis_lemmatized', 'topic'],
      dtype='object')

In [30]:
if df_name == 'imdb_encoded':
    cols_to_drop = ['title', 'synopsis', 'actors', 'director']
elif df_name == 'imdb_encoded_with_topics':
    cols_to_drop = ['title', 'synopsis', 'actors', 'director', 'synopsis_lemmatized']

model_df = imdb_df.drop(cols_to_drop, axis=1).dropna()

In [31]:
model_df.shape

(161596, 40)

In [32]:
model_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'action', 'adult', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'game-show', 'history', 'horror',
       'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance',
       'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war',
       'western', 'duration', 'imdb_rating', 'votes', 'release_start',
       'release_month', 'tv_series', 'topic'],
      dtype='object')

In [33]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [34]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')

In [35]:
dep_var = f'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat'])

In [36]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [37]:
X.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'action', 'adult', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'game-show', 'history', 'horror',
       'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance',
       'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war',
       'western', 'duration', 'votes', 'release_start', 'release_month',
       'tv_series', 'topic'],
      dtype='object')

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
X_train.shape, X_test.shape

((129276, 39), (32320, 39))

## Model training

In [40]:
#XGB model with default params
xgb_model = XGBClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print(xgb_model.score(X_test, y_test))
print(roc_auc_score(y_test, xgb_model.predict_proba(X_test), multi_class = 'ovr'))

0.4198019801980198
0.7985763247949552


In [41]:
# A parameter grid for XGBoost
param_grid = {
        'n_estimators':[100, 200, 300, 500],
        'max_depth': [3, 4, 5]
        }

In [42]:
# Grid search with CV
xgb = XGBClassifier(objective = 'multi:softproba', random_state=42)
search = GridSearchCV(xgb, param_grid, cv=5, scoring='roc_auc_ovr', verbose=3, n_jobs=-1)
search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                  

In [43]:
search.score(X_test, y_test)

0.7990722178302374

In [44]:
search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=42, reg_alpha=0, ...)

In [45]:
search.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__objective': 'multi:softproba',
 'estimator__use_label_encoder': False,
 'estimator__base_score': None,
 'estimator__booster': None,
 'estimator__callbacks': None,
 'estimator__colsample_bylevel': None,
 'estimator__colsample_bynode': None,
 'estimator__colsample_bytree': None,
 'estimator__early_stopping_rounds': None,
 'estimator__enable_categorical': False,
 'estimator__eval_metric': None,
 'estimator__gamma': None,
 'estimator__gpu_id': None,
 'estimator__grow_policy': None,
 'estimator__importance_type': None,
 'estimator__interaction_constraints': None,
 'estimator__learning_rate': None,
 'estimator__max_bin': None,
 'estimator__max_cat_to_onehot': None,
 'estimator__max_delta_step': None,
 'estimator__max_depth': None,
 'estimator__max_leaves': None,
 'estimator__min_child_weight': None,
 'estimator__missing': nan,
 'estimator__monotone_constraints': None,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__num_par