In [8]:
import pandas as pd
import numpy as np

import plotly.express as px

In [9]:
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [18]:
df_name = 'imdb_encoded'
imdb_df = pd.read_csv(f'data/{df_name}.csv')

## Data Preparation

In [19]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'duration', 'imdb_rating',
       'votes', 'release_start', 'release_month', 'tv_series', 'title',
       'synopsis', 'director', 'actors'],
      dtype='object')

In [20]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183967 entries, 0 to 183966
Data columns (total 44 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   actor1         183967 non-null  int64  
 1   actor2         183967 non-null  int64  
 2   actor3         183967 non-null  int64  
 3   actor4         183967 non-null  int64  
 4   director_enc   183967 non-null  int64  
 5   action         183967 non-null  int64  
 6   adult          183967 non-null  int64  
 7   adventure      183967 non-null  int64  
 8   animation      183967 non-null  int64  
 9   biography      183967 non-null  int64  
 10  comedy         183967 non-null  int64  
 11  crime          183967 non-null  int64  
 12  documentary    183967 non-null  int64  
 13  drama          183967 non-null  int64  
 14  family         183967 non-null  int64  
 15  fantasy        183967 non-null  int64  
 16  film-noir      183967 non-null  int64  
 17  game-show      183967 non-nul

In [21]:
if df_name == 'imdb_encoded':
    cols_to_drop = ['title', 'synopsis', 'actors', 'director']

model_df = imdb_df.drop(cols_to_drop, axis=1).dropna()

In [22]:
model_df.shape

(183959, 40)

In [24]:
model_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'duration', 'imdb_rating',
       'votes', 'release_start', 'release_month', 'tv_series'],
      dtype='object')

In [25]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [26]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')

In [27]:
dep_var = f'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat'])

In [28]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [29]:
X.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'duration', 'votes',
       'release_start', 'release_month', 'tv_series'],
      dtype='object')

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_train.shape, X_test.shape

((147167, 39), (36792, 39))

## Model training

In [33]:
#XGB model with default params
xgb_model = XGBClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print(xgb_model.score(X_test, y_test))
print(roc_auc_score(y_test, xgb_model.predict_proba(X_test), multi_class = 'ovr'))

0.40769732550554466
0.7936751724963026


In [44]:
# A parameter grid for XGBoost
param_grid = {
        'n_estimators':[100, 200, 300, 500],
        'max_depth': [3, 4, 5]
        }

In [46]:
xgb = XGBClassifier(objective = 'multi:softproba', random_state=42)
search = GridSearchCV(xgb, param_grid, cv=5, scoring='roc_auc_ovr', verbose=3)
search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END .....max_depth=3, n_estimators=100;, score=0.784 total time=  52.2s
[CV 2/5] END .....max_depth=3, n_estimators=100;, score=0.778 total time=  49.3s
[CV 3/5] END .....max_depth=3, n_estimators=100;, score=0.777 total time=  50.3s
[CV 4/5] END .....max_depth=3, n_estimators=100;, score=0.790 total time=  56.1s
[CV 5/5] END .....max_depth=3, n_estimators=100;, score=0.782 total time=  59.2s
[CV 1/5] END .....max_depth=3, n_estimators=200;, score=0.788 total time= 1.8min
[CV 2/5] END .....max_depth=3, n_estimators=200;, score=0.782 total time= 1.8min
[CV 3/5] END .....max_depth=3, n_estimators=200;, score=0.780 total time= 1.8min
[CV 4/5] END .....max_depth=3, n_estimators=200;, score=0.795 total time= 1.8min
[CV 5/5] END .....max_depth=3, n_estimators=200;, score=0.792 total time= 1.8min
[CV 1/5] END .....max_depth=3, n_estimators=300;, score=0.789 total time= 2.7min
[CV 2/5] END .....max_depth=3, n_estimators=300;

In [68]:
search.score(X_test, y_test)

0.7985763194655714

In [51]:
search.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     mi