In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored there can be reached with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch the working directory to the project's training folder on Drive;
# relative paths used later in the notebook (e.g. data/<name>.csv) resolve against it.
%cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/training

/content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/training


In [4]:
import pandas as pd
import numpy as np

import plotly.express as px

In [5]:
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [7]:
# Base name (without extension) of the dataset file stored under data/.
df_name = 'imdb_encoded'

# Resolve the CSV location relative to the current working directory and load it.
csv_path = f'data/{df_name}.csv'
imdb_df = pd.read_csv(csv_path)

## Data Preparation

In [8]:
# Show the column names of the loaded frame.
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'duration', 'imdb_rating',
       'votes', 'release_start', 'release_month', 'tv_series', 'title',
       'synopsis', 'director', 'actors'],
      dtype='object')

In [9]:
# Print per-column dtypes and non-null counts to spot missing values.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183967 entries, 0 to 183966
Data columns (total 44 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   actor1         183967 non-null  int64  
 1   actor2         183967 non-null  int64  
 2   actor3         183967 non-null  int64  
 3   actor4         183967 non-null  int64  
 4   director_enc   183967 non-null  int64  
 5   action         183967 non-null  int64  
 6   adult          183967 non-null  int64  
 7   adventure      183967 non-null  int64  
 8   animation      183967 non-null  int64  
 9   biography      183967 non-null  int64  
 10  comedy         183967 non-null  int64  
 11  crime          183967 non-null  int64  
 12  documentary    183967 non-null  int64  
 13  drama          183967 non-null  int64  
 14  family         183967 non-null  int64  
 15  fantasy        183967 non-null  int64  
 16  film-noir      183967 non-null  int64  
 17  game-show      183967 non-nul

In [10]:
# Columns to discard before modelling, keyed by dataset name
# (presumably the raw text fields -- confirm against the data dictionary).
# Resolving the list through a lookup guarantees that an unrecognised
# `df_name` fails immediately with a clear message instead of raising a
# NameError on a fresh kernel, or silently reusing a stale `cols_to_drop`
# left behind by an earlier run of this cell.
cols_to_drop_by_dataset = {
    'imdb_encoded': ['title', 'synopsis', 'actors', 'director'],
}
if df_name not in cols_to_drop_by_dataset:
    raise ValueError(
        f"No column drop-list defined for dataset {df_name!r}; "
        f"known datasets: {sorted(cols_to_drop_by_dataset)}"
    )
cols_to_drop = cols_to_drop_by_dataset[df_name]

# Remove the listed columns, then discard every row that still has a missing value.
model_df = imdb_df.drop(columns=cols_to_drop).dropna()

In [11]:
# (rows, columns) left after dropping the unused columns and incomplete rows.
model_df.shape

(183959, 40)

In [12]:
# Columns that remain available for modelling.
model_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'duration', 'imdb_rating',
       'votes', 'release_start', 'release_month', 'tv_series'],
      dtype='object')

In [13]:
# Histogram of the imdb_rating column, before any binning.
px.histogram(model_df, 'imdb_rating')

Binning the continuous `imdb_rating` variable into groups so that the problem can be treated as a classification task.

In [14]:
# Integer bin edges 0, 1, ..., 10; with right=True each bin is closed on the
# right: (0, 1], (1, 2], ..., (9, 10].
rating_bin_edges = list(range(11))

# labels=False stores the integer index of the bin instead of an interval label.
model_df['imdb_rating_cat'] = pd.cut(
    model_df['imdb_rating'],
    bins=rating_bin_edges,
    right=True,
    labels=False,
)

# Same histogram as before, now coloured by the assigned bin.
px.histogram(model_df, 'imdb_rating', color='imdb_rating_cat')

In [15]:
# Target column: the binned rating class.
# (Plain string literal -- there is nothing to interpolate, so no f-prefix.)
dep_var = 'imdb_rating_cat'

# Predictors: every column except the raw rating (the target is derived from
# it, so keeping it would leak the answer) and the target itself. Reusing
# `dep_var` keeps the exclusion list in sync if the target name ever changes.
indep_vars = model_df.columns.drop(['imdb_rating', dep_var])

In [16]:
# Feature matrix (predictor columns) and target vector (binned rating).
X, y = model_df[indep_vars], model_df[dep_var]

In [17]:
# Check the feature set: neither imdb_rating nor imdb_rating_cat should be listed.
X.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'duration', 'votes',
       'release_start', 'release_month', 'tv_series'],
      dtype='object')

In [18]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y -- confirm every rating class
# is represented in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

((147167, 39), (36792, 39))

## Model training

In [20]:
# Baseline XGBoost classifier: only the number of trees and the seed are set
# explicitly, everything else is left at the library defaults.
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Score of the classifier on the held-out split.
test_accuracy = xgb_model.score(X_test, y_test)
print(test_accuracy)

# One-vs-rest ROC AUC computed from the predicted class probabilities.
test_proba = xgb_model.predict_proba(X_test)
print(roc_auc_score(y_test, test_proba, multi_class='ovr'))

0.39062839747771255
0.755643469183482


In [22]:
# Hyper-parameter grid for the XGBoost search:
# 4 tree counts x 3 depths = 12 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[3, 4, 5],
)

In [None]:
# Exhaustive grid search over `param_grid` with 5-fold cross-validation.
# 'multi:softprob' is the XGBoost multi-class objective that returns one
# probability per class, which the 'roc_auc_ovr' scorer needs; the string
# 'multi:softproba' is not among XGBoost's documented objectives.
xgb = XGBClassifier(objective='multi:softprob', random_state=42)
search = GridSearchCV(
    xgb,
    param_grid,
    cv=5,
    scoring='roc_auc_ovr',  # one-vs-rest ROC AUC
    verbose=3,
    n_jobs=-1,  # run the fits in parallel on all available cores
)
search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
# Score on the held-out test split. GridSearchCV.score applies the scorer
# passed as `scoring` to the best estimator refit on the training data.
search.score(X_test, y_test)

In [None]:
# Best hyper-parameter combination found by the grid search, together with its
# mean cross-validated score. (`get_params()` would only echo the search
# object's own constructor arguments, not the outcome of the tuning.)
search.best_params_, search.best_score_