In [1]:
# Mount Google Drive inside the Colab runtime; the files then appear under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the imdb project folder the working directory so relative file paths resolve against it.
%cd /content/drive/MyDrive/imdb

/content/drive/.shortcut-targets-by-id/1SiIa67aNIcyTlPf8uVRMLga9M1m-O1L1/imdb


In [3]:
import pandas as pd
import numpy as np

import plotly.express as px

In [4]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

from sklearn.model_selection import RandomizedSearchCV

In [5]:
import functions

In [6]:
# Load the dataset from the current working directory (relative path).
imdb_df = pd.read_csv('imdb_encoded.csv')

## Data Preparation

In [7]:
# Show every column name in the loaded frame.
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director', 'action', 'adult',
       'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'film-noir', 'game-show', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
       'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
       'unknown', 'war', 'western', 'duration', 'imdb_rating', 'votes',
       'release_start', 'release_month', 'tv_series', 'title', 'synopsis',
       'actors'],
      dtype='object')

In [8]:
# Print per-column non-null counts and dtypes to spot missing values and non-numeric columns.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183967 entries, 0 to 183966
Data columns (total 43 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   actor1         183967 non-null  int64  
 1   actor2         183967 non-null  int64  
 2   actor3         183967 non-null  int64  
 3   actor4         183967 non-null  int64  
 4   director       183967 non-null  object 
 5   action         183967 non-null  int64  
 6   adult          183967 non-null  int64  
 7   adventure      183967 non-null  int64  
 8   animation      183967 non-null  int64  
 9   biography      183967 non-null  int64  
 10  comedy         183967 non-null  int64  
 11  crime          183967 non-null  int64  
 12  documentary    183967 non-null  int64  
 13  drama          183967 non-null  int64  
 14  family         183967 non-null  int64  
 15  fantasy        183967 non-null  int64  
 16  film-noir      183967 non-null  int64  
 17  game-show      183967 non-nul

In [9]:
# Build the modelling frame: remove the title, synopsis, actors and director
# columns, then discard every row that still contains a missing value.
model_df = (
    imdb_df
    .drop(columns=['title', 'synopsis', 'actors', 'director'])
    .dropna()
)

In [10]:
# (rows, columns) of the modelling frame.
model_df.shape

(183959, 39)

In [11]:
# Distribution of the raw rating values.
px.histogram(model_df, x='imdb_rating')

Binning the continuous rating variable into groups so that rating prediction can be treated as a classification task.

In [12]:
# Bucket the rating into integer classes using unit-wide bins with edges
# 0, 1, ..., 10. With right=True each bin is closed on its upper edge, and
# labels=False stores the bin's integer position instead of an interval label.
model_df['imdb_rating_cat'] = pd.cut(
    model_df['imdb_rating'],
    bins=list(range(0, 11)),
    right=True,
    labels=False,
)
# Same histogram as before, now coloured by the assigned class.
px.histogram(model_df, x='imdb_rating', color='imdb_rating_cat')

In [13]:
# Target column name, and the feature columns: everything except the raw
# rating and the binned rating derived from it.
dep_var = 'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', dep_var])

In [14]:
# Feature matrix and target vector.
X, y = model_df[indep_vars], model_df[dep_var]

In [15]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

((147167, 38), (36792, 38))

## Model training

In [17]:
# Candidate numbers of boosted trees: 200, 400, ..., 2000.
n_estimators = list(range(200, 2001, 200))
# Candidate maximum tree depths: 10, 20, ..., 110, plus None
# (None leaves the depth at the estimator's own default).
max_depth = [*range(10, 111, 10), None]
# Search space handed to the randomised search.
random_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}
random_grid

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Randomised hyper-parameter search over `random_grid`.
# Base model whose parameters are tuned.
xgb = XGBClassifier()
# Sample 10 parameter combinations (n_iter), evaluate each with 5-fold
# cross-validation (cv) and run the fits on all available cores (n_jobs=-1).
# The target is the binned rating, i.e. a multiclass label. The plain
# 'roc_auc' scorer only supports binary targets, so the one-vs-rest
# multiclass variant 'roc_auc_ovr' is used instead.
xgb_random = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=random_grid,
    n_iter=10,
    scoring='roc_auc_ovr',
    cv=5,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model.
xgb_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
