In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/imdb

/content/drive/.shortcut-targets-by-id/1SiIa67aNIcyTlPf8uVRMLga9M1m-O1L1/imdb


In [3]:
import pandas as pd
import numpy as np

import plotly.express as px

In [4]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

In [5]:
import functions

In [6]:
imdb_df = pd.read_csv('imdb_encoded.csv')

## Data Preparation

In [7]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director', 'action', 'adult',
       'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'film-noir', 'game-show', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
       'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
       'unknown', 'war', 'western', 'duration', 'imdb_rating', 'votes',
       'release_start', 'release_month', 'tv_series', 'title', 'synopsis',
       'actors'],
      dtype='object')

In [8]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183967 entries, 0 to 183966
Data columns (total 43 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   actor1         183967 non-null  int64  
 1   actor2         183967 non-null  int64  
 2   actor3         183967 non-null  int64  
 3   actor4         183967 non-null  int64  
 4   director       183967 non-null  object 
 5   action         183967 non-null  int64  
 6   adult          183967 non-null  int64  
 7   adventure      183967 non-null  int64  
 8   animation      183967 non-null  int64  
 9   biography      183967 non-null  int64  
 10  comedy         183967 non-null  int64  
 11  crime          183967 non-null  int64  
 12  documentary    183967 non-null  int64  
 13  drama          183967 non-null  int64  
 14  family         183967 non-null  int64  
 15  fantasy        183967 non-null  int64  
 16  film-noir      183967 non-null  int64  
 17  game-show      183967 non-nul

In [9]:
model_df = imdb_df.drop(['title', 'synopsis', 'actors', 'director'], axis=1).dropna()

In [10]:
model_df.shape

(183959, 39)

In [11]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [12]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')

In [13]:
dep_var = f'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat'])

In [14]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train.shape, X_test.shape

((147167, 38), (36792, 38))

## Model training

In [17]:
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model.score(X_test, y_test)

KeyboardInterrupt: ignored

In [None]:
print(classification_report(y_test, model.predict(X_test)))


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00       103
           2       0.00      0.00      0.00       583
           3       0.00      0.00      0.00      1753
           4       0.00      0.00      0.00      4151
           5       0.25      0.01      0.01      8290
           6       0.32      0.98      0.48     11662
           7       0.42      0.02      0.04      7903
           8       0.86      0.01      0.02      2115
           9       0.00      0.00      0.00       230

    accuracy                           0.32     36792
   macro avg       0.18      0.10      0.05     36792
weighted avg       0.30      0.32      0.16     36792




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
roc_auc_score(y_test, model.predict_proba(X_test), multi_class = 'ovr')

0.5587911663155726

In [None]:
xgb_model = XGBClassifier(max_depth = 5).fit(X_train, y_train)

In [None]:
xgb_model.score(X_test, y_test)

0.4010654490106545

In [None]:
roc_auc_score(y_test, xgb_model.predict_proba(X_test), multi_class = 'ovr')

0.7523888720620393

## Random Forest with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators
              #  'max_features': max_features,
              #  'max_depth': max_depth,
              #  'min_samples_split': min_samples_split,
              #  'min_samples_leaf': min_samples_leaf,
              #  'bootstrap': bootstrap}
              }
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, scoring = 'roc_auc', cv = 5, verbose=5, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.

exception calling callback for <Future at 0x7fe5b1c82cd0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.7/dist-packages/

TerminatedWorkerError: ignored

## Naive Bayes on Synopsis

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing.preprocessing import remove_stopwords
import re

In [None]:
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'action', 'adult', 'adventure', 'animation',
       'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
       'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
       'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
       'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
       'tv_series'],
      dtype='object')

In [None]:
text_df = imdb_df[['imdb_rating', 'synopsis']].dropna()
# Revome add plot
text_df = text_df[text_df['synopsis'] != 'Add a Plot'].dropna(subset=['synopsis']).reset_index(drop=True)
# Remove punctuation
text_df['synopsis'] = text_df['synopsis'].map(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', x))
# Convert the titles to lowercase
text_df['synopsis'] = text_df['synopsis'].map(lambda x: x.lower())
# Removing 'see full summary'
text_df['synopsis'] = text_df['synopsis'].map(lambda x: re.sub('see full summary\xa0»', '', x))
# Print out the first rows of papers
text_df['synopsis'] = text_df['synopsis'].str.strip()
# Remove stopwords
text_df['synopsis'] = [remove_stopwords(i) for i in text_df['synopsis']]

In [None]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79840 entries, 0 to 79839
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdb_rating  79840 non-null  float64
 1   synopsis     79840 non-null  object 
dtypes: float64(1), object(1)
memory usage: 1.2+ MB


In [None]:
np.unique(np.array([str(type(i)) for i in text_df['synopsis']]))

array(["<class 'str'>"], dtype='<U13')

In [None]:
text_df['imdb_rating_cat'] = pd.cut(text_df['imdb_rating'], bins=[0, 2, 4, 5, 6, 7, 8, 10], right=True, labels=['0-2', '2-4', '4-5', '5-6', '6-7', '7-8', '8-10']) 

In [None]:
text_df.imdb_rating_cat.value_counts()

6-7     25714
7-8     18157
5-6     17268
4-5      8563
8-10     5413
2-4      4514
0-2       211
Name: imdb_rating_cat, dtype: int64

In [None]:
text_df_tmp = text_df.iloc[0:70000]

In [None]:
# Build the model
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Train the model using the training data
model.fit(text_df['synopsis'], text_df['imdb_rating_cat'])
# Predict the categories of the test data
predicted_categories = model.predict(text_df['synopsis'])

In [None]:
model.score(text_df['synopsis'], text_df['imdb_rating_cat'])

0.4258892785571142