In [72]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
%cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb

/content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb


In [74]:
import pandas as pd
import numpy as np

import plotly.express as px

In [75]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

from sklearn.model_selection import RandomizedSearchCV

In [76]:
import functions

In [77]:
imdb_df = pd.read_csv('imdb_with_topics.csv')

## Data Preparation

In [78]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director', 'action', 'adult',
       'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'film-noir', 'game-show', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
       'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
       'unknown', 'war', 'western', 'duration', 'imdb_rating', 'votes',
       'release_start', 'release_month', 'tv_series', 'title', 'synopsis',
       'actors', 'synopsis_lemmatized', 'topic'],
      dtype='object')

In [79]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161602 entries, 0 to 161601
Data columns (total 45 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   actor1               161602 non-null  int64  
 1   actor2               161602 non-null  int64  
 2   actor3               161602 non-null  int64  
 3   actor4               161602 non-null  int64  
 4   director             161602 non-null  object 
 5   action               161602 non-null  int64  
 6   adult                161602 non-null  int64  
 7   adventure            161602 non-null  int64  
 8   animation            161602 non-null  int64  
 9   biography            161602 non-null  int64  
 10  comedy               161602 non-null  int64  
 11  crime                161602 non-null  int64  
 12  documentary          161602 non-null  int64  
 13  drama                161602 non-null  int64  
 14  family               161602 non-null  int64  
 15  fantasy          

In [80]:
model_df = imdb_df.dropna()

In [81]:
model_df.shape

(161593, 45)

In [82]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [83]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Cleaning synopsis

In [84]:
import re
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

In [85]:
# deleting rows with no synopsis
model_df = model_df[model_df['synopsis'] != 'Add a Plot'].dropna(subset=['synopsis']).reset_index(drop=True)

In [86]:

# Removing punctuation
model_df['synopsis'] = model_df['synopsis'].map(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', x))
# Converting the text to lowercase
model_df['synopsis'] = model_df['synopsis'].map(lambda x: x.lower())
# Removing 'see full summary'
model_df['synopsis'] = model_df['synopsis'].map(lambda x: re.sub('see full summary\xa0»', '', x))
# Deleting unnecessary spaces
model_df['synopsis'] = model_df['synopsis'].str.strip()

## Model training

In [96]:
dep_var = f'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat', 'synopsis_lemmatized'])

In [97]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
X_train.shape, X_test.shape

((129274, 43), (32319, 43))

In [100]:
# !pip install catboost

In [101]:
X.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director', 'action', 'adult',
       'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'film-noir', 'game-show', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv',
       'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
       'unknown', 'war', 'western', 'duration', 'votes', 'release_start',
       'release_month', 'tv_series', 'title', 'synopsis', 'actors', 'topic'],
      dtype='object')

In [103]:
from catboost import Pool, CatBoostClassifier

cat_features = ['actors', 'director']
text_features = ['synopsis', 'title']

train_dataset = Pool(data=X_train,
                     label=y_train,
                     cat_features=cat_features,
                     text_features=text_features)

eval_dataset = Pool(data=X_test,
                    label=y_test,
                    cat_features=cat_features,
                    text_features=text_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100,
                           learning_rate=0.1,
                           depth=5,
                           eval_metric='AUC')
# Fit model
model.fit(train_dataset)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset,
                          prediction_type='RawFormulaVal')

0:	total: 8.96s	remaining: 14m 47s
1:	total: 16.3s	remaining: 13m 17s
2:	total: 24.3s	remaining: 13m 6s
3:	total: 32.5s	remaining: 12m 59s
4:	total: 41.5s	remaining: 13m 9s
5:	total: 49.1s	remaining: 12m 49s
6:	total: 56.5s	remaining: 12m 30s
7:	total: 1m 5s	remaining: 12m 36s
8:	total: 1m 14s	remaining: 12m 34s
9:	total: 1m 22s	remaining: 12m 20s
10:	total: 1m 28s	remaining: 11m 59s
11:	total: 1m 36s	remaining: 11m 48s
12:	total: 1m 45s	remaining: 11m 43s
13:	total: 1m 55s	remaining: 11m 48s
14:	total: 2m 3s	remaining: 11m 41s
15:	total: 2m 12s	remaining: 11m 34s
16:	total: 2m 22s	remaining: 11m 33s
17:	total: 2m 30s	remaining: 11m 26s
18:	total: 2m 39s	remaining: 11m 19s
19:	total: 2m 47s	remaining: 11m 10s
20:	total: 2m 55s	remaining: 11m 1s
21:	total: 3m 3s	remaining: 10m 51s
22:	total: 3m 14s	remaining: 10m 50s
23:	total: 3m 22s	remaining: 10m 41s
24:	total: 3m 30s	remaining: 10m 32s
25:	total: 3m 40s	remaining: 10m 26s
26:	total: 3m 47s	remaining: 10m 15s
27:	total: 3m 55s	remain

In [104]:
model.score(X_test, y_test)

0.42612704601008694

In [105]:
roc_auc_score(y_test, model.predict_proba(X_test), multi_class = 'ovr')

0.8177981696132852