In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# %cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb

In [3]:
import pandas as pd
import numpy as np

import plotly.express as px

In [4]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

from sklearn.model_selection import RandomizedSearchCV

In [5]:
import functions

In [7]:
imdb_df = pd.read_csv('data/imdb_not_encoded.csv')

## Data Preparation

In [8]:
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'release_month', 'tv_series'],
      dtype='object')

In [9]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183967 entries, 0 to 183966
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   actors            183967 non-null  object 
 1   director          183967 non-null  object 
 2   duration          183967 non-null  float64
 3   genre             181315 non-null  object 
 4   imdb_rating       183967 non-null  float64
 5   link              183967 non-null  object 
 6   synopsis          183962 non-null  object 
 7   title             183966 non-null  object 
 8   votes             183967 non-null  float64
 9   page_url          183967 non-null  object 
 10  page_url_cleaned  183967 non-null  object 
 11  release_start     183959 non-null  float64
 12  release_month     183967 non-null  int64  
 13  tv_series         183967 non-null  int64  
dtypes: float64(4), int64(2), object(8)
memory usage: 19.6+ MB


In [10]:
model_df = imdb_df.dropna()

In [11]:
model_df.shape

(181302, 14)

In [12]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [13]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Cleaning synopsis

In [26]:
import re
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

from catboost import Pool, CatBoostClassifier, cv

In [15]:
# deleting rows with no synopsis
model_df = model_df[model_df['synopsis'] != 'Add a Plot'].dropna(subset=['synopsis']).reset_index(drop=True)

In [16]:

# Removing punctuation
model_df['synopsis'] = model_df['synopsis'].map(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', x))
# Converting the text to lowercase
model_df['synopsis'] = model_df['synopsis'].map(lambda x: x.lower())
# Removing 'see full summary'
model_df['synopsis'] = model_df['synopsis'].map(lambda x: re.sub('see full summary\xa0»', '', x))
# Deleting unnecessary spaces
model_df['synopsis'] = model_df['synopsis'].str.strip()

## Model training

In [27]:
dep_var = f'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat', 'page_url', 'page_url_cleaned', 'link'])

In [28]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [29]:
X.columns

Index(['actors', 'director', 'duration', 'genre', 'synopsis', 'title', 'votes',
       'release_start', 'release_month', 'tv_series'],
      dtype='object')

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_train.shape, X_test.shape

((128339, 10), (32085, 10))

In [40]:
cat_features = ['actors', 'director', 'genre']
text_features = ['synopsis', 'title']

train_dataset = Pool(data=X_train,
                     label=y_train,
                     cat_features=cat_features,
                     text_features=text_features)

eval_dataset = Pool(data=X_test,
                    label=y_test,
                    cat_features=cat_features,
                    text_features=text_features)

params = {"iterations": 100,
          "depth": 2,
          "loss_function": "MultiClass",
          "eval_metric": "AUC",
          "verbose": True}

scores = cv(cv_dataset,
            params,
            fold_count=2, 
            plot="True")

# # Initialize CatBoostClassifier
# model = CatBoostClassifier(iterations=10,
#                            learning_rate=0.1,
#                            depth=5,
#                            eval_metric='AUC',
#                            loss_function='MultiClass',
#                            random_seed=42)
# # Fit model
# model.fit(train_dataset)
# # Get predicted classes
# preds_class = model.predict(eval_dataset)
# # Get predicted probabilities for each class
# preds_proba = model.predict_proba(eval_dataset)
# # Get predicted RawFormulaVal
# preds_raw = model.predict(eval_dataset,
#                           prediction_type='RawFormulaVal')

CatBoostError: Invalid loss_function='AUC': for classifier use Logloss, CrossEntropy, MultiClass, MultiClassOneVsAll or custom objective object

In [37]:
model.score(X_test, y_test)

0.37678042699080566

In [38]:
roc_auc_score(y_test, model.predict_proba(X_test), multi_class = 'ovr')

0.7314974857651076