In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# %cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb

In [25]:
import pandas as pd
import numpy as np

import plotly.express as px

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report
from catboost import Pool, CatBoostClassifier, cv

In [5]:
import functions

In [29]:
imdb_df = pd.read_csv('data/imdb_encoded.csv')

## Data Preparation

In [30]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors'],
      dtype='object')

In [31]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183967 entries, 0 to 183966
Data columns (total 46 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   actor1         183967 non-null  int64  
 1   actor2         183967 non-null  int64  
 2   actor3         183967 non-null  int64  
 3   actor4         183967 non-null  int64  
 4   director_enc   183967 non-null  int64  
 5   action         183967 non-null  int64  
 6   adult          183967 non-null  int64  
 7   adventure      183967 non-null  int64  
 8   animation      183967 non-null  int64  
 9   biography      183967 non-null  int64  
 10  comedy         183967 non-null  int64  
 11  crime          183967 non-null  int64  
 12  documentary    183967 non-null  int64  
 13  drama          183967 non-null  int64  
 14  family         183967 non-null  int64  
 15  fantasy        183967 non-null  int64  
 16  film-noir      183967 non-null  int64  
 17  game-show      183967 non-nul

In [32]:
model_df = imdb_df.dropna()

In [33]:
model_df.shape

(181302, 46)

In [34]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [35]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Cleaning synopsis

In [15]:
# deleting rows with no synopsis
model_df = model_df[model_df['synopsis'] != 'Add a Plot'].dropna(subset=['synopsis']).reset_index(drop=True)

In [16]:

# Removing punctuation
model_df['synopsis'] = model_df['synopsis'].map(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', x))
# Converting the text to lowercase
model_df['synopsis'] = model_df['synopsis'].map(lambda x: x.lower())
# Removing 'see full summary'
model_df['synopsis'] = model_df['synopsis'].map(lambda x: re.sub('see full summary\xa0»', '', x))
# Deleting unnecessary spaces
model_df['synopsis'] = model_df['synopsis'].str.strip()

## Model training

In [40]:
dep_var = f'imdb_rating_cat'
indep_vars = ['duration', 'votes', 'release_start', 'release_month',
                'tv_series', 'title', 'director', 'actors', 'genre']

In [41]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [42]:
X.columns

Index(['duration', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'director', 'actors', 'genre'],
      dtype='object')

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
X_train.shape, X_test.shape

((145041, 9), (36261, 9))

In [63]:
cat_features = ['actors', 'director', 'genre']
text_features = ['title']

train_dataset = Pool(data=X_train,
                     label=y_train,
                     cat_features=cat_features,
                     text_features=text_features)

eval_dataset = Pool(data=X_test,
                    label=y_test,
                    cat_features=cat_features,
                    text_features=text_features)

params = {"iterations": 10,
          "depth": 2,
          "loss_function": "MultiClass",
          "custom_loss": "AUC",
          "verbose": True}

scores = cv(train_dataset,
            params,
            fold_count=2, 
            plot="True")

# # Initialize CatBoostClassifier
# model = CatBoostClassifier(iterations=100,
#                            learning_rate=0.01,
#                            depth=5,
#                            eval_metric='AUC',
#                            loss_function='MultiClass',
#                            random_seed=42)
# # Fit model
# model.fit(train_dataset)
# # Get predicted classes
# preds_class = model.predict(eval_dataset)
# # Get predicted probabilities for each class
# preds_proba = model.predict_proba(eval_dataset)
# # Get predicted RawFormulaVal
# preds_raw = model.predict(eval_dataset,
#                           prediction_type='RawFormulaVal')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/2]
0:	learn: 2.2610112	test: 2.2609218	best: 2.2609218 (0)	total: 424ms	remaining: 3.82s
1:	learn: 2.2230687	test: 2.2227844	best: 2.2227844 (1)	total: 923ms	remaining: 3.69s
2:	learn: 2.1889433	test: 2.1883935	best: 2.1883935 (2)	total: 1.32s	remaining: 3.09s
3:	learn: 2.1592708	test: 2.1586484	best: 2.1586484 (3)	total: 1.75s	remaining: 2.63s
4:	learn: 2.1300924	test: 2.1293412	best: 2.1293412 (4)	total: 2.14s	remaining: 2.14s
5:	learn: 2.1037426	test: 2.1027941	best: 2.1027941 (5)	total: 2.58s	remaining: 1.72s
6:	learn: 2.0790174	test: 2.0779415	best: 2.0779415 (6)	total: 2.98s	remaining: 1.28s
7:	learn: 2.0551869	test: 2.0540047	best: 2.0540047 (7)	total: 3.42s	remaining: 855ms
8:	learn: 2.0326547	test: 2.0313160	best: 2.0313160 (8)	total: 3.81s	remaining: 424ms
9:	learn: 2.0121139	test: 2.0104675	best: 2.0104675 (9)	total: 4.2s	remaining: 0us

bestTest = 2.010467485
bestIteration = 9

Training on fold [1/2]
0:	learn: 2.2609094	test: 2.2607949	best: 2.2607949 (0

In [64]:
scores

Unnamed: 0,iterations,test-MultiClass-mean,test-MultiClass-std,train-MultiClass-mean,train-MultiClass-std,test-AUC-mean,test-AUC-std
0,0,2.260858,9e-05,2.26096,7.2e-05,,
1,1,2.222718,9.4e-05,2.223011,8.2e-05,,
2,2,2.188114,0.000395,2.188578,0.000516,,
3,3,2.158414,0.000332,2.158967,0.000429,,
4,4,2.129036,0.000431,2.129778,0.000444,,
5,5,2.102495,0.000423,2.103446,0.00042,,
6,6,2.077664,0.000392,2.078797,0.000312,,
7,7,2.053989,2.3e-05,2.055299,0.000159,,
8,8,2.031375,8.4e-05,2.032831,0.000249,,
9,9,2.010756,0.000408,2.01242,0.000433,,


In [None]:
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics

eval_pool = Pool(X_validation, y_validation, cat_features=cat_features)
curve = get_roc_curve(model, eval_pool)
(fpr, tpr, thresholds) = curve
roc_auc = sklearn.metrics.auc(fpr, tpr)

In [None]:
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve

(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(16, 8))
lw = 2

plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc, alpha=0.5)

plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', alpha=0.5)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=20)
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [57]:
model.score(X_test, y_test)

0.3754722704834395

In [58]:
roc_auc_score(y_test, model.predict_proba(X_test), multi_class = 'ovr')

0.658229476481567