In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report
from catboost import Pool, CatBoostClassifier, cv
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the encoded IMDb dataset (with topic column) from the project-relative data folder.
imdb_df = pd.read_csv(filepath_or_buffer='data/imdb_encoded_with_topics.csv')

## Data Preparation

In [4]:
# Show the column labels of the loaded frame.
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors', 'synopsis_lemmatized',
       'topic'],
      dtype='object')

In [5]:
# Print the per-column non-null counts and dtypes of the loaded frame.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161602 entries, 0 to 161601
Data columns (total 48 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   actor1               161602 non-null  int64  
 1   actor2               161602 non-null  int64  
 2   actor3               161602 non-null  int64  
 3   actor4               161602 non-null  int64  
 4   director_enc         161602 non-null  int64  
 5   action               161602 non-null  int64  
 6   adult                161602 non-null  int64  
 7   adventure            161602 non-null  int64  
 8   animation            161602 non-null  int64  
 9   biography            161602 non-null  int64  
 10  comedy               161602 non-null  int64  
 11  crime                161602 non-null  int64  
 12  documentary          161602 non-null  int64  
 13  drama                161602 non-null  int64  
 14  family               161602 non-null  int64  
 15  fantasy          

In [6]:
# Keep only rows that have no missing value in any column; the raw frame is left untouched.
model_df = imdb_df.dropna(axis=0, how='any')

In [7]:
# (rows, columns) remaining after dropping rows with missing values.
model_df.shape

(160422, 48)

In [8]:
# Distribution of the raw (continuous) IMDb rating.
px.histogram(data_frame=model_df, x='imdb_rating')

Binning the continuous rating into groups so that the problem can be treated as a classification task.

In [9]:
# Bin the continuous rating into integer classes using the right-closed
# intervals (0, 1], (1, 2], ..., (9, 10]; labels=False yields the bin index
# (0..9) instead of an Interval label.
#
# The column is added with .assign(), which returns a new DataFrame, rather
# than by item assignment on the dropna() result. Item assignment here raised
# pandas' SettingWithCopyWarning; .assign() avoids it and keeps the cell
# idempotent when re-run.
model_df = model_df.assign(
    imdb_rating_cat=pd.cut(
        model_df['imdb_rating'],
        bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        right=True,
        labels=False,
    )
)

# Same histogram as before, now coloured by the derived class to check the bin edges visually.
px.histogram(model_df, 'imdb_rating', color='imdb_rating_cat')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Model training

In [10]:
# Column labels of the originally loaded frame, used as a reference when picking features.
# Note: this inspects imdb_df, so the derived 'imdb_rating_cat' column
# (added to model_df) is not listed here.
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors', 'synopsis_lemmatized',
       'topic'],
      dtype='object')

In [11]:
# Target column: the binned rating class.
dep_var = 'imdb_rating_cat'

# Predictor columns fed to the model.
indep_vars = [
    'genre',
    'duration',
    'votes',
    'release_start',
    'release_month',
    'tv_series',
    'title',
    'director',
    'actors',
    'synopsis_lemmatized',
]

In [12]:
# Split the modelling frame into the feature matrix and the target vector.
X = model_df.loc[:, indep_vars]
y = model_df.loc[:, dep_var]

In [13]:
# Confirm that the feature matrix contains exactly the selected predictor columns.
X.columns

Index(['genre', 'duration', 'votes', 'release_start', 'release_month',
       'tv_series', 'title', 'director', 'actors', 'synopsis_lemmatized'],
      dtype='object')

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Shapes of the train and test feature matrices after the split.
X_train.shape, X_test.shape

((128337, 10), (32085, 10))

In [26]:
# Columns CatBoost should treat as categorical and as free text, respectively.
cat_features = ['actors', 'director', 'genre']
text_features = ['title', 'synopsis_lemmatized']

# Wrap the train split in a CatBoost Pool so the column roles travel with the data.
train_dataset = Pool(
    X_train,
    y_train,
    cat_features=cat_features,
    text_features=text_features,
)

# Same column roles for the held-out split.
eval_dataset = Pool(
    X_test,
    y_test,
    cat_features=cat_features,
    text_features=text_features,
)

In [18]:
# Training parameters for a short multiclass CatBoost run that also tracks AUC.
params = dict(
    loss_function='MultiClass',
    custom_loss='AUC',
    iterations=10,
    learning_rate=0.5,
    random_seed=42,
)

# 5-fold, shuffled, stratified cross-validation on the training pool.
cv_data = cv(
    pool=train_dataset,
    params=params,
    fold_count=5,
    stratified=True,
    shuffle=True,
    partition_random_seed=42,
    verbose=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 1.7562410	test: 1.7522488	best: 1.7522488 (0)	total: 7.31s	remaining: 1m 5s
1:	learn: 1.6335829	test: 1.6277405	best: 1.6277405 (1)	total: 13.9s	remaining: 55.7s
2:	learn: 1.5832596	test: 1.5779551	best: 1.5779551 (2)	total: 20.5s	remaining: 47.8s
3:	learn: 1.5233950	test: 1.5040351	best: 1.5040351 (3)	total: 27.7s	remaining: 41.5s
4:	learn: 1.4911665	test: 1.4683670	best: 1.4683670 (4)	total: 34.8s	remaining: 34.8s
5:	learn: 1.4703070	test: 1.4442740	best: 1.4442740 (5)	total: 42.1s	remaining: 28.1s
6:	learn: 1.4646369	test: 1.4396849	best: 1.4396849 (6)	total: 48.6s	remaining: 20.8s
7:	learn: 1.4536568	test: 1.4265161	best: 1.4265161 (7)	total: 55.3s	remaining: 13.8s
8:	learn: 1.4499167	test: 1.4234321	best: 1.4234321 (8)	total: 1m 2s	remaining: 6.92s
9:	learn: 1.4456287	test: 1.4185576	best: 1.4185576 (9)	total: 1m 9s	remaining: 0us

bestTest = 1.418557606
bestIteration = 9

Training on fold [1/5]
0:	learn: 1.7744649	test: 1.7733193	best: 1.7733193 (

In [37]:
# Display the per-iteration cross-validation metrics returned by cv().
cv_data

Unnamed: 0,iterations,test-MultiClass-mean,test-MultiClass-std,train-MultiClass-mean,train-MultiClass-std,test-AUC-mean,test-AUC-std
0,0,1.759956,0.01081,1.763232,0.010799,,
1,1,1.629908,0.006046,1.634469,0.004943,,
2,2,1.560736,0.019535,1.567254,0.014998,,
3,3,1.494957,0.008084,1.511948,0.00826,,
4,4,1.463447,0.003155,1.484266,0.004598,,
5,5,1.441807,0.003096,1.464652,0.004347,,
6,6,1.4304,0.005581,1.452441,0.007609,,
7,7,1.42213,0.005426,1.444111,0.007242,,
8,8,1.41819,0.004261,1.439575,0.007282,,
9,9,1.414279,0.003787,1.434713,0.007436,,


In [40]:
# Small multiclass CatBoost model used as the base estimator for the built-in grid search.
model = CatBoostClassifier(loss_function='MultiClass', iterations=5)

# Only the learning rate is searched for now. Candidates left disabled:
#   'depth': [4, 6, 10]
#   'l2_leaf_reg': [1, 3, 5, 7, 9]
grid = {'learning_rate': [1, 0.1]}

# Run CatBoost's grid search with 5-fold CV on the training pool; the returned
# dict (best params and CV results) is displayed as the cell output.
model.grid_search(param_grid=grid, X=train_dataset, cv=5, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.7029049	test: 1.6926557	best: 1.6926557 (0)	total: 464ms	remaining: 1.86s
1:	learn: 1.5976605	test: 1.5849377	best: 1.5849377 (1)	total: 906ms	remaining: 1.36s
2:	learn: 1.5278995	test: 1.5131610	best: 1.5131610 (2)	total: 1.4s	remaining: 932ms
3:	learn: 1.4858666	test: 1.4551280	best: 1.4551280 (3)	total: 1.97s	remaining: 493ms
4:	learn: 1.4727156	test: 1.4396113	best: 1.4396113 (4)	total: 2.44s	remaining: 0us

bestTest = 1.439611315
bestIteration = 4

0:	loss: 1.4396113	best: 1.4396113 (0)	total: 4.73s	remaining: 4.73s
0:	learn: 2.1525362	test: 2.1503242	best: 2.1503242 (0)	total: 563ms	remaining: 2.25s
1:	learn: 2.0405943	test: 2.0384004	best: 2.0384004 (1)	total: 1.1s	remaining: 1.66s
2:	learn: 1.9565216	test: 1.9535005	best: 1.9535005 (2)	total: 1.68s	remaining: 1.12s
3:	learn: 1.8863474	test: 1.8828495	best: 1.8828495 (3)	total: 2.31s	remaining: 579ms
4:	learn: 1.8323315	test: 1.8286035	best: 1.8286035 (4)	total: 2.82s	remaining: 0us

bestTest = 1.828603485
bestIterat

{'params': {'learning_rate': 1},
 'cv_results': defaultdict(list,
             {'iterations': [0, 1, 2, 3, 4],
              'test-MultiClass-mean': [1.6972951772341625,
               1.6222251364609312,
               1.5518330146803385,
               1.5144436823422331,
               1.4801437502491643],
              'test-MultiClass-std': [0.008204414387829309,
               0.01469226363155301,
               0.01934351670190613,
               0.05672760162515525,
               0.03919955509022627],
              'train-MultiClass-mean': [1.7052322034796912,
               1.6273642776520194,
               1.5576171974826518,
               1.5321304809046388,
               1.501437234759858],
              'train-MultiClass-std': [0.006454214866988337,
               0.014728590365594665,
               0.022074555936313002,
               0.05457242286205021,
               0.04296160636237955]})}

In [41]:
# One-vs-rest multiclass ROC AUC of the current model on the held-out split.
roc_auc_score(
    y_true=y_test,
    y_score=model.predict_proba(X_test),
    multi_class='ovr',
)

0.7400699318719113

In [46]:
# Column roles, restated so this cell can be run on its own.
cat_features = ['actors', 'director', 'genre']
text_features = ['title', 'synopsis_lemmatized']

# Fresh, unfitted estimator for scikit-learn's grid search.
model = CatBoostClassifier(loss_function='MultiClass', iterations=5)

# Only the learning rate is searched for now. Candidates left disabled:
#   'depth': [4, 6, 10]
#   'l2_leaf_reg': [1, 3, 5, 7, 9]
grid = {'learning_rate': [1, 0.1]}

# 5-fold grid search scored by one-vs-rest ROC AUC.
# NOTE(review): n_jobs=-1 runs the fits in parallel; CatBoost may also use
# several threads per fit - confirm this does not oversubscribe the CPUs.
search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='roc_auc_ovr',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# The column-role keyword arguments are passed to fit() alongside the training data.
search.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    text_features=text_features,
)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
