In [66]:
import pandas as pd
import numpy as np

import plotly.express as px

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report
from catboost import Pool, CatBoostClassifier, cv
from sklearn.model_selection import GridSearchCV

In [68]:
imdb_df = pd.read_csv('data/imdb_encoded_with_topics.csv')

## Data Preparation

In [69]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors', 'synopsis_lemmatized',
       'topic'],
      dtype='object')

In [70]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161602 entries, 0 to 161601
Data columns (total 48 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   actor1               161602 non-null  int64  
 1   actor2               161602 non-null  int64  
 2   actor3               161602 non-null  int64  
 3   actor4               161602 non-null  int64  
 4   director_enc         161602 non-null  int64  
 5   action               161602 non-null  int64  
 6   adult                161602 non-null  int64  
 7   adventure            161602 non-null  int64  
 8   animation            161602 non-null  int64  
 9   biography            161602 non-null  int64  
 10  comedy               161602 non-null  int64  
 11  crime                161602 non-null  int64  
 12  documentary          161602 non-null  int64  
 13  drama                161602 non-null  int64  
 14  family               161602 non-null  int64  
 15  fantasy          

In [71]:
model_df = imdb_df.dropna()

In [72]:
model_df.shape

(160422, 48)

In [73]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [74]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Model training

In [75]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors', 'synopsis_lemmatized',
       'topic'],
      dtype='object')

In [76]:
dep_var = f'imdb_rating_cat'
indep_vars = ['genre', 'duration', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'director', 'actors', 'synopsis_lemmatized']

In [77]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [78]:
X.columns

Index(['genre', 'duration', 'votes', 'release_start', 'release_month',
       'tv_series', 'title', 'director', 'actors', 'synopsis_lemmatized'],
      dtype='object')

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
X_train.shape, X_test.shape

((128337, 10), (32085, 10))

In [88]:
cat_features = ['actors', 'director', 'genre']
text_features = ['title', 'synopsis_lemmatized']

model = CatBoostClassifier(iterations=100, loss_function='MultiClass')

grid = {'learning_rate': [1, 0.5],
        'depth': [4, 6, 10],
        }

search = GridSearchCV(model, grid, cv=5, scoring='roc_auc_ovr', verbose=3, n_jobs=-1)
search.fit(X_train, y_train, cat_features = cat_features, text_features = text_features)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [82]:
search.score(X_test, y_test)

0.7400699318719113

In [87]:
search.best_score_

0.7542179681602769