In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

In [2]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report

In [3]:
import functions

In [4]:
imdb_df = pd.read_csv('data/final_imdb.csv')

In [23]:
imdb_df.actors[0].strip("']['").split(', ')

["John Dall'", "'Peggy Cummins'", "'Berry Kroeger'", "'Morris Carnovsky"]

## Data Preparation

In [10]:
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'action', 'adult', 'adventure', 'animation',
       'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
       'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
       'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
       'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
       'tv_series'],
      dtype='object')

In [11]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89674 entries, 0 to 89673
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   actors            89674 non-null  object 
 1   director          89359 non-null  object 
 2   duration          81430 non-null  float64
 3   genre             88365 non-null  object 
 4   imdb_rating       89674 non-null  float64
 5   link              89674 non-null  object 
 6   synopsis          89672 non-null  object 
 7   title             89673 non-null  object 
 8   votes             89674 non-null  float64
 9   page_url          89674 non-null  object 
 10  page_url_cleaned  89674 non-null  object 
 11  release_start     89674 non-null  float64
 12  action            89674 non-null  int64  
 13  adult             89674 non-null  int64  
 14  adventure         89674 non-null  int64  
 15  animation         89674 non-null  int64  
 16  biography         89674 non-null  int64 

In [12]:
model_df = imdb_df[[
    'imdb_rating', 'duration',  'votes',
    'release_start', 'action', 'adult', 'adventure', 'animation',
    'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
    'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
    'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
    'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
    'tv_series'
]].dropna()

In [9]:
model_df.shape

(81430, 34)

In [10]:
px.histogram(model_df, 'imdb_rating')

Binning the countinous variable into groups to be able to achieve classification tasks.

In [47]:
model_df['imdb_rating_cat'] = pd.cut(model_df['imdb_rating'], bins=[0, 2, 4, 5, 6, 7, 8, 10], right=True, labels=False) 
px.histogram(model_df, 'imdb_rating', color = 'imdb_rating_cat')

In [48]:
dep_var = f'imdb_rating_cat'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat'])

In [49]:
X = model_df[indep_vars]
y = model_df[dep_var]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
X_train.shape, X_test.shape

((65144, 33), (16286, 33))

## Model training

In [52]:
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model.score(X_test, y_test)

0.3240206312169962

In [64]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        37
           1       0.00      0.00      0.00       947
           2       0.00      0.00      0.00      1809
           3       0.00      0.00      0.00      3626
           4       0.32      0.99      0.49      5249
           5       0.43      0.02      0.04      3594
           6       0.90      0.01      0.02      1024

    accuracy                           0.32     16286
   macro avg       0.24      0.15      0.08     16286
weighted avg       0.26      0.32      0.17     16286




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [54]:
roc_auc_score(y_test, model.predict_proba(X_test), multi_class = 'ovr')

0.5152430078471484

In [61]:
f1_score(y_test, model.predict(X_test), average=None)

array([0.        , 0.        , 0.        , 0.        , 0.48653963,
       0.03829787, 0.01740812])

## Naive Bayes on Synopsis

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing.preprocessing import remove_stopwords
import re

In [14]:
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'action', 'adult', 'adventure', 'animation',
       'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
       'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
       'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
       'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
       'tv_series'],
      dtype='object')

In [69]:
text_df = imdb_df[['imdb_rating', 'synopsis']].dropna()
# Revome add plot
text_df = text_df[text_df['synopsis'] != 'Add a Plot'].dropna(subset=['synopsis']).reset_index(drop=True)
# Remove punctuation
text_df['synopsis'] = text_df['synopsis'].map(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', x))
# Convert the titles to lowercase
text_df['synopsis'] = text_df['synopsis'].map(lambda x: x.lower())
# Removing 'see full summary'
text_df['synopsis'] = text_df['synopsis'].map(lambda x: re.sub('see full summary\xa0»', '', x))
# Print out the first rows of papers
text_df['synopsis'] = text_df['synopsis'].str.strip()
# Remove stopwords
text_df['synopsis'] = [remove_stopwords(i) for i in text_df['synopsis']]

In [70]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79840 entries, 0 to 79839
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdb_rating  79840 non-null  float64
 1   synopsis     79840 non-null  object 
dtypes: float64(1), object(1)
memory usage: 1.2+ MB


In [71]:
np.unique(np.array([str(type(i)) for i in text_df['synopsis']]))

array(["<class 'str'>"], dtype='<U13')

In [72]:
text_df['imdb_rating_cat'] = pd.cut(text_df['imdb_rating'], bins=[0, 2, 4, 5, 6, 7, 8, 10], right=True, labels=['0-2', '2-4', '4-5', '5-6', '6-7', '7-8', '8-10']) 

In [73]:
text_df.imdb_rating_cat.value_counts()

6-7     25714
7-8     18157
5-6     17268
4-5      8563
8-10     5413
2-4      4514
0-2       211
Name: imdb_rating_cat, dtype: int64

In [74]:
text_df_tmp = text_df.iloc[0:70000]

In [75]:
# Build the model
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Train the model using the training data
model.fit(text_df['synopsis'], text_df['imdb_rating_cat'])
# Predict the categories of the test data
predicted_categories = model.predict(text_df['synopsis'])

In [76]:
model.score(text_df['synopsis'], text_df['imdb_rating_cat'])

0.4258892785571142