In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import os

In [3]:
titles = pd.read_csv('../../preprocessing/titles_preprocessed.csv')
credits = pd.read_csv('../../preprocessing/credits_preprocessed.csv')

In [4]:
titles.head()

Unnamed: 0,id,title,type,description,release_year,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity
0,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],0.0,tt0017925,8.2,89766.0,8.647
1,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,92,"['comedy', 'drama', 'romance']",['US'],0.0,tt0032599,7.8,57835.0,11.27
2,tm19424,Detour,MOVIE,"The life of Al Roberts, a pianist in a New Yor...",1945,66,"['thriller', 'drama', 'crime']",['US'],0.0,tt0037638,7.3,17233.0,7.757
3,tm112005,Marihuana,MOVIE,A young girl named Burma attends a beach party...,1936,57,"['crime', 'drama']",['US'],0.0,tt0026683,4.0,864.0,3.748
4,tm22806,Intolerance: Love's Struggle Throughout the Ages,MOVIE,"The story of a poor young woman, separated by ...",1916,197,"['history', 'drama']",['US'],0.0,tt0006864,7.7,15242.0,9.412


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [6]:
# I'm going to use the K Nearest Neighbours method to classify the actors and directors into different groups:
# 1. High rated
# 2. Medium rated
# 3. Low rated
# based on their IMDB score.


In [7]:
# Preprocessing the data

# Calculate the average IMDb score for each person_id
person_scores_df = titles.groupby('id')['imdb_score'].mean().reset_index()
person_scores_df.rename(columns={'imdb_score': 'average_imdb_score'}, inplace=True)

# Merge the average IMDb scores with the actors/directors dataset
merged_df = pd.merge(credits, person_scores_df, on='id', how='left')

# Drop rows with missing average IMDb scores
merged_df.dropna(subset=['average_imdb_score'], inplace=True)

# IMDB score bins (0-4 - Low, 4-7 - Medium, 7-10 - High)
bins = [0, 4, 7, 10]
# Class labels
labels = ['Low-Rated', 'Medium-Rated', 'High-Rated']  

# Bin the IMDb scores into different ranges and assign labels
merged_df['imdb_score_bin'] = pd.cut(merged_df['average_imdb_score'], bins=bins, labels=labels)

merged_df

Unnamed: 0,person_id,id,name,character,role,average_imdb_score,imdb_score_bin
0,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR,8.200000,High-Rated
1,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR,8.200000,High-Rated
2,28714,tm19248,Glen Cavender,Captain Anderson,ACTOR,8.200000,High-Rated
3,28715,tm19248,Jim Farley,General Thatcher,ACTOR,8.200000,High-Rated
4,27348,tm19248,Frederick Vroom,A Southern General,ACTOR,8.200000,High-Rated
...,...,...,...,...,...,...,...
38898,321201,tm1055776,Piotr Trzaskalski,director,DIRECTOR,6.046369,Medium-Rated
38899,28071,tm975981,Cher,Herself,ACTOR,7.600000,High-Rated
38900,399383,tm975981,Nick Daley,Narrator,ACTOR,7.600000,High-Rated
38901,1032862,tm975981,Jonathan Finnigan,director,DIRECTOR,7.600000,High-Rated


In [8]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report
from termcolor import colored

In [9]:
def report(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division=True))

In [10]:
# Preparing the data for KNN classification

# Splitting the data into feature X and target Y
X = merged_df[['person_id', 'average_imdb_score']]  # Features: person_id and imdb_score from actors/directors dataset
y = merged_df['imdb_score_bin']  # Target: IMDb score bin from merged dataset

# Splitting the data into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=41)


### Checking for outliers

In [11]:
from scipy import stats

z_scores = stats.zscore(titles['imdb_score'])
outliers = titles[abs(z_scores) > 3]

outliers

Unnamed: 0,id,title,type,description,release_year,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity
732,tm92812,Track of the Moon Beast,MOVIE,"Professor ""Johnny Longbow"" Salina, a man who r...",1976,90,"['scifi', 'horror']",['US'],0.0,tt0075343,2.2,3651.0,3.361
813,tm13572,Curse of Bigfoot,MOVIE,A group of high school students on an archaeol...,1978,88,['horror'],['US'],0.0,tt0074365,1.8,683.0,1.487
1370,ts25975,My Super Sweet 16,SHOW,My Super Sweet 16 is a MTV reality series docu...,2005,21,"['reality', 'documentation']",['US'],10.0,tt0445890,1.7,3092.0,3.667
2194,tm416271,Another Soul,MOVIE,A couple on the run battle to save their daugh...,2018,78,"['horror', 'thriller']",['US'],0.0,tt5165620,2.2,291.0,4.868
2395,ts215433,Baby Shark's Big Show!,SHOW,"PINKFONG's popular brand, Baby Shark, is back ...",2020,13,"['animation', 'comedy', 'family', 'fantasy']",['US'],2.0,tt10518284,2.1,534.0,13.710777
2400,ts89519,Ryan's Mystery Playdate,SHOW,"Ryan's Mystery Playdate follows Ryan, his pare...",2019,23,['family'],['US'],3.0,tt14778646,1.8,407.0,3.504
2524,ts280372,AwesomenessTV's Next Influencer,SHOW,ATV's Next Influencer follows a group of up-an...,2020,27,['reality'],['US'],3.0,tt13918642,2.1,104.0,0.91
2616,tm433045,Gully,MOVIE,"A slightly dystopian vision of LA, we follow t...",2021,81,"['drama', 'crime']",['US'],0.0,tt5013984,1.7,17911.0,21.533


Since we don't have many outliers, we can leave them.

In [12]:
# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Train the KNN classifier
knn.fit(X_train, y_train)

In [13]:
y_pred = knn.predict(X_test)

In [14]:
# Evaluate the KNN model
report(knn, X_train, y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-Rated       0.75      0.53      0.62      6058
   Low-Rated       0.67      0.32      0.43      1332
Medium-Rated       0.85      0.95      0.90     21278

    accuracy                           0.83     28668
   macro avg       0.75      0.60      0.65     28668
weighted avg       0.82      0.83      0.82     28668



### Classification report

##### Precision: 
*true positives : (true positives + false positives)*

* **High-Rated class** - 75% of the predicted High-Rated instances were actually High-Rated.
* **Medium-Rated class** - 85% of the predicted Medium-Rated instances were actually Medium-Rated.
* **Low-Rated class** - 67% of the predicted Low-Rated instances were actually Low-Rated.

##### Recall
*true positives : (true positives + false negatives)*

* **High-Rated class** - The model correctly identified 53% of the actual High-Rated instances.
* **Medium-Rated class** - The model correctly identified 95% of the actual Medium-Rated instances.
* **Low-Rated class** - The model correctly identified 32% of the actual Low-Rated instances.

##### F1-score
*harmonic mean of precision and recall*

* **High-Rated class** - 0.62
* **Medium-Rated class** - 0.90
* **Low-Rated class** - 0.43

##### Support
*the number of samples or instances of each class in the test dataset*

* **High-Rated class** - 6058
* **Medium-Rated class** - 21278
* **Low-Rated class** - 1332

In [15]:
report(knn, X_test, y_test, "test")

[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-Rated       0.57      0.39      0.46      2021
   Low-Rated       0.42      0.25      0.31       436
Medium-Rated       0.81      0.91      0.86      7100

    accuracy                           0.77      9557
   macro avg       0.60      0.51      0.54      9557
weighted avg       0.74      0.77      0.75      9557



In [3]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a heatmap for the confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='PuRd',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

NameError: name 'y_test' is not defined

In [17]:
cm


array([[ 779,   28, 1214],
       [  38,  109,  289],
       [ 538,  125, 6437]], dtype=int64)

### Confusion matrix

##### For the Low-Rated class:
* 779 samples were correctly predicted as Low-Rated
* 28 samples were predicted as Medium-Rated but were actually Low-Rated
* 1214 samples were predicted as High-Rated but were actually Low-Rated

##### For the Medium-Rated class:
* 38 samples were predicted as Low-Rated but were actually Medium-Rated
* 109 samples were correctly predicted as Medium-Rated 
* 289 samples were predicted as High-Rated but were actually Medium-Rated 

##### For the High-Rated class:
* 28 samples were predicted as Low-Rated but were actually High-Rated
* 125 samples were predicted as Medium-Rated but were actually High-Rated
* 6437 samples were correctly predicted as High-Rated


### GridSearchCV


In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [2,4,6, 8]
         }
params_grid = {'n_neighbors': range(10, 50, 5),
               'weights': ['uniform', 'distance'],
                'p': [1, 2]}

#estimator = GridSearchCV(KNeighborsClassifier(), params_grid, cv=6)
estimator = GridSearchCV(KNeighborsClassifier(), params_grid, cv=6, verbose=4)

In [20]:
estimator.fit(X_train, y_train)

Fitting 6 folds for each of 32 candidates, totalling 192 fits
[CV 1/6] END n_neighbors=10, p=1, weights=uniform;, score=0.768 total time=   0.0s
[CV 2/6] END n_neighbors=10, p=1, weights=uniform;, score=0.766 total time=   0.0s
[CV 3/6] END n_neighbors=10, p=1, weights=uniform;, score=0.768 total time=   0.0s
[CV 4/6] END n_neighbors=10, p=1, weights=uniform;, score=0.764 total time=   0.0s
[CV 5/6] END n_neighbors=10, p=1, weights=uniform;, score=0.767 total time=   0.0s
[CV 6/6] END n_neighbors=10, p=1, weights=uniform;, score=0.770 total time=   0.0s
[CV 1/6] END n_neighbors=10, p=1, weights=distance;, score=0.791 total time=   0.0s
[CV 2/6] END n_neighbors=10, p=1, weights=distance;, score=0.786 total time=   0.0s
[CV 3/6] END n_neighbors=10, p=1, weights=distance;, score=0.796 total time=   0.0s
[CV 4/6] END n_neighbors=10, p=1, weights=distance;, score=0.785 total time=   0.0s
[CV 5/6] END n_neighbors=10, p=1, weights=distance;, score=0.789 total time=   0.0s
[CV 6/6] END n_neigh

[CV 4/6] END n_neighbors=30, p=1, weights=uniform;, score=0.757 total time=   0.0s
[CV 5/6] END n_neighbors=30, p=1, weights=uniform;, score=0.755 total time=   0.0s
[CV 6/6] END n_neighbors=30, p=1, weights=uniform;, score=0.758 total time=   0.0s
[CV 1/6] END n_neighbors=30, p=1, weights=distance;, score=0.800 total time=   0.0s
[CV 2/6] END n_neighbors=30, p=1, weights=distance;, score=0.793 total time=   0.0s
[CV 3/6] END n_neighbors=30, p=1, weights=distance;, score=0.798 total time=   0.0s
[CV 4/6] END n_neighbors=30, p=1, weights=distance;, score=0.795 total time=   0.0s
[CV 5/6] END n_neighbors=30, p=1, weights=distance;, score=0.796 total time=   0.0s
[CV 6/6] END n_neighbors=30, p=1, weights=distance;, score=0.808 total time=   0.0s
[CV 1/6] END n_neighbors=30, p=2, weights=uniform;, score=0.756 total time=   0.0s
[CV 2/6] END n_neighbors=30, p=2, weights=uniform;, score=0.759 total time=   0.0s
[CV 3/6] END n_neighbors=30, p=2, weights=uniform;, score=0.756 total time=   0.0

In [21]:
# Let's see what the best parameters are based on the estimator:

estimator.best_params_

{'n_neighbors': 45, 'p': 1, 'weights': 'distance'}

In [22]:
# What about the best score?

estimator.best_score_

0.8010673922143154

In [23]:
report(estimator.best_estimator_, X_train, y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-Rated       1.00      1.00      1.00      6058
   Low-Rated       1.00      1.00      1.00      1332
Medium-Rated       1.00      1.00      1.00     21278

    accuracy                           1.00     28668
   macro avg       1.00      1.00      1.00     28668
weighted avg       1.00      1.00      1.00     28668



In [24]:
report(estimator.best_estimator_, X_test, y_test, "test")


[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-Rated       0.70      0.42      0.52      2021
   Low-Rated       0.62      0.34      0.44       436
Medium-Rated       0.82      0.94      0.88      7100

    accuracy                           0.80      9557
   macro avg       0.71      0.56      0.61      9557
weighted avg       0.79      0.80      0.78      9557



## Ensembles 

In [25]:
from sklearn.ensemble import BaggingClassifier

In [26]:
baggingKnn = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=25)
baggingKnn.fit(X_train, y_train)

report(baggingKnn, X_train, y_train)
report(baggingKnn, X_test, y_test, "test")


[32mClassification report for model BaggingClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-Rated       0.81      0.51      0.63      6058
   Low-Rated       0.76      0.30      0.43      1332
Medium-Rated       0.85      0.97      0.90     21278

    accuracy                           0.84     28668
   macro avg       0.81      0.59      0.65     28668
weighted avg       0.83      0.84      0.82     28668

[32mClassification report for model BaggingClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-Rated       0.63      0.37      0.46      2021
   Low-Rated       0.50      0.24      0.32       436
Medium-Rated       0.81      0.93      0.86      7100

    accuracy                           0.78      9557
   macro avg       0.64      0.51  

## Feature Engineering

Feature engineering involves creating new features or transforming existing features to provide additional information to the model for better predictions.
In our case, we're going to use TMDB popularity scores as a feature that could improve the accuracy of our classification model.

In [27]:
# Preprocessing the data

# Calculating the average IMDb score and average TMDB popularity for each person_id
person_scores_fe = titles.groupby('id').agg({'imdb_score': 'mean', 'tmdb_popularity': 'mean'}).reset_index()
person_scores_fe.rename(columns={'imdb_score': 'average_imdb_score', 'tmdb_popularity': 'average_tmdb_popularity'}, inplace=True)

# Merging the average IMDb scores and average TMDB popularity with the actors/directors dataset
merged_fe = pd.merge(credits, person_scores_fe, on='id', how='left')

# Drop rows with missing average IMDb scores or TMDB popularity
merged_fe.dropna(subset=['average_imdb_score', 'average_tmdb_popularity'], inplace=True)

merged_fe

Unnamed: 0,person_id,id,name,character,role,average_imdb_score,average_tmdb_popularity
0,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR,8.200000,8.647
1,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR,8.200000,8.647
2,28714,tm19248,Glen Cavender,Captain Anderson,ACTOR,8.200000,8.647
3,28715,tm19248,Jim Farley,General Thatcher,ACTOR,8.200000,8.647
4,27348,tm19248,Frederick Vroom,A Southern General,ACTOR,8.200000,8.647
...,...,...,...,...,...,...,...
38898,321201,tm1055776,Piotr Trzaskalski,director,DIRECTOR,6.046369,0.898
38899,28071,tm975981,Cher,Herself,ACTOR,7.600000,1.758
38900,399383,tm975981,Nick Daley,Narrator,ACTOR,7.600000,1.758
38901,1032862,tm975981,Jonathan Finnigan,director,DIRECTOR,7.600000,1.758


#### Normalizing average TMDB popularity

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Normalize TMDB popularity using Min-Max scaling
scaler = MinMaxScaler()
merged_fe['normalized_tmdb_popularity'] = scaler.fit_transform(merged_fe['average_tmdb_popularity'].values.reshape(-1, 1))

In [29]:
# Defining bins for IMDb scores and normalized TMDB popularity
imdb_bins = [0, 4, 7, 10]  # Low-rated, Medium-rated, High-rated
tmdb_bins = [-1, -0.5, 0.5, 1]    # Low-popularity, Medium-popularity, High-popularity
imdb_labels = ['Low-rated', 'Medium-rated', 'High-rated']
tmdb_labels = ['Low-popularity', 'Medium-popularity', 'High-popularity']

merged_fe['imdb_score_bin'] = pd.cut(merged_fe['average_imdb_score'], bins=imdb_bins, labels=imdb_labels)
merged_fe['tmdb_popularity_bin'] = pd.cut(merged_fe['normalized_tmdb_popularity'], bins=tmdb_bins, labels=tmdb_labels)

merged_fe

Unnamed: 0,person_id,id,name,character,role,average_imdb_score,average_tmdb_popularity,normalized_tmdb_popularity,imdb_score_bin,tmdb_popularity_bin
0,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
1,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
2,28714,tm19248,Glen Cavender,Captain Anderson,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
3,28715,tm19248,Jim Farley,General Thatcher,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
4,27348,tm19248,Frederick Vroom,A Southern General,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
...,...,...,...,...,...,...,...,...,...,...
38898,321201,tm1055776,Piotr Trzaskalski,director,DIRECTOR,6.046369,0.898,0.000065,Medium-rated,Medium-popularity
38899,28071,tm975981,Cher,Herself,ACTOR,7.600000,1.758,0.000253,High-rated,Medium-popularity
38900,399383,tm975981,Nick Daley,Narrator,ACTOR,7.600000,1.758,0.000253,High-rated,Medium-popularity
38901,1032862,tm975981,Jonathan Finnigan,director,DIRECTOR,7.600000,1.758,0.000253,High-rated,Medium-popularity


In [30]:
# Split the dataset into features and target
X_fe = merged_fe[['person_id', 'normalized_tmdb_popularity', 'average_imdb_score']]
y_fe = merged_fe['imdb_score_bin']

merged_fe

Unnamed: 0,person_id,id,name,character,role,average_imdb_score,average_tmdb_popularity,normalized_tmdb_popularity,imdb_score_bin,tmdb_popularity_bin
0,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
1,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
2,28714,tm19248,Glen Cavender,Captain Anderson,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
3,28715,tm19248,Jim Farley,General Thatcher,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
4,27348,tm19248,Frederick Vroom,A Southern General,ACTOR,8.200000,8.647,0.001758,High-rated,Medium-popularity
...,...,...,...,...,...,...,...,...,...,...
38898,321201,tm1055776,Piotr Trzaskalski,director,DIRECTOR,6.046369,0.898,0.000065,Medium-rated,Medium-popularity
38899,28071,tm975981,Cher,Herself,ACTOR,7.600000,1.758,0.000253,High-rated,Medium-popularity
38900,399383,tm975981,Nick Daley,Narrator,ACTOR,7.600000,1.758,0.000253,High-rated,Medium-popularity
38901,1032862,tm975981,Jonathan Finnigan,director,DIRECTOR,7.600000,1.758,0.000253,High-rated,Medium-popularity


In [31]:
merged_fe = merged_fe.dropna(subset=["average_imdb_score", "normalized_tmdb_popularity"])

In [32]:
# Splitting the data into testing and training sets
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X_fe, y_fe, test_size=0.20, random_state=41)

# Initialize the KNN classifier
knn_fe = KNeighborsClassifier()

In [33]:
# Train the KNN classifier
knn_fe.fit(X_train_fe, y_train_fe)

In [34]:
report(knn_fe, X_train_fe, y_train_fe)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

  High-rated       0.76      0.53      0.63      6466
   Low-rated       0.65      0.32      0.43      1415
Medium-rated       0.85      0.95      0.90     22699

    accuracy                           0.83     30580
   macro avg       0.75      0.60      0.65     30580
weighted avg       0.82      0.83      0.82     30580



In [42]:
y_pred_fe = knn_fe.predict(X_train_fe)

In [54]:
cm_fe = confusion_matrix(y_train_fe, y_pred_fe)
cm

array([[ 779,   28, 1214],
       [  38,  109,  289],
       [ 538,  125, 6437]], dtype=int64)