In [3]:
# Importing cleaned data
import pandas as pd
final_df = pd.read_parquet("data/final_movie_data.parquet")
print(final_df.head())

      tconst  startYear  runtimeMinutes                      genres  \
0  tt0000574       1906            70.0  Action,Adventure,Biography   
1  tt0002130       1911            71.0     Adventure,Drama,Fantasy   
2  tt0002423       1919           113.0     Biography,Drama,Romance   
3  tt0002844       1913            54.0                 Crime,Drama   
4  tt0003014       1913            96.0                       Drama   

   averageRating  numVotes                      directors  \
0            6.0      1013                      nm0846879   
1            7.0      3879  nm0078205,nm0655824,nm0209738   
2            6.6      1087                      nm0523932   
3            6.9      2655                      nm0275421   
4            7.0      1549                      nm0803705   

                         writers    actor_1    actor_2    actor_3  
0                      nm0846879  nm0846887  nm0846894  nm1431224  
1                      nm0019604  nm0660139  nm0685283  nm0209738  
2 

In [4]:
# One-hot encoding genres
genres_dummies = final_df['genres'].str.replace(' ', '').str.replace(',', '|').str.get_dummies('|')
final_df = pd.concat([final_df, genres_dummies], axis=1)
final_df = final_df.drop(columns=['genres'])
print(final_df.head())


      tconst  startYear  runtimeMinutes  averageRating  numVotes  \
0  tt0000574       1906            70.0            6.0      1013   
1  tt0002130       1911            71.0            7.0      3879   
2  tt0002423       1919           113.0            6.6      1087   
3  tt0002844       1913            54.0            6.9      2655   
4  tt0003014       1913            96.0            7.0      1549   

                       directors                        writers    actor_1  \
0                      nm0846879                      nm0846879  nm0846887   
1  nm0078205,nm0655824,nm0209738                      nm0019604  nm0660139   
2                      nm0523932            nm0266183,nm0473134  nm0624470   
3                      nm0275421  nm0019855,nm0275421,nm0816232  nm0622772   
4                      nm0803705            nm0472236,nm0803705  nm0096737   

     actor_2    actor_3  ...  Music  Musical  Mystery  News  Romance  Sci-Fi  \
0  nm0846894  nm1431224  ...      0       

In [5]:
# Making df for directors
directors = final_df[['tconst', 'directors']].dropna()
directors['directors_nconst'] = directors['directors'].str.split(',')
directors = directors.explode('directors_nconst')

# Making df for writers
writers  = final_df[['tconst', 'writers']].dropna()
writers['writers_nconst'] = writers['writers'].str.split(',')
writers = writers.explode('writers_nconst')


In [6]:
# Getting avg ratings for directors, writers, and actors

movie_ratings  = final_df.set_index('tconst')['averageRating']

director_ratings = pd.merge(directors, movie_ratings, on='tconst')
director_scores  = director_ratings.groupby('directors_nconst')['averageRating'].mean()

writer_ratings = pd.merge(writers, movie_ratings, on='tconst')
writer_scores  = writer_ratings.groupby('writers_nconst')['averageRating'].mean()

actor_1_ratings = final_df.groupby('actor_1')['averageRating'].mean()
actor_2_ratings = final_df.groupby('actor_2')['averageRating'].mean()
actor_3_ratings = final_df.groupby('actor_3')['averageRating'].mean()

actor_scores = pd.concat([actor_1_ratings, actor_2_ratings, actor_3_ratings]).groupby(level=0).mean()


In [7]:
# Adding scores for directors, writers, and actors to final_df as columns for training
final_df['director_score'] = final_df['directors'].str.split(',').str[0].map(director_scores)
final_df['writer_score'] = final_df['writers'].str.split(',').str[0].map(writer_scores)

final_df['actor_1_score'] = final_df['actor_1'].map(actor_scores)
final_df['actor_2_score'] = final_df['actor_2'].map(actor_scores)
final_df['actor_3_score'] = final_df['actor_3'].map(actor_scores)

overall_avg_rating = final_df['averageRating'].mean()
final_df.fillna({'director_score': overall_avg_rating,
                  'writer_score': overall_avg_rating,
                  'actor_1_score': overall_avg_rating,
                  'actor_2_score': overall_avg_rating,
                  'actor_3_score': overall_avg_rating}, inplace=True)
print(final_df.head())

      tconst  startYear  runtimeMinutes  averageRating  numVotes  \
0  tt0000574       1906            70.0            6.0      1013   
1  tt0002130       1911            71.0            7.0      3879   
2  tt0002423       1919           113.0            6.6      1087   
3  tt0002844       1913            54.0            6.9      2655   
4  tt0003014       1913            96.0            7.0      1549   

                       directors                        writers    actor_1  \
0                      nm0846879                      nm0846879  nm0846887   
1  nm0078205,nm0655824,nm0209738                      nm0019604  nm0660139   
2                      nm0523932            nm0266183,nm0473134  nm0624470   
3                      nm0275421  nm0019855,nm0275421,nm0816232  nm0622772   
4                      nm0803705            nm0472236,nm0803705  nm0096737   

     actor_2    actor_3  ...  Sci-Fi  Sport  Thriller  War  Western  \
0  nm0846894  nm1431224  ...       0      0        

In [8]:
# Defining features and target variable for baseline training
X = final_df.drop(columns=['tconst', 'averageRating', 'numVotes','directors', 'writers', 'actor_1', 'actor_2', 'actor_3'])
y = final_df['averageRating']

In [9]:
# Making train test split 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Training model and testing with baseline parameters

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train the model
xgbr = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, objective='reg:squarederror')
xgbr.fit(X_train, y_train)

# Make predictions
y_pred = xgbr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"XGBoost Regressor MSE: {mse}")
print(f"XGBoost Regressor R^2: {r2}")

XGBoost Regressor MSE: 0.20132098848270114
XGBoost Regressor R^2: 0.8579807258760823


In [11]:
import pandas as pd

# Get feature importance scores from the trained model
feature_importances = xgbr.feature_importances_

# Create a pandas Series to make it easier to view and sort
importances = pd.Series(feature_importances, index=X.columns)

# Sort the feature importances in descending order and print them
sorted_importances = importances.sort_values(ascending=False)

print("Feature Importances (from most to least important):")
print(sorted_importances)

Feature Importances (from most to least important):
director_score    0.408644
writer_score      0.376370
actor_3_score     0.060442
actor_2_score     0.033722
actor_1_score     0.019372
Drama             0.010963
Biography         0.010865
Documentary       0.009511
Crime             0.006806
Sport             0.005089
runtimeMinutes    0.004987
startYear         0.004696
Western           0.004504
Music             0.004392
Fantasy           0.004141
Adventure         0.004066
Comedy            0.003923
Mystery           0.003599
History           0.002946
Film-Noir         0.002738
War               0.002536
Animation         0.002442
Action            0.002439
Thriller          0.002383
Sci-Fi            0.002204
Horror            0.002152
Romance           0.002034
Family            0.001291
Musical           0.000744
News              0.000000
dtype: float32


In [12]:
# Selecting only the top 8 features

top_feature_names = sorted_importances.index[:8]
print("Top 8 Feature Names:")
print(top_feature_names)
X_top_features = X[top_feature_names]
print(X_top_features)

Top 8 Feature Names:
Index(['director_score', 'writer_score', 'actor_3_score', 'actor_2_score',
       'actor_1_score', 'Drama', 'Biography', 'Documentary'],
      dtype='object')
       director_score  writer_score  actor_3_score  actor_2_score  \
0            6.000000      6.000000       6.000000       6.000000   
1            7.000000      6.700000       7.000000       7.000000   
2            7.200000      6.600000       6.525000       7.425000   
3            6.957143      6.825000       6.833333       6.843333   
4            7.528571      7.000000       7.000000       7.000000   
...               ...           ...            ...            ...   
46173        6.700000      6.800000       6.150000       6.125000   
46174        5.450000      5.185714       5.500000       6.050000   
46175        5.240000      7.400000       6.700000       7.200000   
46176        5.800000      5.800000       6.887500       5.822222   
46177        6.400000      6.400000       6.175000       6.58

In [13]:
# Training new model on selected features

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import xgboost as xgb
import  numpy as np

X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size=0.2, random_state=42)

xgbr_simple = xgb.XGBRegressor(n_estimators=100, max_depth = 5, learning_rate=0.1, objective='reg:squarederror')
xgbr_simple.fit(X_train, y_train)

# Make predictions
y_pred = xgbr_simple.predict(X_test)
r2_simple = r2_score(y_test, y_pred)
mae_simple = mean_absolute_error(y_test, y_pred)

print(f"R^2 (test): {r2_simple:.4f}")
print(f"MAE (test): {mae_simple:.4f}")

R^2 (test): 0.8532
MAE (test): 0.3053


In [14]:
# Checking for overfitting
train_predictions = xgbr_simple.predict(X_train)
r2_train = r2_score(y_train, train_predictions)

print(f"R-squared (R²) on Training Data: {r2_train:.4f}")
print(f"R-squared (R²) on Testing Data:  {r2_simple:.4f}")

R-squared (R²) on Training Data: 0.8666
R-squared (R²) on Testing Data:  0.8532


In [15]:
import joblib
import json

joblib.dump(xgbr_simple, "artifacts/model.joblib")

director_scores_dict = director_scores.to_dict()
writer_scores_dict = writer_scores.to_dict()
actor_scores_dict = actor_scores.to_dict()

with open("artifacts/director_scores.json", "w") as f:
    json.dump(director_scores_dict, f)

with open("artifacts/writer_scores.json", "w") as f:
    json.dump(writer_scores_dict, f)

with open("artifacts/actor_scores.json", "w") as f:
    json.dump(actor_scores_dict, f)

training_genre_columns = [col for col in X_top_features if col not in ['director_score', 'writer_score', 'actor_1_score','actor_2_score','actor_3_score']]
with open("artifacts/genre_columns.json", "w") as f:
    json.dump(training_genre_columns, f)

# Save the exact order of columns the model was trained on
training_columns = list(X_top_features.columns)
with open("artifacts/training_columns.json", "w") as f:
    json.dump(training_columns, f)

print("Training columns saved successfully!")

Training columns saved successfully!
