# 🎬 TMDB XGBoost Modeling Notebook

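This notebook builds on the earlier Exploratory Data Analysis (EDA): it re-creates the engineered features from that analysis, then trains and evaluates XGBoost models for two tasks — classification of movies into rating tiers and regression on `vote_average`.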

In [None]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Load the dataset
df = pd.read_csv('Dataset.csv')

# Feature Engineering (from EDA)
# log1p(x) = log(1 + x), so a budget/revenue of 0 maps to 0 instead of -inf.
df['budget_log'] = np.log1p(df['budget'])
df['revenue_log'] = np.log1p(df['revenue'])

# ROI is only defined when a positive budget is recorded. The previous
# denominator (budget + 1e-6) turned every zero-budget row into
# revenue * 1e6; those rows are now left as NaN, which XGBoost handles as a
# missing value.
budget_known = df['budget'].where(df['budget'] > 0)
df['roi'] = (df['revenue'] - budget_known) / budget_known

# Create classification labels from vote_average
def classify_score(v, high_threshold=7.5, medium_threshold=5.5):
    """Bucket a ``vote_average`` value into a rating tier.

    Parameters
    ----------
    v : float
        The score to classify.
    high_threshold : float, default 7.5
        Scores greater than or equal to this value are labelled ``'High'``.
    medium_threshold : float, default 5.5
        Scores greater than or equal to this value (and below
        ``high_threshold``) are labelled ``'Medium'``.

    Returns
    -------
    str
        ``'High'``, ``'Medium'`` or ``'Low'``.

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through
    to ``'Low'``; drop missing scores before applying this function.
    """
    if v >= high_threshold:
        return 'High'
    if v >= medium_threshold:
        return 'Medium'
    return 'Low'

# Apply label and filter missing values
# Rows without a vote_average cannot be labelled, so they are dropped first.
# .copy() makes the filtered frame independent before new columns are added.
df = df[df['vote_average'].notna()].copy()
df['score_class'] = df['vote_average'].apply(classify_score)

# Encode labels numerically for XGBoost
# (LabelEncoder and train_test_split come from the imports cell at the top.)
# `le` is kept around so integer codes can be mapped back to class names
# later via le.classes_ / le.inverse_transform.
le = LabelEncoder()
df['score_class_encoded'] = le.fit_transform(df['score_class'])

# Define features and target
features = ['budget_log', 'revenue_log', 'popularity', 'runtime', 'roi']
X = df[features]
y_class = df['score_class_encoded']

# Train/test split
# Stratified on the encoded label; fixed seed for a reproducible partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    stratify=y_class,
    random_state=42,
)


In [4]:
# Define features and targets
features = ['budget_log', 'revenue_log', 'popularity', 'runtime', 'roi']
X = df[features]
# Classification target: the integer-encoded tier, not the raw
# 'High'/'Medium'/'Low' strings. Recent XGBoost releases reject string labels
# in XGBClassifier.fit, so reassigning y_class to df['score_class'] here broke
# the classifier cells that follow.
y_class = df['score_class_encoded']
# Regression target: the raw vote_average.
y_reg = df['vote_average']

In [5]:
# Classification: Train/Test Split
# 80/20 partition, stratified on the class target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    stratify=y_class,
    random_state=42,
)

In [None]:



# Train XGBoost Classifier
# `use_label_encoder` is no longer passed: the option is deprecated in
# XGBoost and only produces warnings. Labels are expected to be encoded
# explicitly (see the LabelEncoder `le` fitted earlier in the notebook).
clf = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss',
)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
# Rows are true classes, columns are predicted classes, in sorted label order.
print(confusion_matrix(y_test, y_pred))
# target_names maps the encoded class ids back to their readable tier names.
print(classification_report(y_test, y_pred, target_names=le.classes_))

In [None]:
# Tune Classifier with GridSearchCV
# The grid has 2 * 3 * 3 * 3 * 3 = 162 parameter combinations; with cv=3 each
# combination is fitted three times.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
# The estimator uses the same seed as the baseline classifier so tuned and
# untuned models are comparable. The deprecated `use_label_encoder` argument
# is dropped. Scoring is stated explicitly (accuracy is also the default for
# classifiers).
grid = GridSearchCV(
    XGBClassifier(eval_metric='mlogloss', random_state=42),
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)


In [None]:
# Fit every combination in param_grid with cross-validation on the training split.
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)
# Mean cross-validated score of the winning configuration.
print("Best CV score:", grid.best_score_)
# GridSearchCV refits the best configuration on the full training split by
# default, so grid.score evaluates that tuned model on the held-out test data.
print("Test score (tuned model):", grid.score(X_test, y_test))

In [None]:
# SHAP and Feature Importance for Classifier
# importance_type='weight' (number of splits using a feature) is
# plot_importance's default, spelled out here; the title distinguishes this
# figure from the regressor's importance plot.
xgb.plot_importance(
    clf,
    importance_type='weight',
    title='Classifier feature importance (split count)',
)
plt.tight_layout()
plt.show()

explainer = shap.Explainer(clf)
shap_values = explainer(X_test)

# For a multiclass model the explanation has a trailing class axis
# (samples x features x classes). beeswarm only accepts a 2-D explanation,
# so draw one plot per class instead of passing the whole object.
if len(shap_values.shape) == 3:
    for class_idx in range(shap_values.shape[2]):
        print(f"SHAP beeswarm for class: {le.classes_[class_idx]}")
        shap.plots.beeswarm(shap_values[:, :, class_idx])
else:
    shap.plots.beeswarm(shap_values)

In [None]:
# Regression: Train/Test Split
# 80/20 partition with a fixed seed; no stratification for the continuous target.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X,
    y_reg,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train XGBoost Regressor
# Same hyperparameters as the baseline classifier.
reg = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
reg.fit(X_train_reg, y_train_reg)

# Predict on the held-out split and report error / explained variance.
y_pred_reg = reg.predict(X_test_reg)

print(
    'MSE:',
    mean_squared_error(y_test_reg, y_pred_reg),
)
print(
    'R² Score:',
    r2_score(y_test_reg, y_pred_reg),
)

In [None]:
# SHAP and Feature Importance for Regressor
# importance_type='weight' (number of splits using a feature) is
# plot_importance's default, spelled out here; the title distinguishes this
# figure from the classifier's importance plot.
xgb.plot_importance(
    reg,
    importance_type='weight',
    title='Regressor feature importance (split count)',
)
plt.tight_layout()
plt.show()

# A single-output regressor yields a 2-D explanation (samples x features),
# which beeswarm can plot directly.
explainer_reg = shap.Explainer(reg)
shap_values_reg = explainer_reg(X_test_reg)
shap.plots.beeswarm(shap_values_reg)