<a href="https://colab.research.google.com/github/NBK-code/IMDb-Rating-Prediction/blob/main/IMDB_mrp_Hyperparameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error as mae

In [3]:
X_train = pd.read_csv('/content/X_train_scaled.csv')
X_val = pd.read_csv('/content/X_val_scaled.csv')
X_test = pd.read_csv('/content/X_test_scaled.csv')

In [4]:
y_train = pd.read_csv('/content/y_train_scaled.csv')
y_val = pd.read_csv('/content/y_val_scaled.csv')
y_test = pd.read_csv('/content/y_test_scaled.csv')

In [5]:
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((3727, 43), (466, 43), (466, 43), (3727, 2), (466, 2), (466, 2))

In [6]:
X_train.drop(columns=['Unnamed: 0'], inplace=True)
X_test.drop(columns=['Unnamed: 0'], inplace=True)
X_val.drop(columns=['Unnamed: 0'], inplace=True)

In [7]:
y_train.drop(columns=['Unnamed: 0'], inplace=True)
y_test.drop(columns=['Unnamed: 0'], inplace=True)
y_val.drop(columns=['Unnamed: 0'], inplace=True)

In [8]:
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
X_val_np = X_val.to_numpy()

In [9]:
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()
y_val_np = y_val.to_numpy()

In [10]:
X_train_np.shape, X_test_np.shape, X_val_np.shape

((3727, 42), (466, 42), (466, 42))

In [11]:
y_train_np.shape, y_test_np.shape, y_val_np.shape

((3727, 1), (466, 1), (466, 1))

In [12]:
y_train_np = y_train_np.flatten()
y_test_np = y_test_np.flatten()
y_val_np = y_val_np.flatten()

In [13]:
y_train_np.shape, y_test_np.shape, y_val_np.shape

((3727,), (466,), (466,))

In [15]:
xgb_regressor=XGBRegressor(n_estimators=100, max_depth = 6, verbosity = 0)

In [18]:
#Reproducing Previous Result

xgb_regressor.fit(X_train_np,y_train_np)
predictions = xgb_regressor.predict(X_test_np)
errors = abs(predictions - y_test_np)
mape = 100 * (errors / y_test_np)
accuracy = 100 - np.mean(mape)
print('Test Accuracy:', round(accuracy, 2), '%')
print('Test error:', round(mae(y_val_np, xgb_regressor.predict(X_test_np)),5))

Test Accuracy: 89.77 %
Test error: 1.00444


In [22]:
## Hyper Parameter Optimization
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]
base_score=[0.25,0.5,0.75,1]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

In [None]:
xgb_regressor = XGBRegressor()

In [24]:
# Set up the random search with 5-fold cross validation
random_cv = RandomizedSearchCV(estimator=xgb_regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [25]:
random_cv.fit(X_train_np,y_train_np)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5, estimator=XGBRegressor(max_depth=6, verbosity=0),
                   n_iter=50, n_jobs=4,
                   param_distributions={'base_score': [0.25, 0.5, 0.75, 1],
                                        'booster': ['gbtree', 'gblinear'],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2],
                                        'max_depth': [2, 3, 5, 10, 15],
                                        'min_child_weight': [1, 2, 3, 4],
                                        'n_estimators': [100, 500, 900, 1100,
                                                         1500]},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_absolute_error', verbose=5)

In [26]:
random_cv.best_estimator_

XGBRegressor(learning_rate=0.05, max_depth=15, min_child_weight=4,
             n_estimators=900, verbosity=0)

In [27]:
xgb_regressor = XGBRegressor(learning_rate=0.05, 
                             max_depth=15, min_child_weight=4,
                             n_estimators=900, verbosity=0)

In [28]:
xgb_regressor.fit(X_train_np,y_train_np)
predictions = xgb_regressor.predict(X_test_np)
errors = abs(predictions - y_test_np)
mape = 100 * (errors / y_test_np)
accuracy = 100 - np.mean(mape)
print('Test Accuracy:', round(accuracy, 2), '%')
print('Test error:', round(mae(y_val_np, xgb_regressor.predict(X_test_np)),5))

Test Accuracy: 90.12 %
Test error: 1.01132
