In [1]:
import numpy as np
import pickle
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Legacy NumPy random generator with a fixed seed, so draws from it are repeatable.
# NOTE(review): no later cell visible here passes `rand` to anything — confirm it is still needed.
rand = np.random.RandomState(seed=34215)

In [3]:
# Read the preprocessed array from disk; the path is relative to the
# notebook's working directory.
data_new = np.load(file='preprocessed_data_new.npy')

In [4]:
# Work on `data` from here on. This binds a second name to the same array
# object (no copy is made), so in-place changes to one are visible via the other.
data = data_new

In [5]:
# Echo the array together with its shape as the cell's output.
data, data.shape

(array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 1.240e+02, 2.015e+03,
         6.500e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.200e+02, 2.015e+03,
         7.100e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.190e+02, 2.015e+03,
         6.300e+00],
        ...,
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+02, 1.966e+03,
         6.700e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 9.000e+01, 1.966e+03,
         6.100e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 7.400e+01, 1.966e+03,
         1.500e+00]]), (5021, 1524))

In [6]:
# Split the columns: everything except the last column is the feature matrix,
# the last column is the target vector that the regressors below are fit on.
X, y = data[:, :-1], data[:, -1]

In [7]:
# Hold out a test set. `random_state=0` makes the shuffle repeatable;
# the split proportion is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
print("Parameter optimization")

# Base estimator: XGBoost regressor with squared-error objective, trained with
# the GPU histogram tree method on device 0.
xgb_model = xgb.XGBRegressor(
    tree_method='gpu_hist',
    gpu_id=0,
    objective='reg:squarederror',
)

# Exhaustive search: 3 candidate values for each of 5 hyper-parameters
# (3**5 = 243 combinations), each scored with 3-fold cross-validation.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid={
        'max_depth': [2, 4, 6],
        'n_estimators': [50, 100, 200],
        'lambda': [1, 2, 3],
        'alpha': [0, 1, 2],
        'eta': [0.3, 0.5, 0.7],
    },
    cv=3,
    verbose=2,
)

# `verbose=False` is forwarded to the regressor's own fit(); the search itself
# still logs progress because of `verbose=2` above.
clf.fit(X_train, y_train, verbose=False)

# Report the winning cross-validation score and the parameters that produced it.
print(clf.best_score_)
print(clf.best_params_)

In [36]:
# Best GridSearchCV result recorded from an earlier run (best_score_, then best_params_):
# 0.3274465860125423
# {'alpha': 0, 'eta': 0.3, 'lambda': 1, 'max_depth': 4, 'n_estimators': 200}

In [15]:
# Ground-truth targets of the held-out split; used as the reference values
# in the MSE / R2 computations below.
actuals = y_test

In [24]:
# Score the best estimator found by the grid search on the held-out test set.
predictions = clf.best_estimator_.predict(X_test)
print(f'MSE: {mean_squared_error(actuals, predictions)}')
print(f'R2: {r2_score(actuals, predictions)}')

MSE: 0.5137510044416175
R2: 0.312384126617107


In [8]:
# Stand-alone regressor configured with the best hyper-parameters recorded from
# the grid search (alpha 0, eta 0.3, lambda 1, max_depth 4, n_estimators 200).
xg_reg = xgb.XGBRegressor(
    tree_method='gpu_hist',
    gpu_id=0,
    objective='reg:squarederror',
    max_depth=4,
    alpha=0,
    reg_lambda=1,
    eta=0.3,
    n_estimators=200,
)

In [9]:
# Train on the training split, then predict the held-out rows.
# fit() returns the estimator itself, so the two steps can be chained.
preds = xg_reg.fit(X_train, y_train).predict(X_test)

In [12]:
# Test-set metrics for the stand-alone model, measured against the held-out targets.
print(f'MSE: {mean_squared_error(actuals, preds)}')
print(f'R2: {r2_score(actuals, preds)}')

MSE: 0.5137510044416175
R2: 0.312384126617107


In [13]:
print("Pickling model")

# Persist the fitted regressor. The context manager closes the handle, which
# guarantees the bytes are flushed to disk before the file is read back below.
with open("best_movies.pkl", "wb") as model_file:
    pickle.dump(xg_reg, model_file)

# Round-trip check: reload the model that was just written.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# produced itself, never pickles from an untrusted source.
with open("best_movies.pkl", "rb") as model_file:
    clf2 = pickle.load(model_file)

Pickling model
