In [13]:
import numpy as np
import pickle
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Seeded NumPy random generator for reproducible randomness.
# NOTE(review): in the cells visible here `rand` is not passed to anything
# (train_test_split uses random_state=0) -- confirm whether it is still needed.
SEED = 34215
rand = np.random.RandomState(SEED)

In [4]:
# Load the preprocessed array from disk; the path is relative to the
# notebook's working directory.
DATA_PATH = 'preprocessed_data.npy'
data = np.load(DATA_PATH)

In [5]:
# Display the (NumPy-summarised) array together with its shape as a sanity check.
(data, data.shape)

(array([[   0. ,    0. ,    0. , ...,  162. , 2009. ,    7.2],
        [   0. ,    0. ,    0. , ...,  169. , 2007. ,    6.9],
        [   0. ,    0. ,    0. , ...,  148. , 2015. ,    6.3],
        ...,
        [   0. ,    0. ,    0. , ...,  111. , 1997. ,    7.4],
        [   0. ,    0. ,    0. , ...,   77. , 2004. ,    6.9],
        [   0. ,    0. ,    0. , ...,   81. , 1992. ,    6.6]]), (3517, 8262))

In [6]:
# Every column except the last is a feature; the last column is the
# regression target.
X, y = data[:, :-1], data[:, -1]

In [7]:
# Hold out a test set. test_size=0.25 is scikit-learn's default when neither
# test_size nor train_size is given; it is spelled out here for readability.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [11]:
# Hyper-parameter search: 3-fold cross-validated grid search over tree depth
# and number of boosting rounds for a GPU-trained XGBoost regressor.
print("Parameter optimization")

# 3 x 3 = 9 candidate settings, each fitted on 3 folds.
param_grid = {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200],
}

xgb_model = xgb.XGBRegressor(
    tree_method='gpu_hist',          # GPU histogram tree builder
    gpu_id=0,                        # first GPU device
    objective='reg:squarederror',
)

# No `scoring` is passed, so the search ranks candidates with the estimator's
# own `score` method (R^2 for scikit-learn-style regressors).
clf = GridSearchCV(xgb_model, param_grid, cv=3, verbose=2)

# `verbose=False` is forwarded to the XGBoost fit call; GridSearchCV's own
# verbosity is set in the constructor above.
clf.fit(X_train, y_train, verbose=False)

print(clf.best_score_)
print(clf.best_params_)

Parameter optimization
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... max_depth=2, n_estimators=50, total=   4.3s
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV] ..................... max_depth=2, n_estimators=50, total=   5.1s
[CV] max_depth=2, n_estimators=50 ....................................
[CV] ..................... max_depth=2, n_estimators=50, total=   4.2s
[CV] max_depth=2, n_estimators=100 ...................................
[CV] .................... max_depth=2, n_estimators=100, total=   5.2s
[CV] max_depth=2, n_estimators=100 ...................................
[CV] .................... max_depth=2, n_estimators=100, total=   4.6s
[CV] max_depth=2, n_estimators=100 ...................................
[CV] .................... max_depth=2, n_estimators=100, total=   5.0s
[CV] max_depth=2, n_estimators=200 ...................................
[CV] .................... max_depth=2, n_estimators=200, total=   5.9s
[CV] max_depth=2, n_estimators=200 ...................................
[CV] .................... max_depth=2, n_estimators=200, total=   5.8s
[CV] max_depth=2, n_estimators=200 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  3.3min finished


0.23671996367546547
{'max_depth': 4, 'n_estimators': 100}


In [14]:
# Evaluate the best model found by the grid search on the held-out test set.
best_model = clf.best_estimator_
predictions = best_model.predict(X_test)
actuals = y_test

# Both metrics take (y_true, y_pred) in that order.
test_mse = mean_squared_error(actuals, predictions)
print(test_mse)
test_r2 = r2_score(actuals, predictions)
print(test_r2)

0.5945348632837225
0.2680816295703452


In [None]:
# Optional persistence step, currently disabled (every line is commented out).
# If re-enabled as written, it would pickle the GridSearchCV object `clf` to
# "best_movies.pkl" and immediately load it back as `clf2`.
# NOTE(review): the open(...) handles below are never closed explicitly;
# prefer `with open(...) as f:` if this cell is revived. Only unpickle files
# you trust -- pickle.load can execute arbitrary code.
# print("Pickling model")
# pickle.dump(clf, open("best_movies.pkl", "wb"))
# clf2 = pickle.load(open("best_movies.pkl", "rb"))