In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [346]:
data = pd.read_csv('all_features_for_hp_tuning.csv')

In [368]:
data = data.set_index('reader_id')

In [374]:
X, y = data.iloc[:,1:], data.rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

eval_set=[(X_test,y_test)]

train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
test_dmatrix = xgb.DMatrix(data=X_test, label=y_test)
val_dmatrix = xgb.DMatrix(data=X_val, label=y_val)

In [367]:
X_train.head()

Unnamed: 0,rating,Fiction percentile,Young Adult percentile,Romance percentile,Contemporary percentile,Fantasy percentile,Adult percentile,Adult Fiction percentile,Mystery percentile,Adventure percentile,...,Childrens > School Stories median,Russian History > Romanovs median,Cultural > Tasmania median,Horticulture > Gardening median,Environment > Sustainability median,Planets > Earth median,reader_avg_publ_year,reader_avg_rating,reader_avg_book_length,reader_avg_rating bias
72,1,0.23252,0.226464,0.063811,0.169328,0.29941,0.228703,0.160633,0.23201,0.287262,...,,,,,,,2017.398496,3.37,4.07255,-0.70255
1069,0,0.494785,0.479084,0.470585,0.543511,0.470928,0.608321,0.606842,0.433862,0.445361,...,,,,,,,2008.549223,3.964824,4.08,-0.115176
2757,0,0.404011,0.37645,0.384757,0.390043,0.408072,0.424516,0.483406,0.489192,0.326564,...,,,,,,,2016.875,3.305,4.00525,-0.70025
869,0,0.295915,0.326615,0.257629,0.303698,0.246745,0.205315,0.185432,0.347345,0.284539,...,,,,,,,2003.881356,3.066667,3.9595,-0.892833
394,1,0.284087,0.343765,0.22645,0.219465,0.4473,0.261251,0.218397,0.322972,0.435157,...,,,,,,,2015.741379,3.055,4.0093,-0.9543


In [None]:
params = ({'n_estimators' = 30000, 'max_depth' = 6, 'objective' = 'binary:logistic', 'eta'=.05,
           'subsample'=.8, 'min_child_weight'= 3, 'colsample_bytree' = .8, 'use_label_encoder'= False})

fit_params = {'eval_set' = eval_set, 'eval_metric' = 'auc', 'early_stopping_rounds' = 50}

#### 10-fold CV, iterating to find best parameters

Tree depth

In [232]:
params = {"objective":"binary:logistic", 'seed': 123}

max_depths = [3, 4, 5, 6, 7]
best_auc = []
best_logloss = []
best_error = []

for curr_val in max_depths:
    params["max_depth"] = curr_val
    cv_results = (xgb.cv(dtrain=data_dmatrix, params=params, nfold=5, early_stopping_rounds=50,
                         num_boost_round=1200,metrics=['error','aucpr','logloss'], seed=123, as_pandas=True))
    best_auc.append(cv_results["test-aucpr-mean"].tail().values[-1])
    best_error.append(cv_results["test-error-mean"].tail().values[-1])
    best_logloss.append(cv_results["test-logloss-mean"].tail().values[-1])

In [233]:
max_depths_df = (pd.DataFrame(list(zip(max_depths, best_auc, best_logloss, best_error)),
                              columns=["max_depth", 'auc', "logloss", 'error']))
max_depths_df

Unnamed: 0,max_depth,auc,logloss,error
0,3,0.671061,0.579673,0.313304
1,4,0.664397,0.584798,0.322262
2,5,0.650999,0.589327,0.317785
3,6,0.653159,0.597488,0.334959
4,7,0.630905,0.606951,0.341679


Learning Rate

In [227]:
#best_auc = []
#etas = []

In [236]:
params = ({'objective':'binary:logistic', 'max_depth': 5, 'seed':123})
new_etas = [.01]
for curr_value in new_etas:
    etas.append(curr_value)
    params["learning_rate"] = curr_value
    cv_results = (xgb.cv(dtrain=data_dmatrix, params=params, nfold=4, early_stopping_rounds=45,
                         num_boost_round=1200,metrics='auc', seed=123, as_pandas=True))
    best_auc.append(cv_results["test-auc-mean"].tail().values[-1])

best_etas = (pd.DataFrame(list(zip(etas, best_auc)),columns=["eta","best_auc"]))
best_etas

Unnamed: 0,eta,best_auc
0,0.05,0.671061
1,0.15,0.664397
2,1.0,0.650999
3,0.1,0.653159
4,0.01,0.630905


Regularization

In [139]:
alphas = []
best_alpha_auc = []

In [140]:
params = ({'objective':'binary:logistic', 'max_depth': 4, 'seed':123, 'eta':.094, })
new_alphas = [.001, .01, .1]
for curr_value in new_alphas:
    alphas.append(curr_value)
    params["learning_rate"] = curr_value
    cv_results = (xgb.cv(dtrain=data_dmatrix, params=params, nfold=4, early_stopping_rounds=45,
                         num_boost_round=1200,metrics='auc', seed=123, as_pandas=True))
    best_alpha_auc.append(cv_results["test-auc-mean"].tail().values[-1])

best_alphas = (pd.DataFrame(list(zip(alphas, best_alpha_auc)),columns=["alpha","best_auc"]))

Randomized Search to narrow down. Metric: Precision.

In [237]:
from sklearn.model_selection import RandomizedSearchCV

In [396]:
gbm_param_grid = {'learning_rate': np.arange(.01,.16, .01), 'max_depth': [4,5], 'alpha': [.01, .05, .1]}

gbm = xgb.XGBClassifier(n_estimators=200, use_label_encoder=False)

randomized_auc = (RandomizedSearchCV(estimator=gbm, param_distributions = gbm_param_grid, n_iter=20,
                                     scoring='precision', cv=10, verbose=1))
randomized_auc.fit(X_train_test, y_train_test)

SyntaxError: invalid syntax (3051578420.py, line 3)

In [357]:
print("Best parameters found: ", randomized_auc.best_params_)
print("Best precision score: ",(randomized_auc.best_score_))

Best parameters found:  {'max_depth': 4, 'learning_rate': 0.06299999999999996, 'alpha': 0.01}
Best precision score:  1.0


In [337]:
y_pred = (randomized_auc.predict_proba(X_val)[:,1]>0.5)
print(precision_score(y_val, y_pred))

1.0


In [360]:
y_pred = (randomized_auc.predict_proba(X_test)[:,1]>0.5)
print(precision_score(y_test, y_test))

1.0


In [345]:
confusion_matrix(y_val, randomized_auc.predict(X_val))

array([[342,   0],
       [  0, 267]])

In [339]:
y_pred = (randomized_auc.predict_proba(X_test)[:,1]>0.5)
print(precision_score(y_test, y_pred)) #I had to see :o

1.0


#### Final Model

# Gridsearch

In [123]:
from sklearn.model_selection import GridSearchCV

In [361]:
gbm_param_grid = ({'learning_rate':[0.013999999999999997, 0.013999999999999997],'alpha':[.1,0.01], 'max_depth': [4]})

gbm = xgb.XGBClassifier(n_estimators=200, objective='binary:logistic', use_label_encoder=False)

grid_gbm = (GridSearchCV(estimator = gbm, param_grid = gbm_param_grid, scoring = 'precision', cv = 10, verbose = 1))
grid_gbm.fit(X_train, y_test)

Fitting 10 folds for each of 4 candidates, totalling 40 fits




GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weigh

In [380]:
grid_gbm.best_params_

{'alpha': 0.1, 'learning_rate': 0.013999999999999997, 'max_depth': 4}

In [350]:
grid_gbm.best_params_

{'alpha': 0.1, 'learning_rate': 0.1, 'max_depth': 5}

In [None]:
grid_gbm.best_params_

In [351]:
grid_gbm.best_score_

0.635766797857315

In [388]:
X_test.shape

(609, 886)

In [389]:
X_train.shape

(1826, 886)

#### Final Model

In [292]:
y_pred = (grid_gbm.predict_proba(X_val)[:,1]>0.9)
print(precision_score(y_val, y_pred))

0.8833333333333333


In [None]:
y_pred = (grid_gbm.predict_proba(X_)[:,1]>0.5)
print(precision_score(y_test, y_pred))

Test regularization

In [None]:
xgb.cv(dtrain=data_dmatrix params=params, nfold=4, early_stopping_rounds=25,
                         num_boost_round=1000,metrics='auc', seed=123, as_pandas=True)

#### Old

In [32]:
# Doesn't really make sense to use colsample_bytree
params={"objective":"binary:logistic","max_depth":4}

colsample_bytree_vals = [.1, .5, .8, 1]
best_rmse = []

for curr_val in colsample_bytree_vals:
    params['colsample_bytree'] = curr_val
    cv_results = (xgb.cv(dtrain=data_dmatrix, params=params, nfold=2, num_boost_round=250,
                         early_stopping_rounds=20,metrics="auc", as_pandas=True, seed=123))
    best_rmse.append(cv_results["test-auc-mean"].tail().values[-1])

print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree","best_auc"]))

   colsample_bytree  best_auc
0               0.1  0.735700
1               0.5  0.737626
2               0.8  0.737779
3               1.0  0.738356


In [None]:
params = {'learning_rate':.1,'alpha':.1, 'max_depth': 5, 'use_label_encoder':False, 'verbose':1,'num_boost_round':3000}

#gbm = (xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, learning_rate = .1, alpha = .1,
#                         , max_depth = 5, verbose = 1, n_estimators=3000)

gbm_cv = (xgb.cv(dtrain=train_dmatrix, params=params, metrics = 'aucpr', nfold = 10, early_stopping_rounds=50))


#xgb.cv(dtrain=data_dmatrix params=params, nfold=4, early_stopping_rounds=25,
#                         num_boost_round=1000,metrics='auc', seed=123, as_pandas=True)

In [None]:
f1_score()