In [18]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostRegressor

In [2]:
dataset = pd.read_csv('ML_training&testing_v01shuffled_20220317.csv')

In [3]:
X = dataset.iloc[:, :-1].values.astype(float)
y = dataset.iloc[:, -1].values.astype(float)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0,shuffle=True)

In [10]:
%%time
scoring = {'r': make_scorer(lambda x,y: np.corrcoef(x, y)[0, 1]),
           'r2': 'r2',
           'rmse_neg': 'neg_root_mean_squared_error'}

regressor = xgb.XGBRegressor(learning_rate=0.1,n_estimators=800,max_depth=10,min_child_weight=1,seed=0,
subsample= 0.8, colsample_bytree= 0.9, gamma= 0, reg_alpha= 0.05, reg_lambda= 0.1)

scores = cross_validate(regressor, X_train, y_train, scoring=scoring,
                         cv=5, return_train_score=True)

CPU times: user 2h 25min 45s, sys: 2min 15s, total: 2h 28min 1s
Wall time: 27min 41s


In [11]:
scores['test_r2'].mean(), scores['test_r2'].std()

(0.908100282614677, 0.0008103944226280174)

In [12]:
scores['test_r'].mean(), scores['test_r'].std()

(0.9532703232686893, 0.0004406202326859871)

In [13]:
scores['test_rmse_neg'].mean(), scores['test_rmse_neg'].std()

(-0.03374899762040335, 0.00010799526583814949)

In [19]:
%%time
scoring = {'r': make_scorer(lambda x,y: np.corrcoef(x, y)[0, 1]),
           'r2': 'r2',
           'rmse_neg': 'neg_root_mean_squared_error'}

regressor = AdaBoostRegressor(n_estimators=30,learning_rate=0.2)

scores = cross_validate(regressor, X_train, y_train, scoring=scoring,
                         cv=5, return_train_score=True)

CPU times: user 4min 17s, sys: 218 ms, total: 4min 17s
Wall time: 4min 19s


In [20]:
scores['test_r2'].mean(), scores['test_r2'].std()

(0.38007484291879273, 0.0037144177795472256)

In [21]:
scores['test_r'].mean(), scores['test_r'].std()

(0.6403247674555999, 0.003184983915496707)

In [22]:
scores['test_rmse_neg'].mean(), scores['test_rmse_neg'].std()

(-0.08765494710350161, 0.00015075505379607543)

In [18]:
%%time
cv_params = {'n_estimators': [400, 500, 600, 700, 800]}
other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
evaluate_result:{'mean_fit_time': array([174.47059374, 215.78626542, 258.8593482 , 295.9017159 ,
       295.02599711]), 'std_fit_time': array([ 2.34249839,  2.24922608,  2.21441193,  0.4988847 , 67.84198006]), 'mean_score_time': array([3.17593322, 0.46788936, 0.56297545, 0.66914606, 0.53780804]), 'std_score_time': array([1.39173924, 0.02893735, 0.045661  , 0.00964764, 0.14579523]), 'param_n_estimators': masked_array(data=[400, 500, 600, 700, 800],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 400}, {'n_estimators': 500}, {'n_estimators': 600}, {'n_estimators': 700}, {'n_estimators': 800}], 'split0_test_score': array([0.78812302, 0.8013698 , 0.81194738, 0.81955279, 0.8262796 ]), 'split1_test_score': array([0.78737557, 0.80206346, 0.81252847, 0.82062925, 0.82766354]), 'split2_test_score': array([0.79069823, 0.80314561, 0.81383336, 0.82151432, 

In [8]:
%%time
cv_params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6]}
other_params = {'learning_rate': 0.1, 'n_estimators': 800, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
evaluate_result:{'mean_fit_time': array([195.3110312 , 195.01114993, 192.95153985, 197.93856416,
       194.59731455, 194.85558133, 268.83795061, 266.81230855,
       263.69005132, 264.50050454, 262.87595844, 264.10332828,
       346.01773963, 342.28062954, 343.07502084, 341.59097018,
       341.06043811, 342.68316112, 437.46394467, 436.07633405,
       436.37340693, 434.77769675, 434.05905623, 430.37027988,
       529.63023114, 529.79420896, 530.01525736, 531.553297  ,
       530.69241991, 530.00302939, 633.33065228, 630.36922698,
       628.76131606, 626.69009814, 625.75123386, 627.02690129,
       727.48487506, 744.53311458, 737.98698373, 746.696668  ,
       748.5668694 , 743.12023559, 841.97388535, 849.62471504,
       862.76285563, 857.59481039, 854.14229846, 759.03053923]), 'std_fit_time': array([ 0.77586504,  3.29757951,  1.84492992,  1.94440273,  1.43534026,
        2.25902696,  2.11348199,  1.20775201,  2.13026761,

In [12]:
%%time
cv_params = {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
other_params = {'learning_rate': 0.1, 'n_estimators': 800, 'max_depth': 10, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
evaluate_result:{'mean_fit_time': array([834.20582318, 840.0803277 , 840.13491845, 832.78077598,
       799.09371943, 666.71996121]), 'std_fit_time': array([  6.78669807,   5.36679596,   7.21235411,   7.36276416,
        12.35615122, 161.19475122]), 'mean_score_time': array([1.30405364, 3.03747172, 0.39238124, 1.19151778, 0.32172174,
       0.22241144]), 'std_score_time': array([1.71406574, 2.0675925 , 0.05669072, 1.67747082, 0.03790409,
       0.06033702]), 'param_gamma': masked_array(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'gamma': 0.1}, {'gamma': 0.2}, {'gamma': 0.3}, {'gamma': 0.4}, {'gamma': 0.5}, {'gamma': 0.6}], 'split0_test_score': array([0.84487918, 0.81956313, 0.80347389, 0.79017627, 0.77775908,
       0.76688937]), 'split1_test_score': array([0.84545913, 0.81926841, 0.80327296, 0.78784352, 0.77833

In [13]:
%%time
cv_params = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
other_params = {'learning_rate': 0.1, 'n_estimators': 800, 'max_depth': 10, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
evaluate_result:{'mean_fit_time': array([669.81272564, 679.48470135, 684.83110919, 686.23252354,
       746.79390645, 747.8973084 , 748.99171615, 757.34608078,
       820.05911074, 831.81295886, 835.26497188, 842.02583165,
       905.43070564, 916.36027265, 924.17239633, 917.11101918]), 'std_fit_time': array([ 1.37138034,  1.65669715,  1.43305633,  2.68244684,  5.83303192,
        7.35452802,  1.31304925,  1.34956008,  1.38212052,  4.57182773,
        0.24297516,  2.05935417,  1.37536743,  2.0229815 ,  1.66410734,
       18.1864936 ]), 'mean_score_time': array([5.77393942, 2.7194375 , 2.45980887, 2.26659584, 2.45935998,
       2.4815486 , 2.30946836, 2.27465596, 2.55748935, 2.39218082,
       2.37468758, 2.35758972, 2.64277697, 2.50050726, 2.51507506,
       1.77851081]), 'std_score_time': array([1.81370744, 0.53454183, 0.19528951, 0.09412631, 0.12307982,
       0.06746561, 0.08992808, 0.07165477, 0.06811205, 0.06033808,
    

In [14]:
%%time
cv_params = {'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]}
other_params = {'learning_rate': 0.1, 'n_estimators': 800, 'max_depth': 10, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.9, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
evaluate_result:{'mean_fit_time': array([963.78407011, 962.91913238, 952.37100286, 944.07602506,
       929.94381733, 937.08642583, 936.67912221, 941.68685112,
       937.74370127, 942.30425439, 955.16060529, 959.46948643,
       958.87249165, 960.1522552 , 959.89711213, 945.52697921,
       952.02182794, 954.26243639, 944.13528347, 949.74453073,
       929.67047195, 926.65196261, 923.84274578, 921.69575057,
       792.75609674]), 'std_fit_time': array([  2.15347271,   4.2326034 ,  10.26830933,  12.10382958,
         2.11086313,   1.17873668,   1.28368821,   8.69360446,
         1.44710432,   7.94982955,   6.00302744,   4.45533568,
         3.07943542,   7.22959189,   6.52450221,   4.01540604,
         5.54177902,   6.38350128,   3.56355452,   6.0052617 ,
         8.29385502,   7.98035999,   2.02194728,   1.93831886,
       247.53812565]), 'mean_score_time': array([5.75306993, 2.64457874, 2.78064542, 2.61437964, 2.66133232,


In [15]:
%%time
cv_params = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2]}
other_params = {'learning_rate': 0.1, 'n_estimators': 800, 'max_depth': 10, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.9, 'gamma': 0, 'reg_alpha': 0.05, 'reg_lambda': 0.1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
evaluate_result:{'mean_fit_time': array([936.41231818, 947.69680576, 937.09545789, 931.75016794,
       808.54696078]), 'std_fit_time': array([  5.15337111,  12.52292384,  10.12724183,   7.56702251,
       249.19374259]), 'mean_score_time': array([7.54961801, 3.51073976, 2.76166825, 2.57214832, 2.0479569 ]), 'std_score_time': array([1.93637684, 0.55853819, 0.59567274, 0.10288739, 0.66090822]), 'param_learning_rate': masked_array(data=[0.01, 0.05, 0.07, 0.1, 0.2],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'learning_rate': 0.01}, {'learning_rate': 0.05}, {'learning_rate': 0.07}, {'learning_rate': 0.1}, {'learning_rate': 0.2}], 'split0_test_score': array([0.8369078 , 0.9007358 , 0.90488363, 0.90712264, 0.90519248]), 'split1_test_score': array([0.83640814, 0.90067693, 0.90531976, 0.90814674, 0.90359468]), 'split2_test_score': array([0.83968509, 0.90172364, 

In [16]:
%%time
cv_params = {'n_estimators': [650, 725, 800, 875, 950]}
other_params = {'learning_rate': 0.1, 'n_estimators': 800, 'max_depth': 10, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.9, 'gamma': 0, 'reg_alpha': 0.05, 'reg_lambda': 0.1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_
print('evaluate_result:{0}'.format(evalute_result))
print('best_param：{0}'.format(optimized_GBM.best_params_))
print('best_score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
evaluate_result:{'mean_fit_time': array([ 777.93094192,  865.19423723,  945.69795704, 1035.59795475,
        983.19041643]), 'std_fit_time': array([  1.97437102,   9.64126279,   8.88739014,   9.58400148,
       248.72566499]), 'mean_score_time': array([5.43432469, 2.35194683, 3.71925111, 5.68573766, 2.43204679]), 'std_score_time': array([1.63712581, 0.15703752, 1.91956085, 2.2098727 , 0.77381487]), 'param_n_estimators': masked_array(data=[650, 725, 800, 875, 950],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 650}, {'n_estimators': 725}, {'n_estimators': 800}, {'n_estimators': 875}, {'n_estimators': 950}], 'split0_test_score': array([0.90406088, 0.90582005, 0.90712264, 0.90838759, 0.9094504 ]), 'split1_test_score': array([0.90495751, 0.90649239, 0.90814674, 0.90941019, 0.91037554]), 'split2_test_score': array([0.90623277, 0.90793044, 0.90953

In [9]:
xgb_reg = xgb.XGBRegressor('learning_rate': 0.1, 'n_estimators': 400, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1)

In [10]:
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [11]:
y_pred = xgb_reg.predict(X_test)

In [12]:
r2score= r2_score(y_test, y_pred)
print('R2score is',r2score )
MAE=mean_absolute_error(y_test, y_pred)
print('MAE is',MAE)
MSE=mean_squared_error(y_test, y_pred)
print('MSE is',MSE)

R2score is 0.8318714674220904
MAE is 0.020981251245801788
MSE is 0.0008497341675312962
