In [12]:
from xgboost import XGBRegressor
import json
import numpy as np
from sklearn.model_selection import cross_validate

In [7]:
# read data
training_x = list()
with open ("../training_X_with_text_features.json",'r') as data:
    for d in data:
        training_x = json.loads(d)
        
# read data
training_y = list()
with open ("../training_Y_with_text_features.json",'r') as data:
    for d in data:
        training_y = json.loads(d)
        
training_x = np.array(training_x)
training_y = np.array(training_y)

print(training_x.shape)
print(training_y.shape)

(1237673, 20)
(1237673,)


In [21]:
# select 20% of the data to do CV to tune parameters
part_training_x = training_x[:round(0.2*(training_x.shape[0])),:]
part_training_y = training_y[:round(0.2*(training_x.shape[0]))]

In [11]:
# we do not do feature selection for xgboost
# because xgboost itself will find important/unimportant features
# we set threshold as a parameter to select features

In [31]:
# we use grid search to tune the hyperparameters
# hyper parameters list

max_depth_list = [2,4,6,8,10]
n_estimators_list = [*range(100, 600, 100)]
learning_rate_list = [0.05, 0.01]
colsample_bytree_list = [0.6,0.8,1]

In [34]:
results_dict = dict()
for a in max_depth_list:
    for b in n_estimators_list:
        for c in learning_rate_list:
            for d in colsample_bytree_list:
                
                xgboost = XGBRegressor(objective='reg:squarederror', booster='gblinear', n_jobs=-1, 
                                       reg_alpha = 0.5, reg_lambda=0.5, radom_state=2020,
                                       max_depth=a, n_estimators=b, learning_rate=c, colsample_bytree=d)

                cv_results = cross_validate(xgboost, part_training_x, part_training_y, cv=3, scoring=('neg_mean_squared_error'))
                results = -cv_results['test_score']
                key = "{},{},{},{}".format(a,b,c,d)
                results_dict[key] = round(np.mean(results),5)
                print(round(np.mean(results),3), "{},{},{},{}".format(a,b,c,d))

1.159 2,100,0.05,0.6
1.159 2,100,0.05,0.8
1.159 2,100,0.05,1
1.188 2,100,0.01,0.6
1.188 2,100,0.01,0.8
1.188 2,100,0.01,1
1.159 2,200,0.05,0.6
1.159 2,200,0.05,0.8
1.159 2,200,0.05,1
1.173 2,200,0.01,0.6
1.173 2,200,0.01,0.8
1.173 2,200,0.01,1
1.163 2,300,0.05,0.6
1.163 2,300,0.05,0.8
1.163 2,300,0.05,1
1.166 2,300,0.01,0.6
1.166 2,300,0.01,0.8
1.166 2,300,0.01,1
1.169 2,400,0.05,0.6
1.169 2,400,0.05,0.8
1.169 2,400,0.05,1
1.163 2,400,0.01,0.6
1.163 2,400,0.01,0.8
1.163 2,400,0.01,1
1.177 2,500,0.05,0.6
1.177 2,500,0.05,0.8
1.177 2,500,0.05,1
1.161 2,500,0.01,0.6
1.161 2,500,0.01,0.8
1.161 2,500,0.01,1
1.159 4,100,0.05,0.6
1.159 4,100,0.05,0.8
1.159 4,100,0.05,1
1.188 4,100,0.01,0.6
1.188 4,100,0.01,0.8
1.188 4,100,0.01,1
1.159 4,200,0.05,0.6
1.159 4,200,0.05,0.8
1.159 4,200,0.05,1
1.173 4,200,0.01,0.6
1.173 4,200,0.01,0.8
1.173 4,200,0.01,1
1.163 4,300,0.05,0.6
1.163 4,300,0.05,0.8
1.163 4,300,0.05,1
1.166 4,300,0.01,0.6
1.166 4,300,0.01,0.8
1.166 4,300,0.01,1
1.169 4,400,0.05,0.6
1.1

In [42]:
sorted_results = sorted(results_dict.items(),key = lambda x:x[0], reverse = False)

In [47]:
sorted_results[:10]

[(1.1587, '10,200,0.05,1'),
 (1.15901, '10,100,0.05,1'),
 (1.16089, '10,500,0.01,1'),
 (1.16271, '10,400,0.01,1'),
 (1.16318, '10,300,0.05,1'),
 (1.16607, '10,300,0.01,1'),
 (1.16939, '10,400,0.05,1'),
 (1.17293, '10,200,0.01,1'),
 (1.17687, '10,500,0.05,1'),
 (1.18777, '10,100,0.01,1')]

In [54]:
# it shows that max_depth should be = 10, colsample_bytree_list should be = 1
# learning rate seems does not matter
# tree number = 200 is locally optimial

In [56]:
# re-tune
max_depth_list = [8,10,12,14,20] # add 20 to see whether the deeper the tree is, the better the result is
n_estimators_list = [150,200,250,300,350,400]
learning_rate_list = [0.01,0.05] # 0.05 and 0.01 do not differ too much
colsample_bytree_list = [1] # should be 1

In [None]:
results_dict = dict()
for a in max_depth_list:
    for b in n_estimators_list:
        for c in learning_rate_list:
            for d in colsample_bytree_list:
                
                xgboost = XGBRegressor(objective='reg:squarederror', booster='gblinear', n_jobs=-1, 
                                       reg_alpha = 0.5, reg_lambda=0.5, radom_state=2020,
                                       max_depth=a, n_estimators=b, learning_rate=c, colsample_bytree=d)

                cv_results = cross_validate(xgboost, part_training_x, part_training_y, cv=3, scoring=('neg_mean_squared_error'))
                results = -cv_results['test_score']
                key = "{},{},{},{}".format(a,b,c,d)
                results_dict[key] = round(np.mean(results),5)
                print(round(np.mean(results),3), "{},{},{},{}".format(a,b,c,d))

1.179 8,150,0.01,1
1.158 8,150,0.05,1
1.173 8,200,0.01,1
1.159 8,200,0.05,1
1.169 8,250,0.01,1
1.161 8,250,0.05,1
1.166 8,300,0.01,1
1.163 8,300,0.05,1
1.164 8,350,0.01,1
1.166 8,350,0.05,1
1.163 8,400,0.01,1
1.169 8,400,0.05,1
1.179 10,150,0.01,1
1.158 10,150,0.05,1
1.173 10,200,0.01,1
1.159 10,200,0.05,1
1.169 10,250,0.01,1
1.161 10,250,0.05,1
1.166 10,300,0.01,1
1.163 10,300,0.05,1
1.164 10,350,0.01,1
1.166 10,350,0.05,1
1.163 10,400,0.01,1
1.169 10,400,0.05,1
1.179 12,150,0.01,1
1.158 12,150,0.05,1
1.173 12,200,0.01,1
1.159 12,200,0.05,1
1.169 12,250,0.01,1
1.161 12,250,0.05,1
1.166 12,300,0.01,1
1.163 12,300,0.05,1
1.164 12,350,0.01,1
1.166 12,350,0.05,1
1.163 12,400,0.01,1
1.169 12,400,0.05,1
1.179 14,150,0.01,1
1.158 14,150,0.05,1
1.173 14,200,0.01,1
1.159 14,200,0.05,1
1.169 14,250,0.01,1


In [None]:
sorted_results = sorted(results_dict.items(),key = lambda x:x[1], reverse = False)
sorted_results[:10]

In [None]:
# read data
testing_x = list()
with open ("../testing_X_with_text_features.json",'r') as data:
    for d in data:
        testing_x = json.loads(d)
        
# read data
testing_y = list()
with open ("../testing_Y_with_text_features.json",'r') as data:
    for d in data:
        testing_y = json.loads(d)
        
testing_x = np.array(testing_x)
testing_y = np.array(testing_y)

print(testing_x.shape)
print(testing_y.shape)