In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy, math
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
import xgboost as xgb
from xgboost import XGBRegressor
import pickle
import time

df = pd.read_csv('trial_constructed.csv')
# Prepare Data
X = df.drop(columns = ['DSL'])
y = df['DSL']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.1, random_state=200)
# # 转化为数组
X_train = np.array(X_train)
y_train = np.array(y_train)

In [26]:
from hyperopt import fmin, tpe, hp,space_eval,rand,Trials,partial,STATUS_OK
def score(params):
    model = XGBRegressor(booster = 'gbtree',
                     learning_rate = 0.12, 
                     n_estimators = 2000, 
                     max_depth = 7, 
                     eta = 0.2, 
                     seed = 8, 
                     subsample = 1, 
                     colsample_bytree = 0.345, 
                     alpha = 0.33,
                     objective = 'reg:squarederror')
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    return score

def optimize(random_state):
    space = {
#              通用参数
             "booster" : 'gbtree', 
             "silent" : 0, # nthread 不管
#              学习目标参数
             "objective" : 'reg:squarederror', 
             "eval_metric" : 'rmse', 
             "seed" : random_state,  
             "learning_rate" : 0.12,  # hp.quniform("learning_rate", 0.01, 0.5, 0.001), 
#              booster参数
             "eta" : hp.quniform('eta', 0.025, 0.5, 0.05),  #0.15
             "min_child_weight" : hp.quniform('min_child_weight', 2, 10, 0.05), 
             "n_estimators" : hp.quniform('n_estimators', 50, 150, 0.05),  # 50,
             "max_depth" :  hp.quniform('max_depth', 3, 50, 0.05),  # 7, 
             "subsample" : hp.quniform('subsample', 0.2, 0.8, 0.05),  # 0.4
             'gamma': hp.quniform('gamma', 0.2, 0.8, 0.01),  # 0.65,  
             "colsample_bytree" : hp.quniform('colsample_bytree', 0.1, 1, 0.05),  # 0.5
             "alpha" : hp.quniform('alpha', 0.2, 0.8, 0.01)  # 0.5 
          }
    best = fmin(score, space, algo = tpe.suggest, max_evals = 400)
    return best

best_params = optimize(200)
print(best_params)

100%|█████| 400/400 [1:09:30<00:00, 10.43s/trial, best loss: 0.9683474256065063]
{'alpha': 0.35000000000000003, 'colsample_bytree': 0.8500000000000001, 'eta': 0.45, 'gamma': 0.67, 'max_depth': 8.85, 'min_child_weight': 3.0500000000000003, 'n_estimators': 132.70000000000002, 'subsample': 0.45}


In [34]:
best_model = XGBRegressor(gamma= 0.67, 
                          max_depth= 9, 
                          min_child_weight= 3.05, 
                          n_estimators= 133, 
                          subsample= 0.45, 
                         objective= 'reg:squarederror', 
                         eval_metric = 'rmse', 
                         seed = 200,  
                         learning_rate = 0.12,
                         eta=0.45,
                          colsample_bytree =0.85, 
                         alpha = 0.35000000000000003
                         )

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X)

def metrics_sklearn(y_valid, y_pred_):
    """模型对验证集和测试集结果的评分"""
    EVS = explained_variance_score(y_valid, y_pred_)
    print("EVS: ", EVS)
    
    R2 = r2_score(y_valid, y_pred_)
    print("R2: ", R2)
    
    MSE = mean_squared_error(y_valid, y_pred_)
    print("MSE: ", MSE)
    
    MAPE=(abs(y_pred -y_valid)/ y_valid).mean()
    print("MAPE: ", MAPE)
    


metrics_sklearn(y, y_pred)

EVS:  0.9367880367602687
R2:  0.9367880330041469
MSE:  0.15080811820819096
MAPE:  0.16307094049694804


### model = XGBRegressor(booster = 'gbtree',  # ✅
                     learning_rate = 0.12, 
                     n_estimators = 2000, 
                     max_depth = 7, 
                     eta = 0.2, 
#                      min_child_weigh = 0.1,  # 这个参数没被用到 暂且忽略
                     seed = 8,  # ✅
                     subsample = 1, 
                     colsample_bytree = 0.345, 
#                      colsample_byleve = 0.5,  # 这个参数没被用到 暂且忽略
                     alpha = 0.33,
                     objective = 'reg:squarederror', 
#                      eval_metricm = 'rsmse',  # 这个参数没被用到 暂且忽略
                    )
model.fit(X_train, y_train)

eval_set = [(X_test, y_test)]

y_pred = model.predict(X_test)
X_test.columns = X_test.columns.astype(str)
print(model.score(X_test, y_test))