In [None]:
# Explore how n_estimators affects the model
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time 
import datetime


In [None]:
# Load the Boston housing data and unpack features / target.
# NOTE: load_boston was removed in scikit-learn 1.2, so this cell needs an older release.
data = load_boston()
data.data.shape    # (506, 13)
data.target.shape  # (506,)
X, y = data.data, data.target

In [None]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is repeatable.
x_train, x_valid, y_train, y_valid = TTS(
    X,
    y,
    test_size=0.3,
    random_state=420,
)

In [None]:
# Baseline: 100 boosted trees through the sklearn-style wrapper.
reg = XGBR(n_estimators=100)
reg.fit(x_train, y_train)
reg.predict(x_valid)
# score() reports R^2, the same metric the other sklearn regressors return.
reg.score(x_valid, y_valid)


In [None]:
# Validation MSE (author's recorded result: about 7.46, neither good nor bad).
MSE(y_true=y_valid, y_pred=reg.predict(x_valid))
# Tree models expose per-feature importance scores, which can drive embedded
# feature selection (e.g. SelectFromModel).
reg.feature_importances_


In [None]:
# Cross-validation must be given a fresh, unfitted estimator.
reg = XGBR(n_estimators=100)
np.mean(CVS(reg, x_train, y_train, cv=5))
# Which metric is this? With no `scoring` argument the estimator's own score()
# is used, i.e. R^2.
# Should cross-validation see the full data set or only the training split?
# Feeding it everything works but is not rigorous: the test data has then been
# leaked to the model, which has seen every sample.
# The rigorous route is to split off a test set first, cross-validate inside the
# training split, and finally confirm generalisation on the held-out test set.
# Author's view: in practice the difference is small, because the initial split
# may itself be unrepresentative and cross-validation is not perfectly rigorous anyway.


In [None]:
# Same 5-fold CV scored by negative MSE (author's recorded result: about -16).
np.mean(CVS(reg, x_train, y_train, cv=5, scoring="neg_mean_squared_error"))

In [None]:
# 查看sklearn中的全部评价指标
import sklearn
# Iterating the SCORERS mapping yields its keys, i.e. the scorer names.
sorted(sklearn.metrics.SCORERS)

In [None]:
# Compare against a random forest and plain linear regression.
# cv=5 is passed to every call so the R^2 and MSE figures are computed with the
# same number of folds regardless of the library's default.
rfr = RFR(n_estimators=100)
CVS(rfr, x_train, y_train, cv=5).mean()  # author's recorded result: 0.80
CVS(rfr, x_train, y_train, cv=5, scoring="neg_mean_squared_error").mean()  # author's recorded result: -18
lr = LinearR()
CVS(lr, x_train, y_train, cv=5).mean()  # author's recorded result: 0.68
CVS(lr, x_train, y_train, cv=5, scoring="neg_mean_squared_error").mean()  # author's recorded result: -26

In [None]:
# 学习曲线
from sklearn.model_selection import learning_curve
# 输入我的分类器，一次画出所有的学习曲线
def plot_learning_curve(estimator, title, X, y,
                        ax=None,  # axes to draw on; None falls back to the current axes
                        ylim=None,  # optional (low, high) limits for the y axis
                        cv=None,  # cross-validation strategy forwarded to learning_curve
                        n_jobs=None  # parallelism setting forwarded to learning_curve
                        ):
    """Plot mean training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : unfitted estimator handed to ``learning_curve``.
    title : title placed on the axes.
    X, y : feature matrix and target handed to ``learning_curve``.
    ax : axes to draw on. When ``None`` the current pyplot axes are used.
    ylim : optional pair unpacked into ``ax.set_ylim`` so that several plots
        can share the same vertical scale.
    cv, n_jobs : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The axes that were drawn on.
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs)
    # Only fall back to the current axes when the caller supplied none; an axes
    # object passed in by the caller is used as-is.
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("training example")
    ax.set_ylabel("score")
    ax.grid()  # background grid
    # One point per training-set size: the score averaged over the CV folds.
    ax.plot(train_sizes, np.mean(train_scores, axis=1), "o-", color="r", label="training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), "o-", color="g", label="test score")
    ax.legend(loc="best")
    return ax



In [None]:
# 5-fold splitter that shuffles the rows (fixed seed) before cutting the folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
plot_learning_curve(
    estimator=XGBR(n_estimators=100, random_state=420),
    title="XGB",
    X=x_train,
    y=y_train,
    ax=None,
    cv=cv,
)
plt.show()
# Author's observation: the model is usually overfitted here, scoring well on
# the training folds and worse on the test folds. How can that be reduced?


In [None]:
# Learning curve over n_estimators: mean CV R^2 for 10, 60, ..., 960 trees.
axisx = range(10, 1010, 50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    rs.append(CVS(reg, x_train, y_train, cv=cv).mean())
# Tree count with the best mean R^2, and that R^2.
print(axisx[rs.index(max(rs))], max(rs))
plt.figure(figsize=(20, 5))
# The colour shorthand is lowercase `c`; uppercase `C` is not a Line2D property.
plt.plot(axisx, rs, c="red", label="XGB")
plt.legend()
plt.show()
# Author's note: scores are roughly flat from about 30 up to 1000 trees, and
# choosing 660 trees for roughly 500 samples is hard to justify.

In [None]:
# Refined learning curve: track mean R^2, its variance across folds, and a
# combined error term for 50, 100, ..., 1000 trees.
axisx = range(50, 1010, 50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, x_train, y_train, cv=cv)
    # Mean R^2 over the folds; (1 - mean R^2) is used as the bias term.
    rs.append(cvresult.mean())
    # Variance of R^2 over the folds.
    var.append(cvresult.var())
    # Controllable part of the generalisation error: bias^2 + variance.
    # The parentheses matter: the bias term is squared as a whole.
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
# Highest mean R^2 (lowest bias): tree count, R^2, variance at that point.
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])

# Lowest variance: tree count, R^2 at that point, variance.
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))

# Lowest combined error: tree count, R^2, variance, combined error.
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
plt.figure(figsize=(20, 5))
# The colour shorthand is lowercase `c`; uppercase `C` is not a Line2D property.
plt.plot(axisx, rs, c="red", label="XGB")
plt.legend()
plt.show()
# Author's recorded results from an earlier run (re-check after re-running):
# 650 trees gave the lowest bias, 50 trees the lowest variance, and 150 trees
# the smallest generalisation error.




In [None]:
# Zoom in on 100, 110, ..., 290 trees and draw a variance band around the mean R^2.
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, x_train, y_train, cv=cv)
    # Mean R^2 over the folds; (1 - mean R^2) is used as the bias term.
    rs.append(cvresult.mean())
    # Variance of R^2 over the folds.
    var.append(cvresult.var())
    # Controllable part of the generalisation error: bias^2 + variance.
    # The parentheses matter: the bias term is squared as a whole.
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
# Highest mean R^2 (lowest bias): tree count, R^2, variance at that point.
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])

# Lowest variance: tree count, R^2 at that point, variance.
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))

# Lowest combined error: tree count, R^2, variance, combined error.
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
rs = np.array(rs)
# Half-width of the band: the fold variance scaled by 0.01.
var = np.array(var) * .01
plt.figure(figsize=(20, 5))
# The colour shorthand is lowercase `c`; uppercase `C` is not a Line2D property.
plt.plot(axisx, rs, c="black", label="XGB")
# Upper and lower edges of the variance band.
plt.plot(axisx, rs + var, c="red", linestyle="-.")
plt.plot(axisx, rs - var, c="red", linestyle="-.")
plt.legend()
plt.show()

In [None]:
# Check whether the candidate tree counts actually help on the validation
# split: print the R^2 and the elapsed seconds for each one.
for n_trees in (100, 660, 180):
    time0 = time()
    model = XGBR(n_estimators=n_trees, random_state=420)
    print(model.fit(x_train, y_train).score(x_valid, y_valid))
    print(time() - time0)



In [None]:
# Next: parameters specific to xgboost itself, starting with the base learner.
for booster in ("gbtree", "gblinear", "dart"):
    reg = XGBR(n_estimators=100, learning_rate=0.1, random_state=420, booster=booster)
    reg.fit(x_train, y_train)
    print(booster)
    print(reg.score(x_valid, y_valid))

# Author's conclusion: the Boston prices are not linear in the features.

# The `objective` parameter: see the course notes.


In [None]:
# Native xgboost API.
import xgboost as xgb
import pandas as pd
# A DMatrix bundles the feature matrix with its labels; both must be supplied.
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)
# A DMatrix cannot be displayed directly, so look at the raw array instead.
pd.DataFrame(x_train)

In [None]:
# Spell out the booster parameters.
# "reg:linear" is deprecated in favour of "reg:squarederror".
param = dict(silent=False, objective="reg:squarederror", eta=0.1)
num_round = 180  # number of boosting rounds
bst = xgb.train(param, dtrain, num_boost_round=num_round)
bst
preds = bst.predict(dvalid)

In [None]:
from sklearn.metrics import r2_score
preds
r2_score(y_true=y_valid, y_pred=preds)  # author's recorded result: 0.92
MSE(y_true=y_valid, y_pred=preds)  # author's recorded result: 6.87
# Author's remark: the native xgboost API runs much faster than the sklearn wrapper.
# xgboost底层代码比sklearn要快的多

In [None]:
# Author's note: sklearn-style learning curves over gamma fluctuate with no
# clear pattern, so xgboost's built-in cross-validation is used instead.
import xgboost as xgb
# The full data set is used here for convenience.
dfull = xgb.DMatrix(X, y)
# The booster parameter is spelled "objective" (a key named "obj" is not a
# booster parameter), and "reg:squarederror" is the current name of the
# deprecated "reg:linear".
param1 = {"silent":False, "objective":"reg:squarederror", "gamma":0}
num_round = 180
n_fold = 5  # number of CV folds, the counterpart of sklearn's KFold
time1 = time()
cvresult1 = xgb.cv(param1,dfull,num_round,n_fold)
# Elapsed time rendered as minutes:seconds:microseconds.
print(datetime.datetime.fromtimestamp(time()-time1).strftime("%M:%S:%f"))
cvresult1



# With 180 rounds the result has 180 rows and 4 columns: every boosting round
# (i.e. every added tree) is evaluated with cross-validation.








In [None]:
# Train (column 0) and test (column 2) curves of the CV result, one point per round.
plt.figure(figsize=(20, 5))
plt.grid()
plt.plot(range(1, 181), cvresult1.iloc[:, 0], color="yellow", label="train gamma=0")
plt.plot(range(1, 181), cvresult1.iloc[:, 2], color="red", label="test gamma=0")
plt.legend()
plt.show()


# The curve drops first and then flattens out.
# Evaluation metrics built into xgboost:
#   rmse     - regression, root mean squared error
#   mae      - regression, mean absolute error
#   logloss  - binary classification, log loss
#   mlogloss - multi-class classification, log loss
#   error    - classification error, equal to 1 - accuracy
#   auc      - classification, area under the ROC curve
# Example: add "eval_metric":"mae" to the parameter dict passed to xgb.cv.


In [None]:
# Start tuning: compare gamma=0 against gamma=20 with xgboost's own CV.
dfull = xgb.DMatrix(X, y)
# The booster parameter is spelled "objective" (a key named "obj" is not a
# booster parameter), and "reg:squarederror" is the current name of the
# deprecated "reg:linear".
param1 = {"silent":False, "objective":"reg:squarederror", "gamma":0}
param2 = {"silent":False, "objective":"reg:squarederror", "gamma":20}
num_round = 180
n_fold = 5  # number of CV folds, the counterpart of sklearn's KFold

time1 = time()
cvresult1 = xgb.cv(param1,dfull,num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time1).strftime("%M:%S:%f"))

time1 = time()
cvresult2 = xgb.cv(param2,dfull,num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time1).strftime("%M:%S:%f"))

# Column 0 is the train curve and column 2 the test curve of each CV result.
plt.figure(figsize=(20, 5))
plt.grid()
plt.plot(range(1, 181),cvresult1.iloc[:, 0],c="yellow",label="train gamma=0")
plt.plot(range(1, 181),cvresult1.iloc[:, 2],c="red",label="test gamma=0")
plt.plot(range(1, 181),cvresult2.iloc[:, 0],c="blue",label="train gamma=20")
plt.plot(range(1, 181),cvresult2.iloc[:, 2],c="orange",label="test gamma=20")
plt.legend()
plt.show()
# Author's observations:
# - The pruning acts like post-pruning: the test curves are almost identical,
#   while the training curve for gamma=20 is worse than for gamma=0.
# - gamma fights overfitting by restraining what is learned on the training
#   set, so it lowers the training score; it does not necessarily improve the
#   test score, but it narrows the gap between the two.
# - Most pruning parameters work the same way, by limiting learning on the
#   training set.
# - The same procedure applies to classification; adapt it as needed.
# - This CV curve is preferred over the sklearn learning curve because it is fast.


In [None]:
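# Tuning the pruning parameters.
# Parameter names are the same for xgb.train and xgb.XGBRegressor:
# max_depth          maximum tree depth, default 6
# colsample_bytree   default 1; fraction of features sampled for each tree
# colsample_bylevel  default 1; fraction of features sampled for each level of a tree
# colsample_bynode   default 1; fraction of features sampled for each node (split)
# min_child_weight   default 1; minimum sum of h_i (second-order derivatives) required
#                    in a leaf, which plays a role similar to a sample weight
# max_depth is the most commonly used pruning parameter.
# gamma and max_depth largely overlap; tuning one of them is usually enough.
# Unlike plain boosted trees, which only subsample rows, xgboost can also subsample
# features; the author notes that feature sampling has been shown to help even more.
# Let's experiment!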

In [None]:
dfull = xgb.DMatrix(X, y)
# Reference run with every parameter at its default value.
# The fold count is an argument of xgb.cv, not a booster parameter, so it is
# passed to xgb.cv below instead of being placed in this dict (where it would
# be ignored and xgb.cv would fall back to its own default fold count).
# The objective key is spelled "objective"; "reg:squarederror" is the current
# name of the deprecated "reg:linear".
param1 = {"silent": True,
          "objective": "reg:squarederror",
          "subsample": 1,
          "max_depth": 6,
          "eta": 0.3,
          "gamma": 0,
          "lambda": 1,
          "alpha": 0,
          "colsample_bytree": 1,
          "colsample_bylevel": 1,
          "colsample_bynode": 1}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, nfold=5)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig, ax = plt.subplots(1, figsize=(15,10))

ax.set_ylim(top=5)

ax.grid()
# Column 0 is the train curve and column 2 the test curve of each CV result.
ax.plot(range(1, 201), cvresult1.iloc[:, 0],c="red",label="train original")
ax.plot(range(1, 201), cvresult1.iloc[:, 2],c="orange",label="test original")

# param2 holds the best setting found so far, param3 the candidate being tried.
param2 = {"silent": True,
          "objective": "reg:squarederror"}
param3 = {"silent": True,
          "objective": "reg:squarederror"}
cvresult2 = xgb.cv(param2, dfull, num_round, nfold=5)
cvresult3 = xgb.cv(param3, dfull, num_round, nfold=5)
# Each run gets its own colour pair so the six curves can be told apart.
ax.plot(range(1, 201), cvresult2.iloc[:, 0],c="green",label="train last")
ax.plot(range(1, 201), cvresult2.iloc[:, 2],c="blue",label="test last")
ax.plot(range(1, 201), cvresult3.iloc[:, 0],c="gray",label="train this")
ax.plot(range(1, 201), cvresult3.iloc[:, 2],c="black",label="test this")
ax.legend(fontsize="xx-large")  # legend font size
plt.show()

# Tuning workflow (author's notes):
# - Add one parameter at a time to param2. If the model overfits, lower
#   max_depth; max_depth=2 did somewhat better than 3 (the new candidate goes
#   into param3).
# - Generalisation is judged by the test curve. For a single parameter the gap
#   between train and test matters most: once overfitting is reduced, the test
#   result improves while the training result gets worse.
# - Whatever is changed, the test result should not get worse; max_depth=2 was
#   the final choice.
# - Next tune eta: param2 keeps the previous best (max_depth=2) and param3 adds
#   eta=0.1, and so on. Then tune gamma.
# - This is a greedy tuning procedure, and the author recommends it.

In [None]:
###### Basic way to persist a model
# Save and reload the model with pickle.
import os
import pickle
import sys
dtrain = xgb.DMatrix(x_train, y_train)
# Booster parameters. The objective key is spelled "objective", and a fold
# count does not belong here because xgb.train performs no cross-validation.
param = {"silent": True,
         "objective": "reg:squarederror",
         "subsample": 1,
         "max_depth": 6,
         "eta": 0.3,
         "gamma": 0,
         "lambda": 1,
         "alpha": 0,
         "colsample_bytree": 1,
         "colsample_bylevel": 1,
         "colsample_bynode": 1}
num_round = 200
# Train on the training split only, so the validation split used to score the
# reloaded model has not been seen during training.
bst = xgb.train(param, dtrain, num_round)

# Save the model; the context manager closes the file handle once the dump is done.
with open("xgboostonboston.dat", "wb") as model_file:
    pickle.dump(bst, model_file)
# Note: "w" and "r" are the usual modes for text files. A pickled model is
# binary data, so use "wb" to write it and "rb" to read it back.

# The relative file name above is resolved against the current working
# directory, so that is where the model file was written.
os.getcwd()





In [None]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb
# Rebuild the same train/validation split that was used when the model was trained.
data = load_boston()
X = data.data
y = data.target
x_train, x_valid, y_train, y_valid = TTS(X, y, test_size=0.3,random_state=420)


# A model built with the native xgboost API must be fed xgboost's own data
# type, so the validation data is wrapped in a DMatrix.
dtest = xgb.DMatrix(x_valid, y_valid)
# Load the model; the context manager closes the file handle afterwards.
# NOTE: unpickling executes code stored in the file, so only load files you trust.
with open("xgboostonboston.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)
print("loaded model from:xgboostonboston.dat")

# Predict on the validation split and report the MSE.
ypreds = loaded_model.predict(dtest)

MSE(y_valid, ypreds)






In [None]:
# The same save / load round trip, this time with joblib.
import joblib
bst = xgb.train(param, dtrain, num_boost_round=num_round)
joblib.dump(bst, "xgboost-boston.dat")
loaded_model = joblib.load("xgboost-boston.dat")
# Score the reloaded booster on the validation DMatrix.
ypreds = loaded_model.predict(dtest)
MSE(y_true=y_valid, y_pred=ypreds)
r2_score(y_true=y_valid, y_pred=ypreds)



In [None]:
# The same round trip with the sklearn-style wrapper.
from xgboost import XGBRegressor as XGBR

# Fit before dumping so that the file on disk holds a trained model; what is
# persisted should always be the trained model.
bst = XGBR(n_estimators=200).fit(x_train, y_train)
joblib.dump(bst, "xgboost-boston.dat")
loaded_model = joblib.load("xgboost-boston.dat")

# The sklearn wrapper takes plain numpy arrays directly; no DMatrix is needed.
ypreds = loaded_model.predict(x_valid)



In [None]:
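# Class imbalance in xgboost (classification problems)
# scale_pos_weight controls the weight of the positive class; it is typically set
# to the ratio (number of negative samples) / (number of positive samples)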
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc 

In [None]:
# Build an imbalanced two-blob data set: 500 points around the first centre
# and 50 around the second.
class_1 = 500
class_2 = 50
centers = [[0.0, 0.0], [2.0, 2.0]]
clusters_std = [1.5, 0.5]
X, y = make_blobs(
    n_samples=[class_1, class_2],
    centers=centers,
    cluster_std=clusters_std,
    random_state=0,
    shuffle=False,
)

x_train, x_valid, y_train, y_valid = TTS(X, y, test_size=0.3, random_state=420)
# Share of samples labelled 1.
(y == 1).sum() / y.shape[0]


clf = XGBC()
clf.fit(x_train, y_train)
# Alternative: clf = XGBC(scale_pos_weight=10).fit(x_train, y_train)
# Author's note: a learning curve over scale_pos_weight suggested about 20 works better.
ypred = clf.predict(x_valid)
clf.score(x_valid, y_valid)  # accuracy by default
cm(y_valid, ypred, labels=[1, 0])
recall(y_valid, ypred)
auc(y_valid, clf.predict_proba(x_valid)[:, 1])



In [None]:
# The same experiment with the native xgboost API.
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_valid, y_valid)
# The key must be spelled "objective"; with any other key the setting is not
# applied and the booster would not be trained as a binary classifier.
param = {"objective":"binary:logistic", "scale_pos_weight":1}
num_round = 100

bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)

preds
# With binary:logistic the predictions are class probabilities.


# Apply our own decision threshold to the probabilities predicted above.
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred[ypred != 1] = 0

# Exercise: print how the metrics change as scale_pos_weight changes, e.g.
# for i in zip(names, scale_pos_weight):
# The best setting is found by tuning either the threshold or scale_pos_weight.

# Other parameters and usage (author's notes):
# n_jobs        number of threads to use
# base_score    prior used as the initial prediction; in classification it
#               relates to the share of positive samples, and 0.5 is the usual
#               value for regression
# random_state  seed controlling the randomness of tree construction
# missing       the value to treat as missing; missing values are handled automatically
# With sparse matrices, missing entries can simply be treated as 0.






