In [1]:
# import 
import pickle
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,roc_curve
import warnings
warnings.filterwarnings('ignore')

!pwd
!ls -a ./raw

/Users/hobee/PythonScript/CCF-BDCI-2022-FXFZ
[34m.[m[m                    dataA.csv            dataTrain.csv
[34m..[m[m                   dataNoLabel.csv      submit_example_A.csv


In [2]:
# load the data
semi_data = pd.read_csv("./cache/semi_label_data.csv")
raw_data = pd.read_csv("./cache/label_data.csv")
label_data = pd.read_csv("./cache/label.csv")

data = semi_data.copy()

In [3]:
# load the model
model_file_path = "./models/vote_model.pkl"
model = pickle.load(open(model_file_path, "rb"))

In [4]:
def train_model(x,y, model="PlaceHolder"):
    x_train,x_test,y_train,y_test = train_test_split(x, y, test_size = 0.3,random_state = 33)
    ### 训练模型
    if model == "PlaceHolder":
        model = xgb.XGBClassifier(learning_rate=0.1,
                      n_estimators=100,            # 树的个数--100棵树建立xgboost
                      max_depth=6,                 # 树的深度
                      min_child_weight = 1,        # 叶子节点最小权重
                      gamma=0.,                    # 惩罚项中叶子结点个数前的参数
                      subsample=0.8,               # 随机选择80%样本建立决策树
                      # colsample_btree=0.8,         # 随机选择80%特征建立决策树
                      objective='binary:logistic', # 指定损失函数
                      scale_pos_weight=1,          # 解决样本个数不平衡的问题
                      random_state=27              # 随机数
                      )

    # 拟合
    # model.fit(x_train, y_train, eval_set = [(x_test,y_test)], eval_metric = "auc", early_stopping_rounds = 10,verbose = True)
    model.fit(x_train, y_train)
    
    return model,x_train,x_test,y_train,y_test

In [5]:
# predict
pred_prob_A = model.predict_proba(data)

In [6]:
# 思路 循环训练 每次将label为概率大于90%的加入训练集重新训练 直到结果无法提升
scope = 0.95
last_auc = 0
epoch = 1
print(data.shape)
print(raw_data.shape)
while True:
    pb = model.predict_proba(data)
    pos_item = data[pb[:,0] > scope]
    neg_item = data[pb[:,1] > scope]
    raw_data = raw_data.append(pos_item,ignore_index=True)
    raw_data = raw_data.append(neg_item,ignore_index=True)
    label_data = label_data.append(
        pd.DataFrame({"label": [1 for i in range(pos_item.shape[0])]}),
        ignore_index=True
    )
    label_data = label_data.append(
        pd.DataFrame({"label": [1 for i in range(neg_item.shape[0])]}),
        ignore_index=True
    )
    data = data.drop(pos_item.index,inplace=False)
    data = data.drop(neg_item.index,inplace=False)
    model,x_train,x_test,y_train,y_test = train_model(raw_data,label_data,model)
    if epoch != 1:
        last_auc = auc
    auc = roc_auc_score(y_test, model.predict_proba(x_test)[:,1])
    delta = auc - last_auc
    print(f"epoch:{epoch},auc:{auc},last_auc:{last_auc},dalta:{auc - last_auc}")
    print(data.shape)
    print(raw_data.shape)
    epoch += 1
    if abs(delta) < 0.0001 or epoch == 30:
        print(f"stop at epoch:{epoch}!")
        break

model


(39884, 1129)
(50000, 1129)
epoch:1,auc:0.9181336310745656,last_auc:0,dalta:0.9181336310745656
(30930, 1129)
(58954, 1129)
epoch:2,auc:0.9539604129726633,last_auc:0.9181336310745656,dalta:0.03582678189809774
(14321, 1129)
(75563, 1129)
epoch:3,auc:0.9608558262950092,last_auc:0.9539604129726633,dalta:0.006895413322345889
(9772, 1129)
(80112, 1129)
epoch:4,auc:0.9603840444808177,last_auc:0.9608558262950092,dalta:-0.00047178181419149645
(9355, 1129)
(80529, 1129)
epoch:5,auc:0.96011779146354,last_auc:0.9603840444808177,dalta:-0.00026625301727767337
(9240, 1129)
(80644, 1129)
epoch:6,auc:0.9604636876977516,last_auc:0.96011779146354,dalta:0.00034589623421155125
(9160, 1129)
(80724, 1129)
epoch:7,auc:0.959785488433263,last_auc:0.9604636876977516,dalta:-0.0006781992644886436
(9138, 1129)
(80746, 1129)
epoch:8,auc:0.9599198019127004,last_auc:0.959785488433263,dalta:0.00013431347943748673
(9099, 1129)
(80785, 1129)
epoch:9,auc:0.9595734911807661,last_auc:0.9599198019127004,dalta:-0.000346310731

In [7]:
# save model
with open("./models/vote_pesudo_model.pkl", "wb") as file:
    pickle.dump(model,file=file)