对4万条业务A数据采用相同的方式做特征工程，采用XGBoost建立模型B（以A数据为训练数据、B数据为验证数据进行调参），将模型B作为迁移用的源模型；再在模型B的基础上利用4千条业务B数据进行微调，得到模型C，使模型C能更好地拟合这4千条业务B数据。

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc,roc_auc_score
import xgboost as xgb
from sklearn.cross_validation import train_test_split 



In [4]:
# Load the business-A training set and the business-B training / test sets.
a_train, b_train, b_test = (
    pd.read_csv(csv_path)
    for csv_path in ('A_train.csv', 'B_train.csv', 'B_test.csv')
)

In [5]:
# Detach the 'flag' column (used later as the training label) from both
# training frames; pop() returns the column and removes it in place.
a_labels = a_train.pop('flag')
b_labels = b_train.pop('flag')

# The 'no' column is not used as a feature, so remove it from the training frames.
del a_train['no']
del b_train['no']

# Keep the test-set 'no' values as the first column of the submission table,
# and remove them from the test features.
submit = b_test.pop('no').to_frame()

In [10]:
# Fill missing values with the sentinel -999 (the same sentinel that
# dumyuserfeature treats as "missing").
a_train = a_train.fillna(value=-999)
b_train = b_train.fillna(value=-999)
b_test = b_test.fillna(value=-999)

In [11]:
# 处理离散化特征

def dumyuserfeature(train, max_levels=20, missing_value=-999, min_present_ratio=0.1):
    """Append one-hot (dummy) columns for the low-cardinality columns of ``train``.

    A column is treated as discrete when it has fewer than ``max_levels``
    distinct values. Discrete columns whose number of non-missing entries is
    below ``min_present_ratio * len(train)`` are skipped. For every remaining
    discrete column, ``pd.get_dummies`` columns named ``<col>__<value>`` are
    appended after the original columns; the original columns are kept.

    Args:
        train: Input DataFrame. It is not modified.
        max_levels: Exclusive upper bound on the distinct-value count for a
            column to be considered discrete.
        missing_value: Sentinel that marks a missing entry.
        min_present_ratio: Minimum fraction of non-missing rows required for
            a discrete column to be encoded.

    Returns:
        A new DataFrame holding the original columns followed by the dummy
        columns, in the order the source columns appear in ``train``.

    Raises:
        ValueError: If a generated dummy column name collides with an
            existing column name.
    """
    n_rows = len(train)
    dummy_frames = []
    for col in train.columns:
        series = train[col]
        # nunique() counts distinct non-NaN values without grouping the whole frame.
        if series.nunique() >= max_levels:
            continue
        n_present = (series != missing_value).sum()
        if n_present < n_rows * min_present_ratio:
            continue
        # The trailing '_' plus get_dummies' default separator gives '<col>__<value>'.
        dummy_frames.append(pd.get_dummies(series, prefix=str(col) + '_'))
    print(len(dummy_frames), '个离散化的特征')
    # A single concat avoids re-copying the growing frame once per encoded column;
    # verify_integrity makes a column-name collision an error instead of a silent duplicate.
    return pd.concat([train] + dummy_frames, axis=1, verify_integrity=True)

# Run the same dummy-encoding step on each frame, in A-train, B-train, B-test order
# (each call prints how many columns it encoded).
a_train_dummy, b_train_dummy, b_test_dummy = map(
    dumyuserfeature, (a_train, b_train, b_test)
)

313 个离散化的特征
295 个离散化的特征
285 个离散化的特征


In [12]:
# Keep only the columns present in all three encoded frames, preserving the
# column order of b_train_dummy and excluding the 'no' / 'flag' columns.
col = [
    name
    for name in b_train_dummy.columns
    if name in b_test_dummy.columns
    and name in a_train_dummy.columns
    and name not in ('no', 'flag')
]

In [13]:
# Restrict every frame to the shared feature columns.
a_train_dummy_final = a_train_dummy.loc[:, col]
b_train_dummy_final = b_train_dummy.loc[:, col]
b_test_dummy_final = b_test_dummy.loc[:, col]

# Evaluation sets reported during training: dataset A as 'train', dataset B as 'eval'.
watchlist = [
    (xgb.DMatrix(a_train_dummy_final, label=a_labels), 'train'),
    (xgb.DMatrix(b_train_dummy_final, label=b_labels), 'eval'),
]

In [16]:
# Train the source model (model B) used for transfer:
# fit on dataset A and monitor AUC on dataset B.

Trate = 0.15
params = {
    'booster': 'gbtree',
    'eta': 0.5,
    'max_depth': 5,
    'max_delta_step': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'base_score': Trate,
    'objective': 'binary:logistic',
    'lambda': 3,
    'alpha': 8,
    'eval_metric': 'auc',
}

# Early stopping watches the last entry of the watchlist ('eval', i.e. dataset B).
# NOTE(review): the xgboost docs state that train() with early stopping returns
# the model from the last round, not the best one -- confirm whether the later
# fine-tuning should start from the best iteration instead.
model_B = xgb.train(
    params,
    xgb.DMatrix(a_train_dummy_final, label=a_labels),
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=100,
    maximize=True,
    verbose_eval=True,
)
#train-auc:0.965377	eval-auc:0.571919

[0]	train-auc:0.698705	eval-auc:0.51254
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 100 rounds.
[1]	train-auc:0.722395	eval-auc:0.522195
[2]	train-auc:0.732005	eval-auc:0.523028
[3]	train-auc:0.743631	eval-auc:0.531681
[4]	train-auc:0.75025	eval-auc:0.530939
[5]	train-auc:0.759242	eval-auc:0.532871
[6]	train-auc:0.763683	eval-auc:0.538011
[7]	train-auc:0.76731	eval-auc:0.539708
[8]	train-auc:0.772315	eval-auc:0.538986
[9]	train-auc:0.777057	eval-auc:0.536813
[10]	train-auc:0.780032	eval-auc:0.538695
[11]	train-auc:0.782809	eval-auc:0.539869
[12]	train-auc:0.784513	eval-auc:0.539338
[13]	train-auc:0.78729	eval-auc:0.540207
[14]	train-auc:0.789914	eval-auc:0.540083
[15]	train-auc:0.791993	eval-auc:0.539705
[16]	train-auc:0.795825	eval-auc:0.540057
[17]	train-auc:0.797984	eval-auc:0.540508
[18]	train-auc:0.798983	eval-auc:0.541031
[19]	train-auc:0.800968	eval-auc:0.544316
[20]	train-auc:0.803208	eval-auc

In [17]:
# # 存储模型
# import pickle
# from sklearn.externals import joblib
# joblib.dump(model_B, 'model_transfer.pkl')
# model_B = joblib.load('model_transfer.pkl')  

In [18]:
# Fine-tune the source model (model_B) by continuing training on dataset B.
# There was no time for careful tuning; per the original note, the online
# result came from a single model tried at random (20 iterations).

# One row per fine-tuned model:
# (base_score, max_depth, subsample, alpha, num_boost_round)
_finetune_settings = [
    (0.2, 4, 1, 5, 25),
    (0.2, 5, 0.85, 5, 40),
    (0.2, 4, 1, 5, 28),
    (0.25, 5, 1, 6, 30),
]

_finetuned = []
for Trate, _depth, _subsample, _alpha, _rounds in _finetune_settings:
    params = {
        'booster': 'gbtree',
        'eta': 0.05,
        'max_depth': _depth,
        'max_delta_step': 0,
        'subsample': _subsample,
        'colsample_bytree': 0.9,
        'base_score': Trate,
        'objective': 'binary:logistic',
        'lambda': 3,
        'alpha': _alpha,
        'eval_metric': 'auc',
    }
    # Each run starts from model_B and adds _rounds boosting rounds on dataset B.
    _finetuned.append(
        xgb.train(
            params,
            xgb.DMatrix(b_train_dummy_final, b_labels),
            num_boost_round=_rounds,
            xgb_model=model_B,
            maximize=True,
            verbose_eval=True,
        )
    )

(
    model_phase_1_cla_1,
    model_phase_1_cla_2,
    model_phase_1_cla_3,
    model_phase_1_cla_4,
) = _finetuned

In [20]:
# Ensemble by plain averaging; the four models were picked arbitrarily.
pred, pred1, pred2, pred3 = (
    booster.predict(xgb.DMatrix(b_test_dummy_final))
    for booster in (
        model_phase_1_cla_1,
        model_phase_1_cla_2,
        model_phase_1_cla_3,
        model_phase_1_cla_4,
    )
)
submit['pred'] = (pred + pred1 + pred2 + pred3) / 4

In [21]:
# Write the submission table ('no' and 'pred' columns) to CSV without the index column.
submit.to_csv('transfer_submit.csv',index=False)

In [None]:
# 线上auc：0.589997