In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# XGBoost has its own built-in handling for missing values, so not every
# missing value has to be imputed before training. Values that are easy to
# fill in from a business point of view are still worth filling, though.
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler

# The two gap features are excluded from every feature matrix: according to
# the original author they tend to cause over-fitting. They could also be kept
# and the decision made from tuning results / feature selection instead.
_GAP_COLS = ['day_gap_before', 'day_gap_after']
# Columns that are not features in the labelled (training) sets.
_TRAIN_DROP_COLS = ['user_id', 'label'] + _GAP_COLS
# Identifier columns of the test set; they are kept aside for the output file.
_TEST_ID_COLS = ['user_id', 'coupon_id', 'date_received']

# Load the two labelled sets and recode the label -1 as 0. The result of
# replace() is assigned back to the column instead of calling
# `df.label.replace(..., inplace=True)`: that chained in-place form operates
# on an intermediate object and does not update the DataFrame under pandas
# Copy-on-Write.
dataset1 = pd.read_csv(r'E:\code\o2o\data\dataset1.csv')
dataset1['label'] = dataset1['label'].replace(-1, 0)
dataset2 = pd.read_csv(r'E:\code\o2o\data\dataset2.csv')
dataset2['label'] = dataset2['label'].replace(-1, 0)
# dataset3 is the test set; no label column is read from it here.
dataset3 = pd.read_csv(r'E:\code\o2o\data\dataset3.csv')

dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)

# Merge training set 1 and training set 2 into the full training set that is
# used once the parameters have been tuned.
dataset12 = pd.concat([dataset1, dataset2], axis=0)

# Split each labelled set into target (label) and feature matrix.
dataset1_y = dataset1.label
dataset1_x = dataset1.drop(_TRAIN_DROP_COLS, axis=1)
dataset2_y = dataset2.label
dataset2_x = dataset2.drop(_TRAIN_DROP_COLS, axis=1)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(_TRAIN_DROP_COLS, axis=1)

# Take an explicit copy of the identifier columns: a prediction column is
# added to this frame later, and writing into a plain slice of dataset3
# triggers SettingWithCopyWarning.
dataset3_preds = dataset3[_TEST_ID_COLS].copy()
dataset3_x = dataset3.drop(_TEST_ID_COLS + _GAP_COLS, axis=1)

print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape,dataset12_x.shape)



In [None]:
# Convert the pandas objects into the DMatrix type that xgboost consumes.
# NOTE: this rebinds dataset1/dataset2/dataset12/dataset3 from DataFrames to
# DMatrix objects; the *_x / *_y frames keep the original pandas data.
dataset1=xgb.DMatrix(dataset1_x,label=dataset1_y)
dataset2=xgb.DMatrix(dataset2_x,label=dataset2_y)
dataset12=xgb.DMatrix(dataset12_x,label=dataset12_y)
dataset3=xgb.DMatrix(dataset3_x)

# These parameters were tuned by the original author using training set 1 and
# training set 2 (one for training, one for validation). Tuning xgboost is
# time-consuming, so the search is not repeated here and the original
# author's values are used directly.
params={'booster':'gbtree',
       'objective':'rank:pairwise',
       'eval_metric':'auc',
       'gamma':0.1,
       'min_child_weight':1.1,
       'max_depth':5,
       'lambda':10,
       'subsample':0.7,
       'colsample_bytree':0.7,
       'colsample_bylevel':0.7,
       'eta':0.01,
       'tree_method':'exact',
       'seed':0,
       'nthread':12}

# Train on the merged training set; the watchlist only reports the metric on
# the training data itself.
watchlist=[(dataset12,'train')]
model=xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist)

# Predict the test set.
# Work on an owned copy so the column assignments below never write into a
# slice of another DataFrame (avoids SettingWithCopyWarning).
dataset3_preds=dataset3_preds.copy()
dataset3_preds['label']=model.predict(dataset3)
# Rescale the raw scores to [0, 1]. MinMaxScaler needs a 2-D (n, 1) input and
# returns the same shape, so the result is flattened before being stored.
# `.values` is used instead of `np.array(...)` because `np` is never imported.
scaled_scores=MinMaxScaler().fit_transform(dataset3_preds['label'].values.reshape(-1,1))
dataset3_preds['label']=scaled_scores.ravel()
dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)

# NOTE: the summary is computed but not shown, because only the last
# expression of a notebook cell is displayed.
dataset3_preds.describe()

# Write the predictions without index and without a header row.
dataset3_preds.to_csv(r'E:\code\o2o\data\xgb_preds.csv',index=False,header=False)

# Save the feature scores.
# The scores rank the features by importance and can be used as one way of
# doing feature selection.
feature_score=model.get_fscore()
feature_score=sorted(feature_score.items(),key=lambda x:x[1],reverse=True)

# One "feature,score" CSV line per feature, highest score first.
fs=['{0},{1}\n'.format(key,value) for (key,value) in feature_score]

with open(r'E:\code\o2o\data\xgb_feature_score.csv','w') as f:
    # write() for the single header line; writelines() on a str would iterate
    # it character by character.
    f.write('feature,score\n')
    f.writelines(fs)

feature_score

