In [None]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook, tnrange
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge,LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import xgboost as xgb
import catboost as catb
import os
# Root directory that the feature CSV files in the cells below are read from.
DATA_PATH = './datasets/'
# Silence every warning category for the whole session (deprecation notices included).
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### 模型训练

In [None]:
# Base frame. first_active_month is read as a string column. The builtin `str`
# is used because the `np.str` alias was removed in NumPy 1.24.
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left joins on card_id keep every row of df_data even when a feature file
# has no entry for that card.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Merge every CSV found in feature2/. The listing is sorted so the resulting
# column order is the same on every machine (os.listdir order is arbitrary),
# and non-CSV entries (e.g. checkpoint folders) are skipped instead of
# crashing read_csv.
path = DATA_PATH+'feature2/'
sublist = sorted(_name for _name in os.listdir(path) if _name.endswith('.csv'))
for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

In [None]:
# Category statistics per card, read from the shared DATA_PATH root like the
# other feature files instead of a separately hard-coded directory.
df_cate_statics = pd.read_csv(DATA_PATH+'df_cate_statics.csv')
df_data = df_data.merge(df_cate_statics,on='card_id',how='left')

In [None]:
# Feature files stored under cate_fm/, resolved against the shared DATA_PATH
# root instead of a separately hard-coded directory.
df_cate_merchantCate_fm = pd.read_csv(DATA_PATH+'cate_fm/df_hist_new_cate_merchantCate_fm_feat.csv')
df_cate_merchant_fm = pd.read_csv(DATA_PATH+'cate_fm/df_hist_new_cate_merchant_fm_feat.csv')

# Left-join both onto the main frame by card_id.
df_data = df_data.merge(df_cate_merchantCate_fm,on='card_id',how='left')
df_data = df_data.merge(df_cate_merchant_fm,on='card_id',how='left')

In [None]:
# Load the four card-level merchant / city feature tables.
df_card_merchant_statics = pd.read_csv(DATA_PATH + 'df_card_merchant_statics.csv')
df_card_merchant_vec = pd.read_csv(DATA_PATH + 'df_card_merchant_vec.csv')
df_card_city_statics = pd.read_csv(DATA_PATH + 'df_card_city_statics.csv')
df_card_merchant_features = pd.read_csv(DATA_PATH + 'df_card_merchant_features.csv')

# Left-join them onto the main frame by card_id, in this fixed order
# (features, vec, merchant statics, city statics).
for df_extra in (
    df_card_merchant_features,
    df_card_merchant_vec,
    df_card_merchant_statics,
    df_card_city_statics,
):
    df_data = df_data.merge(df_extra, on='card_id', how='left')

In [None]:
# Split the combined frame back into train and test rows. Explicit copies are
# taken because both frames receive new columns later on; assigning into a
# filtered selection of df_data is what triggers SettingWithCopyWarning.
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1].copy()
# Binary flag for rows whose target is below -30; used to stratify the CV folds.
# The builtin `int` is used because the `np.int` alias was removed in NumPy 1.24.
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
# Identifier, label and bookkeeping columns that must never be used as features.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Candidate features: every remaining column that is not of object dtype.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']
print(len(tr_features))

In [None]:
from feature_selector import FeatureSelector

# Feature selection over the candidate columns, with the raw target as label.
fs = FeatureSelector(
    data=df_train[tr_features],
    labels=df_train['target'].values,
)

# Run the five identification steps; the removal itself is requested by
# fs.remove(...) further down.
fs.identify_missing(missing_threshold=0.12)
fs.identify_collinear(correlation_threshold=0.98)
fs.identify_zero_importance(
    task='regression',
    eval_metric='rmse',
    n_iterations=10,
    early_stopping=True,
)
fs.identify_low_importance(cumulative_importance=0.99)
fs.identify_single_unique()

# Apply every identification method at once and keep the surviving columns
# as the new feature list.
tr_removed = fs.remove(methods='all')
tr_features = tr_removed.columns.tolist()

# Displayed as the cell output: number of features kept.
len(tr_features)

In [None]:
%%time

def modelKFoldReg(df_train,df_test,model,features=None,target=None,n_splits=5):
    """Fit ``model`` with stratified K-fold CV and collect out-of-fold predictions.

    Folds are stratified on ``df_train['is_outlier']``, so that column must be
    present in ``df_train``. In each fold the model is refit on the development
    rows with the validation rows as an early-stopping evaluation set, then used
    to predict both the validation rows and the full test frame.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the target column(s)
        and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : estimator whose ``fit`` accepts ``eval_set``,
        ``early_stopping_rounds`` and ``verbose``, and which has ``predict``.
    features : optional list of feature column names; defaults to the
        notebook-level ``tr_features``.
    target : optional target column selector; defaults to the notebook-level
        ``label``.
    n_splits : number of CV folds (default 5).

    Returns
    -------
    (model, score, oof_test_pred, oof_train_pred) where ``model`` is the same
    instance as passed in (last fitted on the final fold), ``score`` is the
    RMSE of the out-of-fold train predictions against the target,
    ``oof_test_pred`` is the mean of the per-fold test predictions and
    ``oof_train_pred`` holds one out-of-fold prediction per training row.
    """
    feature_cols = tr_features if features is None else features
    target_cols = label if target is None else target

    # With shuffle=False the folds are already deterministic. random_state must
    # not be passed in that case: scikit-learn >= 0.24 raises ValueError for
    # the combination, and older versions simply ignored it.
    kfold = StratifiedKFold(n_splits=n_splits,shuffle=False)

    # Column selections do not change between folds, so build them once.
    x_all = df_train[feature_cols]
    y_all = df_train[target_cols]
    x_test = df_test[feature_cols]

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((n_splits, ntest))
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        x_dev, y_dev = x_all.iloc[dev_index], y_all.iloc[dev_index]
        x_val, y_val = x_all.iloc[val_index], y_all.iloc[val_index]
        # NOTE(review): recent XGBoost releases expect early_stopping_rounds on
        # the estimator constructor rather than on fit() - confirm against the
        # installed version.
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=50,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test)
        oof_train_pred[val_index] = model.predict(x_val)
    # Average the per-fold test predictions into a single vector.
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    # RMSE over all out-of-fold training predictions.
    y_true = y_all.values.reshape(-1,)
    score = np.sqrt((np.sum(np.square(oof_train_pred - y_true))/ntrain))
    return model,score,oof_test_pred,oof_train_pred
    
# Hyper-parameters for the XGBoost regressor.
# NOTE(review): newer XGBoost releases name this objective 'reg:squarederror';
# confirm against the installed version before changing it.
xgb_params = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'n_estimators':5000,
    'max_depth': 10,
    'gamma' : 1.45,
    'alpha': 0.1,
    'lambda': 0.3,
    'subsample': 0.9,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.50,
    'random_state': 2018
}
xgb_est = xgb.XGBRegressor(**xgb_params)
# Cross-validated training: returns the fitted estimator, the out-of-fold RMSE
# and the out-of-fold / fold-averaged predictions.
xgb_est,score,oof_test_pred,oof_train_pred = modelKFoldReg(df_train,df_test,xgb_est)

# Store the fold-averaged predictions as the test target.
df_test['target'] = oof_test_pred

# Write the submission file, tagged with the CV score. The directory is created
# first so a missing folder cannot fail the write after the long training run.
submission_dir = './submission/'
os.makedirs(submission_dir,exist_ok=True)
df_sub = df_test[['card_id','target']]
df_sub.to_csv(os.path.join(submission_dir,'df_xgb_sub_%.5f.csv'%score),index=False)

In [None]:
# Report the out-of-fold score of the run above.
score_message = "score = %s" % (score,)
print(score_message)

In [None]:
# Export the level-1 stacking inputs: one file with the out-of-fold train
# predictions and one with the fold-averaged test predictions, both keyed by
# card_id and tagged with the CV score.
oof_col = 'oof_xgb_pred_%.5f'%score
level1_dir = DATA_PATH+'stacking/level1/'
os.makedirs(level1_dir,exist_ok=True)

# The export frames are built on the side instead of adding a temporary column
# to df_train / df_test and dropping it afterwards, so a failed write cannot
# leave a stray prediction column behind in the training data.
df_train[['card_id']].assign(**{oof_col:oof_train_pred}).to_csv(
    os.path.join(level1_dir,'df_xgb_train_pred_%.5f.csv'%score),index=False)
df_test[['card_id']].assign(**{oof_col:oof_test_pred}).to_csv(
    os.path.join(level1_dir,'df_xgb_test_pred_%.5f.csv'%score),index=False)

In [None]:
# Feature importance analysis: plot at most 100 features of the fitted XGBoost model.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(xgb_est, ax=ax, height=0.8, max_num_features=100)
ax.grid(False)
# Set the title on the axes that was just drawn on.
ax.set_title("XGB - Feature Importance", fontsize=10)
plt.show()