In [None]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostRegressor,ExtraTreesRegressor,RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge,LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime
import gc
import os
# Directory prefix that the CSV inputs below are read from.
DATA_PATH = './datasets/'
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from numpy/pandas/sklearn.
warnings.filterwarnings("ignore")
# IPython line magic enabling matplotlib integration; no backend is named here,
# so IPython picks its default one.
%matplotlib

In [None]:
# Base table; 'first_active_month' is read as a plain string column.
# (`np.str` was only an alias of the builtin `str` and no longer exists in
# current NumPy releases, so the builtin is used directly.)
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left-join both extra feature tables onto the base table by card_id.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Left-join every feature file found in the feature2 directory.
path = './datasets/feature2/'
# os.listdir returns entries in arbitrary order, so sort them to keep the
# resulting column order reproducible between runs. Hidden entries (e.g.
# .ipynb_checkpoints, .DS_Store) and sub-directories are skipped because
# read_csv cannot load them.
sublist = sorted(
    entry for entry in os.listdir(path)
    if not entry.startswith('.') and os.path.isfile(os.path.join(path, entry))
)
for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

# Use -999 as the sentinel for missing values first, then for +/- infinity.
df_data = df_data.fillna(-999).replace([np.inf,-np.inf],-999)

In [None]:
# Split the combined frame by the is_test flag. The explicit .copy() makes
# each part an independent frame, so adding columns to it later is not a
# write onto a filtered slice of df_data.
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1].copy()
# 1 where the target is below -30, else 0 (builtin int: `np.int` was just an
# alias of it and has been removed from NumPy).
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Model features: every numeric column that is not an id / label / bookkeeping
# column. An explicit numeric check is used instead of `dtype != 'object'` so
# text columns stored with a non-object string dtype are excluded as well.
tr_features = [
    _f for _f in df_train.columns
    if _f not in dropCols and pd.api.types.is_numeric_dtype(df_train[_f].dtype)
]
# The combined frame is no longer needed once it has been split.
del df_data
gc.collect()

print(len(tr_features))

In [None]:
def modelKFoldReg(df_train,df_test,model,n_splits=5):
    """Fit ``model`` with stratified K-fold CV and collect out-of-fold predictions.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column(s) come from the module-level ``tr_features`` and
    ``label`` names.

    Parameters
    ----------
    df_train : DataFrame holding the ``tr_features``, ``label`` and
        ``is_outlier`` columns.
    df_test : DataFrame holding the ``tr_features`` columns.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; the same
        object is re-fitted on every fold.
    n_splits : number of folds (default 5, the value previously hard-coded).

    Returns
    -------
    model : the estimator as fitted on the last fold.
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : test predictions averaged over the folds, shape (ntest,).
    oof_train_pred : out-of-fold predictions for the training rows, shape (ntrain,).
    """
    # The folds are not shuffled, so no random_state is passed: recent
    # scikit-learn releases raise ValueError when random_state is combined
    # with shuffle=False (it never had any effect in that configuration).
    kfold = StratifiedKFold(n_splits=n_splits,shuffle=False)

    # Extract the arrays once instead of re-slicing the DataFrames every fold.
    x_train = df_train[tr_features].values
    # Flatten the target to 1-D; regressors expect y of shape (n_samples,).
    y_train = df_train[label].values.reshape(-1,)
    x_test = df_test[tr_features].values

    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((n_splits, ntest))
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(x_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        model.fit(x_train[dev_index], y_train[dev_index])
        # One row of test predictions per fold; averaged after the loop.
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test).reshape(-1,)
        # Each training row is predicted by the fold model that did not see it.
        oof_train_pred[val_index] = model.predict(x_train[val_index]).reshape(-1,)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)

    # Root-mean-squared error over all out-of-fold predictions.
    score = np.sqrt(np.mean(np.square(oof_train_pred - y_train)))
    return model,score,oof_test_pred,oof_train_pred

# Hyper-parameters for the AdaBoost regressor; no base estimator is given,
# so scikit-learn's default one is used.
ada_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    loss='square',
    random_state=42,
)

# Build the regressor and run it through the K-fold routine defined above.
model = AdaBoostRegressor(**ada_params)
(model,
 score,
 oof_test_pred,
 oof_train_pred) = modelKFoldReg(df_train, df_test, model)

# Report the cross-validated score.
print("score = %s" % (score,))

In [None]:
# Attach the out-of-fold (train) and fold-averaged (test) predictions to the
# frames and write them out, keyed by card_id, with the CV score embedded in
# both the column name and the file names.
# NOTE(review): the column/file names say "rf"/"h2orf" although the model built
# in this notebook is an AdaBoostRegressor; the strings are left unchanged in
# case downstream code depends on them -- confirm and rename if appropriate.
pred_col = 'oof_rf_pred_%.5f'%score
stacking_dir = './datasets/stacking/'
# Create the output directory if it is missing, so the run does not fail at
# the very last step after the whole cross-validation has completed.
os.makedirs(stacking_dir, exist_ok=True)

df_train[pred_col] = oof_train_pred
df_test[pred_col] = oof_test_pred
# to_csv is called with its defaults, so the DataFrame index is written too.
df_train[['card_id',pred_col]].to_csv(stacking_dir+'h2orf_train_pred_%.5f.csv'%score)
df_test[['card_id',pred_col]].to_csv(stacking_dir+'h2orf_test_pred_%.5f.csv'%score)