In [15]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook, tnrange
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge,LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import xgboost as xgb
import catboost as catb
import os
warnings.filterwarnings("ignore")
%matplotlib inline

### Stacking Level I

In [20]:
##### just stacking
# Load the base level-1 train/test frames, then left-join every other CSV found
# in the level-1 directory onto them by card_id.
path = './datasets/stacking/level1/'
base_files = ('df_stacking_train.csv', 'df_stacking_test.csv')
df_stacking_train = pd.read_csv(path + 'df_stacking_train.csv')
df_stacking_test = pd.read_csv(path + 'df_stacking_test.csv')

# os.listdir returns entries in arbitrary order; sorting makes the merged column
# order (and therefore the feature order seen by the models) reproducible.
# Non-CSV entries (e.g. checkpoint folders) are skipped instead of crashing read_csv.
sublist = sorted(
    name for name in os.listdir(path)
    if name.endswith('.csv') and name not in base_files
)

for sub in sublist:
    df = pd.read_csv(path+sub)
    # Files whose name contains 'train' extend the train frame; all others extend test.
    if 'train' in sub:
        df_stacking_train = df_stacking_train.merge(df,on='card_id',how='left')
    else:
        df_stacking_test = df_stacking_test.merge(df,on='card_id',how='left')

df_stacking_train.shape,df_stacking_test.shape

((201917, 54), (123623, 53))

#### BayesianRidge

In [25]:
# 5-fold stratified CV of a BayesianRidge stacker over the level-1 features.
NFOLDS = 5
# random_state is intentionally not passed: it has no effect when shuffle=False,
# and scikit-learn >= 0.24 raises ValueError when both are supplied.
kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]

ntrain = df_stacking_train.shape[0]
ntest = df_stacking_test.shape[0]
oof_train_pred = np.zeros((ntrain,))
oof_test_pred = np.zeros((ntest,))
oof_test_pred_skf = np.empty((NFOLDS, ntest))
for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_stacking_train,df_stacking_train['is_outlier'])):
    # NOTE(review): `normalize` was removed from scikit-learn linear models in 1.2 —
    # confirm the installed version still accepts it.
    model = BayesianRidge(
        lambda_1=50,
        lambda_2=0.1,
        normalize=True,
        tol=0.00001,
    )
    x_dev = df_stacking_train[tr_features].iloc[dev_index]
    y_dev = df_stacking_train[label].iloc[dev_index]
    x_val = df_stacking_train[tr_features].iloc[val_index]
    y_val = df_stacking_train[label].iloc[val_index]
    # The target is flattened from (n, 1) to 1-D, the shape the estimator expects.
    model.fit(x_dev.values,y_dev.values.ravel())
    oof_test_pred_skf[foldIndex,:] = model.predict(df_stacking_test[tr_features].values).reshape(-1,)
    oof_train_pred[val_index] = model.predict(x_val.values).reshape(-1,)

# Test prediction = mean over the per-fold models.
oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
# RMSE of the out-of-fold predictions against the full training target.
score = np.sqrt((np.sum(np.square(oof_train_pred - df_stacking_train[label].values.reshape(-1,)))/ntrain))

df_sub = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
df_sub['target'] = oof_test_pred
print("score = %s"%score)

score = 3.630509504160885


In [None]:
# Write the BayesianRidge out-of-fold / test predictions to the level-2 directory.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated: previously a failed to_csv left the prediction column behind, where
# the next model's tr_features would pick it up as an input feature.
pred_col = 'stack_bayes_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_bayes_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_bayes_test_pred_%.5f.csv'%score,index=False)

In [None]:
# Write the current submission frame; the CV score is embedded in the file name.
submission_path = './submission/df_stackingII_sub_%.5f.csv'%score
df_sub.to_csv(submission_path, index=False)

#### LinearRegression

In [None]:
# 5-fold stratified CV of a LinearRegression stacker over the level-1 features.
NFOLDS = 5
# random_state is intentionally not passed: it has no effect when shuffle=False,
# and scikit-learn >= 0.24 raises ValueError when both are supplied.
kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]

ntrain = df_stacking_train.shape[0]
ntest = df_stacking_test.shape[0]
oof_train_pred = np.zeros((ntrain,))
oof_test_pred = np.zeros((ntest,))
oof_test_pred_skf = np.empty((NFOLDS, ntest))
for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_stacking_train,df_stacking_train['is_outlier'])):
    # NOTE(review): `normalize` was removed from scikit-learn linear models in 1.2 —
    # confirm the installed version still accepts it.
    model = LinearRegression(normalize=True)
    x_dev = df_stacking_train[tr_features].iloc[dev_index]
    y_dev = df_stacking_train[label].iloc[dev_index]
    x_val = df_stacking_train[tr_features].iloc[val_index]
    y_val = df_stacking_train[label].iloc[val_index]
    model.fit(x_dev.values,y_dev.values)
    # y is (n, 1), so predictions come back 2-D; reshape flattens them.
    oof_test_pred_skf[foldIndex,:] = model.predict(df_stacking_test[tr_features].values).reshape(-1,)
    oof_train_pred[val_index] = model.predict(x_val.values).reshape(-1,)
# Test prediction = mean over the per-fold models.
oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
# RMSE of the out-of-fold predictions against the full training target.
score = np.sqrt((np.sum(np.square(oof_train_pred - df_stacking_train[label].values.reshape(-1,)))/ntrain))

df_sub = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
df_sub['target'] = oof_test_pred
print("score = %s"%score)

In [None]:
# Write the LinearRegression out-of-fold / test predictions to the level-2 directory.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated: previously a failed to_csv left the prediction column behind, where
# the next model's tr_features would pick it up as an input feature.
pred_col = 'stack_lr_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_lr_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_lr_test_pred_%.5f.csv'%score,index=False)

#### SVR

In [None]:
from sklearn.svm import SVR
def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified CV for ``model`` and collect its predictions.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are read from the notebook-level globals ``tr_features``
    and ``label``. The same ``model`` object is re-fitted in every fold.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the label column and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)`` on NumPy arrays.

    Returns
    -------
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : mean of the per-fold test predictions, shape ``(ntest,)``.
    oof_train_pred : out-of-fold prediction for each training row, shape ``(ntrain,)``.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect when shuffle=False, and
    # scikit-learn >= 0.24 raises ValueError when both are supplied.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test matrix is identical for every fold, so extract it once.
    x_test = df_test[tr_features].values
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        x_dev = df_train[tr_features].iloc[dev_index].values
        # Flatten the target to 1-D so the estimator is not handed a column vector.
        y_dev = df_train[label].iloc[dev_index].values.ravel()
        x_val = df_train[tr_features].iloc[val_index].values
        model.fit(x_dev, y_dev)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test).reshape(-1,)
        oof_train_pred[val_index] = model.predict(x_val).reshape(-1,)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)

    score = np.sqrt((np.sum(np.square(oof_train_pred - df_train[label].values.reshape(-1,)))/ntrain))
    return score,oof_test_pred,oof_train_pred

# Keyword arguments forwarded unchanged to SVR.
svr_params = {
    'gamma':0.1,
    'C':1.0,
    'epsilon':0.2,
}
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
# modelKFoldReg reads these two globals.
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]
model = SVR(**svr_params)
score,oof_test_pred,oof_train_pred = modelKFoldReg(df_stacking_train,df_stacking_test,model)

df_sub = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
df_sub['target'] = oof_test_pred
print("score = %s"%score)

In [None]:
# Write the SVR out-of-fold / test predictions to the level-2 directory.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated: previously a failed to_csv left the prediction column behind, where
# the next model's tr_features would pick it up as an input feature.
pred_col = 'stack_svr_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_svr_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_svr_test_pred_%.5f.csv'%score,index=False)

#### LGBMRegressor

In [None]:
def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified CV with early stopping for a LightGBM-style regressor.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are read from the notebook-level globals ``tr_features``
    and ``label``. The same ``model`` object is re-fitted in every fold, and
    predictions use the fold's ``best_iteration_``.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the label column and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : regressor whose ``fit`` accepts ``eval_set`` / ``early_stopping_rounds`` /
        ``verbose`` and which exposes ``best_iteration_`` after fitting.

    Returns
    -------
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : mean of the per-fold test predictions, shape ``(ntest,)``.
    oof_train_pred : out-of-fold prediction for each training row, shape ``(ntrain,)``.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect when shuffle=False, and
    # scikit-learn >= 0.24 raises ValueError when both are supplied.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test features are identical for every fold, so select them once.
    x_test = df_test[tr_features]
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("...........第%s折............."%(foldIndex+1))
        x_dev = df_train[tr_features].iloc[dev_index]
        y_dev = df_train[label].iloc[dev_index]
        x_val = df_train[tr_features].iloc[val_index]
        y_val = df_train[label].iloc[val_index]
        # NOTE(review): newer LightGBM releases moved early stopping / logging to
        # callbacks — confirm these fit() keywords against the installed version.
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=10,verbose=10)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test,num_iteration=model.best_iteration_)
        oof_train_pred[val_index] = model.predict(x_val,num_iteration=model.best_iteration_)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - df_train[label].values.reshape(-1,)))/ntrain))
    return score,oof_test_pred,oof_train_pred

# Keyword arguments forwarded unchanged to lgb.LGBMRegressor.
# NOTE(review): 'device', 'gpu_platform_id' and 'gpu_device_id' pin a specific GPU —
# confirm they match the machine this notebook runs on.
lgb_params ={
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators':500,
    'learning_rate': 0.05,
    'subsample': 0.9,
    'max_depth': 3,
    'min_child_weight': 45,
    'reg_alpha': 100,
    'colsample_bytree': 0.8,
    'min_split_gain': 10,
    'reg_lambda': 100,
    'min_data_in_leaf': 21,
    'verbose': -1,
    'seed':20,
    'bagging_seed':42,
    'device': 'gpu',
    'gpu_platform_id':1,
    'gpu_device_id': 1,
}
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
# modelKFoldReg reads these two globals.
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]

lgb_est = lgb.LGBMRegressor(**lgb_params)
score,oof_test_pred,oof_train_pred = modelKFoldReg(df_stacking_train,df_stacking_test,lgb_est)
print("score = %s"%score)

In [None]:
# Write the LightGBM out-of-fold / test predictions to the level-2 directory.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated: previously a failed to_csv left the prediction column behind, where
# the next model's tr_features would pick it up as an input feature.
pred_col = 'stack_lgb_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_lgb_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_lgb_test_pred_%.5f.csv'%score,index=False)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified CV for ``model`` and collect its predictions.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are read from the notebook-level globals ``tr_features``
    and ``label``. The same ``model`` object is re-fitted in every fold.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the label column and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)`` on NumPy arrays.

    Returns
    -------
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : mean of the per-fold test predictions, shape ``(ntest,)``.
    oof_train_pred : out-of-fold prediction for each training row, shape ``(ntrain,)``.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect when shuffle=False, and
    # scikit-learn >= 0.24 raises ValueError when both are supplied.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test matrix is identical for every fold, so extract it once.
    x_test = df_test[tr_features].values
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        x_dev = df_train[tr_features].iloc[dev_index].values
        # Flatten the target to 1-D so the estimator is not handed a column vector.
        y_dev = df_train[label].iloc[dev_index].values.ravel()
        x_val = df_train[tr_features].iloc[val_index].values
        model.fit(x_dev, y_dev)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test).reshape(-1,)
        oof_train_pred[val_index] = model.predict(x_val).reshape(-1,)

    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - df_train[label].values.reshape(-1,)))/ntrain))
    return score,oof_test_pred,oof_train_pred

# Keyword arguments forwarded unchanged to RandomForestRegressor.
# NOTE(review): newer scikit-learn releases renamed the 'mse' criterion to
# 'squared_error' — confirm against the installed version.
rf_params = {
    'n_estimators':800,
    'max_depth':6,
    'criterion':'mse',
    'bootstrap':True,
    'verbose':1,
    'n_jobs':6,
    'random_state':45
}
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
# modelKFoldReg reads these two globals.
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]
model = RandomForestRegressor(**rf_params)
score,oof_test_pred,oof_train_pred = modelKFoldReg(df_stacking_train,df_stacking_test,model)

print("score = %s"%score)

In [None]:
# Write the RandomForest out-of-fold / test predictions to the level-2 directory.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated: previously a failed to_csv left the prediction column behind, where
# the next model's tr_features would pick it up as an input feature.
pred_col = 'stack_rf_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_rf_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_rf_test_pred_%.5f.csv'%score,index=False)

#### XGBRegressor

In [None]:
def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified CV with early stopping for an XGBoost-style regressor.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are read from the notebook-level globals ``tr_features``
    and ``label``. The same ``model`` object is re-fitted in every fold.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the label column and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : regressor whose ``fit`` accepts ``eval_set`` / ``early_stopping_rounds`` /
        ``verbose``, whose ``predict`` accepts ``ntree_limit`` and which exposes
        ``best_ntree_limit`` after fitting.

    Returns
    -------
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : mean of the per-fold test predictions, shape ``(ntest,)``.
    oof_train_pred : out-of-fold prediction for each training row, shape ``(ntrain,)``.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect when shuffle=False, and
    # scikit-learn >= 0.24 raises ValueError when both are supplied.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test features are identical for every fold, so select them once.
    x_test = df_test[tr_features]
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print('..............第%s折............'%(foldIndex+1))
        x_dev = df_train[tr_features].iloc[dev_index]
        y_dev = df_train[label].iloc[dev_index]
        x_val = df_train[tr_features].iloc[val_index]
        y_val = df_train[label].iloc[val_index]
        # NOTE(review): early_stopping_rounds in fit() and ntree_limit /
        # best_ntree_limit are legacy XGBoost API — confirm against the installed version.
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=50,verbose=100)
        # Use the same tree limit for validation and test predictions so the
        # out-of-fold score describes the model that produces the test output.
        best_limit = model.best_ntree_limit
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test,ntree_limit=best_limit)
        oof_train_pred[val_index] = model.predict(x_val,ntree_limit=best_limit)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - df_train[label].values.reshape(-1,)))/ntrain))
    return score,oof_test_pred,oof_train_pred
    
# Keyword arguments forwarded unchanged to xgb.XGBRegressor.
# NOTE(review): 'reg:linear' is a legacy objective name in XGBoost — confirm the
# installed version still accepts it.
xgb_params = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'n_estimators':500,
    'max_depth': 4,
    'gamma' : 1.45,
    'alpha': 0.1,
    'lambda': 25,
    'subsample': 0.9,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.50,
    'random_state': 2018
}

# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
# modelKFoldReg reads these two globals.
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]

xgb_est = xgb.XGBRegressor(**xgb_params)
# Results are stored under xgb_* names, not the oof_* names used by earlier models.
score,xgb_test_pred,xgb_train_pred = modelKFoldReg(df_stacking_train,df_stacking_test,xgb_est)
print("score = %s"%score)

In [None]:
# Write the XGBoost out-of-fold / test predictions to the level-2 directory.
# The XGBoost cell stores its results in xgb_train_pred / xgb_test_pred; this cell
# previously wrote oof_train_pred / oof_test_pred, which still held the previous
# model's predictions, so the wrong values were saved under the xgb file names.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated (a failed to_csv used to leave the prediction column behind).
pred_col = 'stack_xgb_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = xgb_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_xgb_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = xgb_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_xgb_test_pred_%.5f.csv'%score,index=False)

#### CatBoostRegressor

In [None]:
def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified CV with early stopping for a CatBoost-style regressor.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are read from the notebook-level globals ``tr_features``
    and ``label``. The same ``model`` object is re-fitted in every fold.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the label column and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : regressor whose ``fit`` accepts ``eval_set`` / ``use_best_model`` /
        ``early_stopping_rounds`` / ``verbose``.

    Returns
    -------
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : mean of the per-fold test predictions, shape ``(ntest,)``.
    oof_train_pred : out-of-fold prediction for each training row, shape ``(ntrain,)``.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect when shuffle=False, and
    # scikit-learn >= 0.24 raises ValueError when both are supplied.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test features are identical for every fold, so select them once.
    x_test = df_test[tr_features]
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print('..............第%s折............'%(foldIndex+1))
        x_dev = df_train[tr_features].iloc[dev_index]
        y_dev = df_train[label].iloc[dev_index]
        x_val = df_train[tr_features].iloc[val_index]
        y_val = df_train[label].iloc[val_index]
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],use_best_model=True,early_stopping_rounds=30,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test)
        oof_train_pred[val_index] = model.predict(x_val)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - df_train[label].values.reshape(-1,)))/ntrain))
    return score,oof_test_pred,oof_train_pred

# Keyword arguments forwarded unchanged to catb.CatBoostRegressor.
cat_params = {
    'n_estimators':500,
    'learning_rate':0.01,
    'max_depth':4,
    'loss_function':'RMSE',
    'eval_metric':'RMSE',
    'logging_level':'Verbose',
    'random_state':2018,
    'bagging_temperature':0.8,
    'l2_leaf_reg':200,
    'od_type':'Iter',
    'thread_count':16
}
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
# modelKFoldReg reads these two globals.
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]

cat_est = catb.CatBoostRegressor(**cat_params)
# Results are stored under cat_* names, not the oof_* names used by earlier models.
score,cat_test_pred,cat_train_pred = modelKFoldReg(df_stacking_train,df_stacking_test,cat_est)
print("score = %s"%score)

In [None]:
# Write the CatBoost out-of-fold / test predictions to the level-2 directory.
# The CatBoost cell stores its results in cat_train_pred / cat_test_pred; this cell
# previously wrote oof_train_pred / oof_test_pred, which still held an earlier
# model's predictions, so the wrong values were saved under the cat file names.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated (a failed to_csv used to leave the prediction column behind).
pred_col = 'stack_cat_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = cat_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_cat_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = cat_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_cat_test_pred_%.5f.csv'%score,index=False)

#### ExtraTreesRegressor

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified CV for ``model`` and collect its predictions.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are read from the notebook-level globals ``tr_features``
    and ``label``. The same ``model`` object is re-fitted in every fold.

    Parameters
    ----------
    df_train : DataFrame holding the feature columns, the label column and ``is_outlier``.
    df_test : DataFrame holding the feature columns.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)`` on NumPy arrays.

    Returns
    -------
    score : RMSE of the out-of-fold predictions against the training target.
    oof_test_pred : mean of the per-fold test predictions, shape ``(ntest,)``.
    oof_train_pred : out-of-fold prediction for each training row, shape ``(ntrain,)``.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect when shuffle=False, and
    # scikit-learn >= 0.24 raises ValueError when both are supplied.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test matrix is identical for every fold, so extract it once.
    x_test = df_test[tr_features].values
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        x_dev = df_train[tr_features].iloc[dev_index].values
        # Flatten the target to 1-D so the estimator is not handed a column vector.
        y_dev = df_train[label].iloc[dev_index].values.ravel()
        x_val = df_train[tr_features].iloc[val_index].values
        model.fit(x_dev, y_dev)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test).reshape(-1,)
        oof_train_pred[val_index] = model.predict(x_val).reshape(-1,)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)

    score = np.sqrt((np.sum(np.square(oof_train_pred - df_train[label].values.reshape(-1,)))/ntrain))
    return score,oof_test_pred,oof_train_pred

# Keyword arguments forwarded unchanged to ExtraTreesRegressor.
# NOTE(review): newer scikit-learn releases renamed the 'mse' criterion to
# 'squared_error' — confirm against the installed version.
extr_params = {
    'n_estimators':800,
    'max_depth':5,
    'criterion':'mse',
    'bootstrap':True,
    'verbose':1,
    'n_jobs':6,
    'random_state':34
}
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
# modelKFoldReg reads these two globals.
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]
model = ExtraTreesRegressor(**extr_params)
score,oof_test_pred,oof_train_pred = modelKFoldReg(df_stacking_train,df_stacking_test,model)

print("score = %s"%score)

In [None]:
# Write the ExtraTrees out-of-fold / test predictions to the level-2 directory.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated: previously a failed to_csv left the prediction column behind, where
# the next model's tr_features would pick it up as an input feature.
pred_col = 'stack_extr_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level2/df_stack_extr_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level2/df_stack_extr_test_pred_%.5f.csv'%score,index=False)

### Stacking Level II

In [None]:
# Load the base level-2 train/test frames, then left-join every other CSV found
# in the level-2 directory onto them by card_id.
path = './datasets/stacking/level2/'
base_files = ('df_stacking_train.csv', 'df_stacking_test.csv')
df_stacking_train = pd.read_csv(path + 'df_stacking_train.csv')
df_stacking_test = pd.read_csv(path + 'df_stacking_test.csv')

# os.listdir returns entries in arbitrary order; sorting makes the merged column
# order (and therefore the feature order seen by the models) reproducible.
# Non-CSV entries (e.g. checkpoint folders) are skipped instead of crashing read_csv.
sublist = sorted(
    name for name in os.listdir(path)
    if name.endswith('.csv') and name not in base_files
)

for sub in sublist:
    df = pd.read_csv(path+sub)
    # Files whose name contains 'train' extend the train frame; all others extend test.
    if 'train' in sub:
        df_stacking_train = df_stacking_train.merge(df,on='card_id',how='left')
    else:
        df_stacking_test = df_stacking_test.merge(df,on='card_id',how='left')

df_stacking_train.head()

#### BayesianRidge

In [None]:
# 5-fold stratified CV of a BayesianRidge stacker over the level-2 features.
NFOLDS = 5
# random_state is intentionally not passed: it has no effect when shuffle=False,
# and scikit-learn >= 0.24 raises ValueError when both are supplied.
kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
# Binary flag (target < -30) used only to stratify the folds.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df_stacking_train['is_outlier'] = (df_stacking_train.target<-30).astype(int)
label = ['target']
tr_features = [_f for _f in df_stacking_train.columns if _f not in ['card_id','target','is_outlier']]

ntrain = df_stacking_train.shape[0]
ntest = df_stacking_test.shape[0]
oof_train_pred = np.zeros((ntrain,))
oof_test_pred = np.zeros((ntest,))
oof_test_pred_skf = np.empty((NFOLDS, ntest))
for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_stacking_train,df_stacking_train['is_outlier'])):
    # NOTE(review): `normalize` was removed from scikit-learn linear models in 1.2 —
    # confirm the installed version still accepts it.
    model = BayesianRidge(normalize=True)
    x_dev = df_stacking_train[tr_features].iloc[dev_index]
    y_dev = df_stacking_train[label].iloc[dev_index]
    x_val = df_stacking_train[tr_features].iloc[val_index]
    y_val = df_stacking_train[label].iloc[val_index]
    # The target is flattened from (n, 1) to 1-D, the shape the estimator expects.
    model.fit(x_dev.values,y_dev.values.ravel())
    oof_test_pred_skf[foldIndex,:] = model.predict(df_stacking_test[tr_features].values).reshape(-1,)
    oof_train_pred[val_index] = model.predict(x_val.values).reshape(-1,)
# Test prediction = mean over the per-fold models.
oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
# RMSE of the out-of-fold predictions against the full training target.
score = np.sqrt((np.sum(np.square(oof_train_pred - df_stacking_train[label].values.reshape(-1,)))/ntrain))

df_sub = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
df_sub['target'] = oof_test_pred
print("score = %s"%score)

In [None]:
# Write the level-II BayesianRidge out-of-fold / test predictions to disk.
# NOTE(review): these files go into the level1 directory, which the level-I loading
# cell scans for input features — confirm this feedback into level I is intended.
# The output frames are built separately so df_stacking_train / df_stacking_test are
# never mutated (a failed to_csv used to leave the prediction column behind).
pred_col = 'stack_bayes_pred_%.5f'%score

train_out = pd.DataFrame(data=df_stacking_train['card_id'].values,columns=['card_id'])
train_out[pred_col] = oof_train_pred
train_out.to_csv('./datasets/stacking/level1/df_stack_bayes_train_pred_%.5f.csv'%score,index=False)

test_out = pd.DataFrame(data=df_stacking_test['card_id'].values,columns=['card_id'])
test_out[pred_col] = oof_test_pred
test_out.to_csv('./datasets/stacking/level1/df_stack_bayes_test_pred_%.5f.csv'%score,index=False)