In [None]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from feature_selector import FeatureSelector
from tqdm import tqdm_notebook, tnrange
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge,LinearRegression
from scipy.stats import ks_2samp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import xgboost as xgb
import catboost as catb
import os
# Root directory of the precomputed feature CSVs read by the loading cell below.
fea_path = './datasets/'
# Silence every Python warning for the rest of the session; note this also
# hides deprecation notices from pandas / NumPy / scikit-learn.
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
def _merge_on_card_id(base, csv_paths):
    """Left-join every CSV in ``csv_paths`` onto ``base`` by ``card_id``.

    Files are read and merged one at a time, in the given order, so only a
    single auxiliary frame is alive at any moment.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame containing a ``card_id`` column.
    csv_paths : iterable of str
        Paths of CSV files, each containing a ``card_id`` column.

    Returns
    -------
    pandas.DataFrame
        ``base`` with the columns of every file appended (left join).
    """
    for csv_path in csv_paths:
        base = base.merge(pd.read_csv(csv_path),on='card_id',how='left')
    gc.collect()
    return base


# Base table. `np.str` was only an alias of the builtin `str` and was removed
# in NumPy 1.24, so the builtin is used directly.
df_data = pd.read_csv(fea_path+'df_data.csv',dtype={'first_active_month':str})

df_data = _merge_on_card_id(df_data,[
    fea_path+'df_train_test_features_additional.csv',
    fea_path+'df_additional_features.csv',
])

# Every file found in the feature2 directory is merged. os.listdir returns
# entries in arbitrary order, so the names are sorted to keep the resulting
# column order (and therefore the feature order seen by the models) stable
# between runs and machines.
path = './datasets/feature2/'
sublist = sorted(os.listdir(path))
df_data = _merge_on_card_id(df_data,[path+sub for sub in sublist])

df_data = _merge_on_card_id(df_data,[
    './datasets/df_tsne_fea.csv',
    './datasets/df_cate_statics.csv',
])

df_data = _merge_on_card_id(df_data,[
    './datasets/cate_fm/df_hist_new_cate_merchantCate_fm_feat.csv',
    './datasets/cate_fm/df_hist_new_cate_merchant_fm_feat.csv',
])

# Merge order (vec, statics, city) is kept as in the original cell so that
# any pandas-generated column suffixes stay the same.
df_data = _merge_on_card_id(df_data,[
    fea_path+'df_card_merchant_vec.csv',
    fea_path+'df_card_merchant_statics.csv',
    fea_path+'df_card_city_statics.csv',
])

df_data = _merge_on_card_id(df_data,[
    fea_path+'df_nmf_card_merCate_features.csv',
    fea_path+'df_nmf_card_city_features.csv',
    fea_path+'df_card_merchant_features.csv',
])


In [None]:
# Split the merged table by the is_test flag. The training part is copied
# explicitly because a column is added to it right below; assigning onto the
# un-copied filter result triggers pandas' SettingWithCopy warning.
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1]
# Rows with target below -30 are flagged; the flag is used later to stratify
# the CV folds. `np.int` was removed in NumPy 1.24, the builtin `int` requests
# the same dtype.
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
# Identifier / bookkeeping columns that must never be used as features.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Candidate features: every remaining non-object column.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']

# NOTE(review): the selector is fitted on the full training set with the
# target, i.e. before the CV splits are made — confirm this is acceptable for
# the reported OOF scores.
fs = FeatureSelector(data = df_train[tr_features], labels = df_train['target'].values)

fs.identify_missing(missing_threshold = 0.12)
fs.identify_collinear(correlation_threshold = 0.98)
fs.identify_zero_importance(task = 'regression', eval_metric = 'rmse', n_iterations = 10, early_stopping = True)
fs.identify_low_importance(cumulative_importance = 0.99)
fs.identify_single_unique()

# Drop everything flagged by any of the methods above and keep the surviving
# column names as the feature list used by the level-1 models.
tr_removed = fs.remove(methods = 'all')
tr_features = list(tr_removed.columns)


In [None]:
# Number of features that survived the feature-selection cell.
print(len(tr_features))

In [None]:
# Containers for the level-1 out-of-fold predictions: one row per card, in the
# same row order as df_train / df_test; each base model adds one column.
df_oof_train = pd.DataFrame({'card_id': df_train.card_id.values})
df_oof_test = pd.DataFrame({'card_id': df_test.card_id.values})

def modelKFoldReg(df_train,df_test,model):
    """Fit ``model`` with 5-fold CV stratified on ``is_outlier`` (LightGBM API).

    Reads the notebook-level globals ``tr_features`` (feature column names)
    and ``label`` (target column list) at call time. This notebook defines
    ``modelKFoldReg`` again in later cells for other libraries; whichever cell
    ran last is the active definition.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the ``tr_features`` columns, the ``label`` column and
        ``is_outlier``.
    df_test : pandas.DataFrame
        Must contain the ``tr_features`` columns.
    model : estimator
        Its ``fit`` must accept ``eval_set`` / ``early_stopping_rounds`` /
        ``verbose``, its ``predict`` must accept ``num_iteration`` and it must
        expose ``best_iteration_`` after fitting.

    Returns
    -------
    model : the same estimator object, as fitted on the last fold.
    score : float, RMSE of the out-of-fold predictions over all training rows.
    oof_test_pred : ndarray (ntest,), test predictions averaged over the folds.
    oof_train_pred : ndarray (ntrain,), out-of-fold predictions.
    """
    NFOLDS = 5
    # random_state is deliberately not passed: with shuffle=False it never
    # influenced the split, and scikit-learn >= 0.24 raises ValueError when
    # both are given. The folds are therefore the same as before.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred = np.zeros((ntest,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    # Column selections do not change between folds, so do them once.
    x_all = df_train[tr_features]
    y_all = df_train[label]
    x_test = df_test[tr_features]

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        x_dev = x_all.iloc[dev_index]
        y_dev = y_all.iloc[dev_index]
        x_val = x_all.iloc[val_index]
        y_val = y_all.iloc[val_index]
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=50,verbose=100)
        # Both predictions use the early-stopped iteration count.
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test,num_iteration=model.best_iteration_)
        oof_train_pred[val_index] = model.predict(x_val,num_iteration=model.best_iteration_)
    oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - y_all.values.reshape(-1,)))/ntrain))
    return model,score,oof_test_pred,oof_train_pred
# Six LightGBM variants that differ in learning rate, seed and L2 penalty.
# Each one contributes an `oof_lgb_<i>_pred` column to the level-1 frames.
lr_rate = [0.01,0.02,0.015,0.014,0.01,0.01]
random_state = [20,40,60,80,90,100]
reg_list = [5,9,10,15,20,25]
for i,(lr,rs,reg) in enumerate(zip(lr_rate,random_state,reg_list)):
    print('........lgb%s......'%i)
    lgb_params = dict(
        objective='regression',
        metric='rmse',
        n_estimators=2000,
        learning_rate=lr,
        subsample=0.9855,
        max_depth=7,
        top_rate=0.906,
        num_leaves=63,
        min_child_weight=41.9612,
        other_rate=0.072,
        reg_alpha=9.677,
        colsample_bytree=0.566,
        min_split_gain=8.820,
        reg_lambda=reg,
        min_data_in_leaf=21,
        verbose=-1,
        seed=rs,
        bagging_seed=42,
        # Hardware-specific: trains on GPU platform 1 / device 1.
        device='gpu',
        gpu_platform_id=1,
        gpu_device_id=1,
    )

    lgb_est = lgb.LGBMRegressor(**lgb_params)
    lgb_est,score,lgb_test_pred,lgb_train_pred = modelKFoldReg(df_train,df_test,lgb_est)

    # Importances come from the estimator returned by modelKFoldReg, i.e. the
    # fit of the last fold.
    df_features = pd.DataFrame(
        {'features':tr_features,'importance':lgb_est.feature_importances_}
    ).sort_values(by=['importance'],ascending=False)

    # Prune zero-importance features in place: the shared `tr_features` list
    # shrinks, so every later model in the notebook trains on the reduced set.
    zero_importance = df_features.loc[df_features.importance==0,'features'].tolist()
    for col in zero_importance:
        tr_features.remove(col)

    df_oof_train['oof_lgb_%s_pred'%i] = lgb_train_pred
    df_oof_test['oof_lgb_%s_pred'%i] = lgb_test_pred
    print("score = %s"%score)
    del lgb_est
    gc.collect()

In [None]:
def modelKFoldReg(df_train,df_test,model):
    """Fit ``model`` with shuffled 5-fold CV (XGBoost sklearn API).

    Reads the notebook-level globals ``tr_features`` (feature column names)
    and ``label`` (target column list) at call time. This notebook defines
    ``modelKFoldReg`` in several cells; whichever cell ran last is the active
    definition.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the ``tr_features`` columns and the ``label`` column.
    df_test : pandas.DataFrame
        Must contain the ``tr_features`` columns.
    model : estimator
        Its ``fit`` must accept ``eval_set`` / ``early_stopping_rounds`` /
        ``verbose``, its ``predict`` must accept ``ntree_limit`` and it must
        expose ``best_ntree_limit`` after fitting.

    Returns
    -------
    model : the same estimator object, as fitted on the last fold.
    score : float, RMSE of the out-of-fold predictions over all training rows.
    oof_test_pred : ndarray (ntest,), test predictions averaged over the folds.
    oof_train_pred : ndarray (ntrain,), out-of-fold predictions.
    """
    NFOLDS = 5
    kfold = KFold(n_splits=NFOLDS,shuffle=True,random_state=43)

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred = np.zeros((ntest,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    # Column selections do not change between folds, so do them once.
    x_all = df_train[tr_features]
    y_all = df_train[label]
    x_test = df_test[tr_features]

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train)):
        x_dev = x_all.iloc[dev_index]
        y_dev = y_all.iloc[dev_index]
        x_val = x_all.iloc[val_index]
        y_val = y_all.iloc[val_index]
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=50,verbose=100)
        # Use the early-stopped tree count for BOTH predictions. Previously only
        # the test prediction was limited, so the OOF score could be computed
        # from a different number of trees than the test predictions it is
        # supposed to describe.
        best_ntree = model.best_ntree_limit
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test,ntree_limit=best_ntree)
        oof_train_pred[val_index] = model.predict(x_val,ntree_limit=best_ntree)
    oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - y_all.values.reshape(-1,)))/ntrain))
    return model,score,oof_test_pred,oof_train_pred

# Six XGBoost variants that differ in learning rate, seed and tree budget.
# Each one contributes an `oof_xgb_<i>_pred` column to the level-1 frames.
lr_rate = [0.001,0.002,0.005,0.01,0.02,0.05]
random_state = [20,40,60,80,90,100]
n_estimators = [5000,6000,7000,7500,5000,5000]

# Settings shared by every variant; the per-variant values are added in the loop.
xgb_fixed = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'max_depth': 7,
    'gamma' : 1.45,
    'alpha': 0.0,
    'lambda': 0.0,
    'subsample': 0.9,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.50,
}

for i,(lr,rs,n_est) in enumerate(zip(lr_rate,random_state,n_estimators)):
    print('........xgb%s......'%i)
    xgb_params = dict(xgb_fixed, learning_rate=lr, n_estimators=n_est, random_state=rs)
    xgb_est = xgb.XGBRegressor(**xgb_params)
    xgb_est,score,xgb_test_pred,xgb_train_pred = modelKFoldReg(df_train,df_test,xgb_est)

    oof_column = 'oof_xgb_%s_pred'%i
    df_oof_train[oof_column] = xgb_train_pred
    df_oof_test[oof_column] = xgb_test_pred
    print("score = %s"%score)
    del xgb_est
    gc.collect()

In [None]:
def modelKFoldReg(df_train,df_test,model):
    """Fit ``model`` with 5-fold CV stratified on ``is_outlier`` (CatBoost API).

    Reads the notebook-level globals ``tr_features`` (feature column names)
    and ``label`` (target column list) at call time. This notebook defines
    ``modelKFoldReg`` in several cells; whichever cell ran last is the active
    definition.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the ``tr_features`` columns, the ``label`` column and
        ``is_outlier``.
    df_test : pandas.DataFrame
        Must contain the ``tr_features`` columns.
    model : estimator
        Its ``fit`` must accept ``eval_set`` / ``use_best_model`` /
        ``early_stopping_rounds`` / ``verbose``.

    Returns
    -------
    model : the same estimator object, as fitted on the last fold.
    score : float, RMSE of the out-of-fold predictions over all training rows.
    oof_test_pred : ndarray (ntest,), test predictions averaged over the folds.
    oof_train_pred : ndarray (ntrain,), out-of-fold predictions.
    """
    NFOLDS = 5
    # random_state is deliberately not passed: with shuffle=False it never
    # influenced the split, and scikit-learn >= 0.24 raises ValueError when
    # both are given. The folds are therefore the same as before.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred = np.zeros((ntest,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    # Column selections do not change between folds, so do them once.
    x_all = df_train[tr_features]
    y_all = df_train[label]
    x_test = df_test[tr_features]

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        x_dev = x_all.iloc[dev_index]
        y_dev = y_all.iloc[dev_index]
        x_val = x_all.iloc[val_index]
        y_val = y_all.iloc[val_index]
        # use_best_model=True is requested at fit time; predict is then called
        # without an explicit iteration limit.
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],use_best_model=True,early_stopping_rounds=50,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test)
        oof_train_pred[val_index] = model.predict(x_val)
    oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
    score = np.sqrt((np.sum(np.square(oof_train_pred - y_all.values.reshape(-1,)))/ntrain))
    return model,score,oof_test_pred,oof_train_pred
# Six CatBoost variants that differ in learning rate, seed and L2 leaf penalty.
# Each one contributes an `oof_cat_<i>_pred` column to the level-1 frames.
lr_rate = [0.015,0.025,0.055,0.01,0.02,0.05]
random_state = [20,40,60,80,90,100]
reg_list = [20,25,30,35,40,45]

# Settings shared by every variant; the per-variant values are added in the loop.
cat_fixed = dict(
    n_estimators=5000,
    max_depth=8,
    loss_function='RMSE',
    eval_metric='RMSE',
    logging_level='Verbose',
    bagging_temperature=0.8,
    od_type='Iter',
    thread_count=16,
)

for i,(lr,rs,reg) in enumerate(zip(lr_rate,random_state,reg_list)):
    print('........cat%s......'%i)
    cat_params = dict(cat_fixed, learning_rate=lr, random_state=rs, l2_leaf_reg=reg)
    cat_est = catb.CatBoostRegressor(**cat_params)
    cat_est,score,cat_test_pred,cat_train_pred = modelKFoldReg(df_train,df_test,cat_est)

    oof_column = 'oof_cat_%s_pred'%i
    df_oof_train[oof_column] = cat_train_pred
    df_oof_test[oof_column] = cat_test_pred
    print("score = %s"%score)
    del cat_est
    gc.collect()

In [None]:
# Disabled: attaching the target to the OOF training frame.
# NOTE(review): presumably left off so the saved level-1 files hold only
# card_id plus prediction columns — confirm.
# df_oof_train['target'] = df_train['target'].values
# Sanity check: shapes of the level-1 frames (card_id + one column per base model).
df_oof_train.shape,df_oof_test.shape

In [None]:
# Preview the first rows of the test-side level-1 predictions.
df_oof_test.head()

In [None]:
# Persist the level-1 out-of-fold predictions.
stack_dir = './datasets/stacking/level1/'
# to_csv does not create missing directories.
os.makedirs(stack_dir, exist_ok=True)
# The train file was previously written with the extension '.csc', a typo for
# '.csv' (the test file next to it already used '.csv').
# NOTE(review): if any other notebook reads the misspelled path, update it.
df_oof_train.to_csv(stack_dir+'df_stack_oof_train.csv',index=False)
df_oof_test.to_csv(stack_dir+'df_stack_oof_test.csv',index=False)

In [None]:
# Level-2 stacker: a BayesianRidge blended over the level-1 OOF predictions,
# evaluated with shuffled 5-fold CV; the test prediction is the fold average.
NFOLDS = 5
kfold = KFold(n_splits=NFOLDS,shuffle=True,random_state=42)
label = ['target']
# From here on `tr_features` names the level-1 prediction columns.
tr_features = [f for f in df_oof_train.columns if f!='card_id' and f !='target']

# df_oof_train is created with only card_id and prediction columns, so
# indexing it with `label` raised KeyError on a fresh run. Take the target
# from the OOF frame when it is present, otherwise from df_train after
# checking that both frames are row-aligned on card_id.
if 'target' in df_oof_train.columns:
    y_train = df_oof_train['target'].values
else:
    if not np.array_equal(df_oof_train['card_id'].values, df_train['card_id'].values):
        raise ValueError('df_oof_train and df_train are not row-aligned on card_id')
    y_train = df_train['target'].values

x_train = df_oof_train[tr_features].values
x_test = df_oof_test[tr_features].values

ntrain = df_oof_train.shape[0]
ntest = df_oof_test.shape[0]
oof_train_pred = np.zeros((ntrain,))
oof_test_pred = np.zeros((ntest,))
oof_test_pred_skf = np.empty((NFOLDS, ntest))
for foldIndex,(dev_index,val_index) in enumerate(kfold.split(x_train)):
    # A fresh model per fold so no state carries over between folds.
    model = BayesianRidge()
    model.fit(x_train[dev_index],y_train[dev_index])
    oof_test_pred_skf[foldIndex,:] = model.predict(x_test).reshape(-1,)
    oof_train_pred[val_index] = model.predict(x_train[val_index]).reshape(-1,)

oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
# RMSE of the stacked out-of-fold predictions over all training rows.
score = np.sqrt((np.sum(np.square(oof_train_pred - y_train))/ntrain))

df_sub = pd.DataFrame(data=df_oof_test['card_id'].values,columns=['card_id'])
df_sub['target'] = oof_test_pred
# to_csv does not create missing directories.
os.makedirs('./submission', exist_ok=True)
# The CV score is embedded in the file name to tell submissions apart.
df_sub.to_csv('./submission/df_stacking_sub_%.4f.csv'%score,index=False)

print("score = %s"%score)