In [1]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook, tnrange
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge,LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import xgboost as xgb
import catboost as catb
import os
# Root folder of the CSV inputs loaded through DATA_PATH in the cells below.
DATA_PATH = './datasets/'
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")
%matplotlib inline

In [2]:
# Base table (it carries the is_test flag used later to split train / test);
# first_active_month is forced to text. The builtin `str` replaces the
# `np.str` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left joins on card_id keep every row of df_data.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Merge every feature file found in the feature2 folder.
path = DATA_PATH+'feature2/'
# os.listdir returns entries in arbitrary order; sorting makes the merge order
# (and therefore the resulting column order) reproducible. Hidden entries and
# sub-directories (e.g. .ipynb_checkpoints) are skipped so read_csv only ever
# receives regular files.
sublist = sorted(
    name for name in os.listdir(path)
    if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
)
for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

In [3]:
# Category statistics, left-joined on card_id. The path is built from
# DATA_PATH (same './datasets/' root) so the data location is configured in
# one place, consistent with the other loading cells.
df_cate_statics = pd.read_csv(DATA_PATH+'df_cate_statics.csv')
df_data = df_data.merge(df_cate_statics,on='card_id',how='left')

In [4]:
# Feature files from the cate_fm folder, left-joined on card_id. Paths are
# built from DATA_PATH (same './datasets/' root) instead of repeating the
# literal, consistent with the other loading cells.
df_cate_merchantCate_fm = pd.read_csv(DATA_PATH+'cate_fm/df_hist_new_cate_merchantCate_fm_feat.csv')
df_cate_merchant_fm = pd.read_csv(DATA_PATH+'cate_fm/df_hist_new_cate_merchant_fm_feat.csv')

df_data = df_data.merge(df_cate_merchantCate_fm,on='card_id',how='left')
df_data = df_data.merge(df_cate_merchant_fm,on='card_id',how='left')

In [5]:
# Card-level merchant / city tables, listed in the order they are joined.
df_card_merchant_features = pd.read_csv(DATA_PATH+'df_card_merchant_features.csv')
df_card_merchant_vec = pd.read_csv(DATA_PATH+'df_card_merchant_vec.csv')
df_card_merchant_statics = pd.read_csv(DATA_PATH+'df_card_merchant_statics.csv')
df_card_city_statics = pd.read_csv(DATA_PATH+'df_card_city_statics.csv')

# Left-join each table onto df_data by card_id; the join order is significant
# because pandas suffixes clashing column names according to it.
for card_level_table in (
    df_card_merchant_features,
    df_card_merchant_vec,
    df_card_merchant_statics,
    df_card_city_statics,
):
    df_data = df_data.merge(card_level_table,on='card_id',how='left')

In [6]:
# Split the combined table on the is_test flag. Explicit copies make df_train
# and df_test independent frames, so the column assignments made here and in
# later cells are well-defined instead of chained assignments on a filtered
# result (the resulting warning was only hidden by the global warning filter).
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1].copy()
label = ['target']
# Rows with target below -30 are flagged as outliers; the flag is used to
# stratify the CV folds. The builtin `int` replaces the `np.int` alias, which
# was deprecated in NumPy 1.20 and removed in 1.24.
df_train['is_outlier'] = (df_train.target<-30).astype(int)
# Identifier, label and bookkeeping columns that must not be used as inputs.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Candidate features: every remaining column that is not object-typed.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']

In [7]:
from feature_selector import FeatureSelector

# Feature pruning with FeatureSelector over the candidate features, using the
# regression target as labels.
# NOTE(review): the selection sees the target of every training row before the
# cross-validation below, so the CV scores may be optimistic — confirm this is
# acceptable.
fs = FeatureSelector(data = df_train[tr_features], labels = df_train['target'].values)

# Flag features with more than 12% missing values.
fs.identify_missing(missing_threshold = 0.12)
# Flag features whose correlation magnitude with another feature exceeds 0.98.
fs.identify_collinear(correlation_threshold = 0.98)
# Trains a gradient boosting model (10 iterations requested, early stopping on)
# and uses its importances; presumably flags the zero-importance features.
fs.identify_zero_importance(task = 'regression', eval_metric = 'rmse', n_iterations = 10, early_stopping = True)
# Presumably flags the features not needed to reach 99% cumulative importance;
# relies on the importances computed by the previous call — TODO confirm.
fs.identify_low_importance(cumulative_importance = 0.99)
# Presumably flags columns holding a single unique value — TODO confirm.
fs.identify_single_unique()

# Apply all identification methods at once and get back the reduced data.
tr_removed = fs.remove(methods = 'all')

# Overwrites the notebook-level feature list, so re-running this cell starts
# from the already-reduced list rather than from the original candidates.
tr_features = list(tr_removed.columns)

# Displayed as the cell result: number of features kept.
len(tr_features)

14 features with greater than 0.12 missing values.

609 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[497]	valid_0's rmse: 3.67051	valid_0's l2: 13.4727
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[544]	valid_0's rmse: 3.61328	valid_0's l2: 13.0558
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[726]	valid_0's rmse: 3.55964	valid_0's l2: 12.6711
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[528]	valid_0's rmse: 3.40321	valid_0's l2: 11.5818
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[327]	valid_0's rmse: 3.57161	valid_0's l2: 12.7564
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[573

1167

In [None]:
%%time

def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified cross-validation for a regressor and collect predictions.

    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are taken from the notebook-level names ``tr_features``
    and ``label``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the ``tr_features`` columns, the ``label``
        column(s) and ``is_outlier``.
    df_test : pandas.DataFrame
        Rows to predict; must contain the ``tr_features`` columns.
    model : estimator
        Its ``fit`` must accept ``eval_set``, ``use_best_model``,
        ``early_stopping_rounds`` and ``verbose`` (a CatBoost regressor in
        this notebook).

    Returns
    -------
    model
        The estimator that was passed in. The same object is fit once per
        fold, so it reflects the last fit that was run.
    score : float
        RMSE of the out-of-fold predictions against the target over all
        training rows.
    oof_test_pred : numpy.ndarray
        Test predictions averaged over the folds.
    oof_train_pred : numpy.ndarray
        Out-of-fold prediction for every training row.
    """
    NFOLDS = 5
    # shuffle=False yields deterministic folds, so no seed is passed: the old
    # random_state=20 was ignored by older scikit-learn and raises ValueError
    # on scikit-learn >= 0.24 when shuffle is False.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test feature matrix is the same for every fold; slice it once.
    x_test = df_test[tr_features]
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        x_dev = df_train[tr_features].iloc[dev_index]
        y_dev = df_train[label].iloc[dev_index]
        x_val = df_train[tr_features].iloc[val_index]
        y_val = df_train[label].iloc[val_index]
        # Both the development and the validation split are passed as eval sets.
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],use_best_model=True,early_stopping_rounds=100,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test)
        oof_train_pred[val_index] = model.predict(x_val)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    y_true = df_train[label].values.reshape(-1,)
    score = np.sqrt(np.sum(np.square(oof_train_pred - y_true))/ntrain)
    return model,score,oof_test_pred,oof_train_pred
    
# CatBoost hyper-parameters. n_estimators is only an upper bound here:
# modelKFoldReg fits with early_stopping_rounds=100.
cat_params = {
    'n_estimators':5000,
    'learning_rate':0.01,
    'max_depth':8,
    'loss_function':'RMSE',
    'eval_metric':'RMSE',
    'logging_level':'Verbose',
    'random_state':40,
    'bagging_temperature':0.8,
    'l2_leaf_reg':45,
    'od_type':'Iter',
    'thread_count':16
}

# Create the output folder before the long cross-validation run, so a missing
# directory cannot make the final write fail and lose the result.
os.makedirs('./submission', exist_ok=True)

cat_est = catb.CatBoostRegressor(**cat_params)
cat_est,score,oof_test_pred,oof_train_pred = modelKFoldReg(df_train,df_test,cat_est)
# The fold-averaged test prediction becomes the submitted target.
df_test['target'] = oof_test_pred

# The CV score is embedded in the file name. index=False (instead of
# index=None) matches the other to_csv calls in this notebook.
df_sub = df_test[['card_id','target']]
df_sub.to_csv('./submission/df_cat_sub_%.5f.csv'%score,index=False)

0:	learn: 3.8663258	test: 3.8663258	test1: 3.8737911	best: 3.8737911 (0)	total: 207ms	remaining: 17m 13s
100:	learn: 3.7121363	test: 3.7121363	test1: 3.7311519	best: 3.7311519 (100)	total: 20.4s	remaining: 16m 29s
200:	learn: 3.6603960	test: 3.6603960	test1: 3.6945555	best: 3.6945555 (200)	total: 39.5s	remaining: 15m 43s
300:	learn: 3.6361812	test: 3.6361812	test1: 3.6815714	best: 3.6815712 (299)	total: 57s	remaining: 14m 50s
400:	learn: 3.6232404	test: 3.6232404	test1: 3.6759651	best: 3.6759651 (400)	total: 1m 13s	remaining: 14m 7s
500:	learn: 3.6144788	test: 3.6144788	test1: 3.6728279	best: 3.6728279 (500)	total: 1m 29s	remaining: 13m 27s
600:	learn: 3.6083687	test: 3.6083687	test1: 3.6710524	best: 3.6710510 (598)	total: 1m 45s	remaining: 12m 52s
700:	learn: 3.6030915	test: 3.6030915	test1: 3.6698198	best: 3.6698198 (700)	total: 2m 1s	remaining: 12m 22s
800:	learn: 3.5975839	test: 3.5975839	test1: 3.6687100	best: 3.6687008 (798)	total: 2m 16s	remaining: 11m 55s
900:	learn: 3.5929877	

In [None]:
# Show the cross-validation RMSE currently held in `score`.
print("score = %s" % (score,))

In [10]:
# Save the CatBoost predictions (out-of-fold for train, fold-averaged for
# test) as level-1 stacking inputs; the CV score is part of the column and
# file names.
pred_col = 'oof_cat_pred_%.5f'%score
stack_dir = './datasets/stacking/level1/'
# Make sure the target folder exists so the write cannot fail on a fresh checkout.
os.makedirs(stack_dir, exist_ok=True)

# Build throw-away two-column frames instead of adding the prediction column
# to df_train / df_test and dropping it again in place: same files on disk,
# no temporary mutation of the shared frames.
df_train[['card_id']].assign(**{pred_col: oof_train_pred}).to_csv(stack_dir+'df_cat_train_pred_%.5f.csv'%score,index=False)
df_test[['card_id']].assign(**{pred_col: oof_test_pred}).to_csv(stack_dir+'df_cat_test_pred_%.5f.csv'%score,index=False)

In [11]:
# Importances reported by cat_est. modelKFoldReg fits the same estimator on
# every fold, so these come from the last fit that was run.
fea_importance = cat_est.feature_importances_
# One row per feature, most important first.
df_features = (
    pd.DataFrame({'features': tr_features, 'importance': fea_importance})
    .sort_values(by='importance', ascending=False)
)

In [13]:
%%time

def modelKFoldReg(df_train,df_test,model):
    """Run 5-fold stratified cross-validation for a regressor and collect predictions.

    This definition replaces any earlier ``modelKFoldReg`` in the notebook; it
    fits with ``early_stopping_rounds=50`` and does not pass ``use_best_model``.
    Folds are stratified on ``df_train['is_outlier']``. The feature columns and
    the target column are taken from the notebook-level names ``tr_features``
    and ``label``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the ``tr_features`` columns, the ``label``
        column(s) and ``is_outlier``.
    df_test : pandas.DataFrame
        Rows to predict; must contain the ``tr_features`` columns.
    model : estimator
        Its ``fit`` must accept ``eval_set``, ``early_stopping_rounds`` and
        ``verbose`` (an XGBoost regressor in this notebook).

    Returns
    -------
    model
        The estimator that was passed in. The same object is fit once per
        fold, so it reflects the last fit that was run.
    score : float
        RMSE of the out-of-fold predictions against the target over all
        training rows.
    oof_test_pred : numpy.ndarray
        Test predictions averaged over the folds.
    oof_train_pred : numpy.ndarray
        Out-of-fold prediction for every training row.
    """
    NFOLDS = 5
    # shuffle=False yields deterministic folds, so no seed is passed: the old
    # random_state=2018 was ignored by older scikit-learn and raises ValueError
    # on scikit-learn >= 0.24 when shuffle is False.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)

    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))
    # The test feature matrix is the same for every fold; slice it once.
    x_test = df_test[tr_features]
    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        x_dev = df_train[tr_features].iloc[dev_index]
        y_dev = df_train[label].iloc[dev_index]
        x_val = df_train[tr_features].iloc[val_index]
        y_val = df_train[label].iloc[val_index]
        # Both the development and the validation split are passed as eval sets.
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=50,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test)
        oof_train_pred[val_index] = model.predict(x_val)
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    y_true = df_train[label].values.reshape(-1,)
    score = np.sqrt(np.sum(np.square(oof_train_pred - y_true))/ntrain)
    return model,score,oof_test_pred,oof_train_pred
    
# XGBoost hyper-parameters. n_estimators is only an upper bound here:
# modelKFoldReg fits with early_stopping_rounds=50.
# NOTE(review): newer XGBoost releases name the squared-error objective
# 'reg:squarederror' — confirm against the installed version before upgrading.
xgb_params = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'n_estimators':5000,
    'max_depth': 8,
    'gamma' : 1.45,
    'alpha': 0.1,
    'lambda': 0.3,
    'subsample': 0.9,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.50,
    'random_state': 2018
}

# Create the output folder before the long cross-validation run, so a missing
# directory cannot make the final write fail and lose the result.
os.makedirs('./submission', exist_ok=True)

xgb_est = xgb.XGBRegressor(**xgb_params)
xgb_est,score,oof_test_pred,oof_train_pred = modelKFoldReg(df_train,df_test,xgb_est)

# The fold-averaged test prediction becomes the submitted target.
df_test['target'] = oof_test_pred

# The CV score is embedded in the file name. index=False (instead of
# index=None) matches the other to_csv calls in this notebook.
df_sub = df_test[['card_id','target']]
df_sub.to_csv('./submission/df_xgb_sub_%.5f.csv'%score,index=False)

[0]	validation_0-rmse:3.94316	validation_1-rmse:3.95417
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 50 rounds.
[100]	validation_0-rmse:3.41669	validation_1-rmse:3.74607
[200]	validation_0-rmse:3.14076	validation_1-rmse:3.69413
[300]	validation_0-rmse:2.96198	validation_1-rmse:3.67671
[400]	validation_0-rmse:2.82163	validation_1-rmse:3.66721
[500]	validation_0-rmse:2.72397	validation_1-rmse:3.66279
[600]	validation_0-rmse:2.64287	validation_1-rmse:3.661
[700]	validation_0-rmse:2.57017	validation_1-rmse:3.65995
[800]	validation_0-rmse:2.50472	validation_1-rmse:3.65899
[900]	validation_0-rmse:2.44455	validation_1-rmse:3.65838
[1000]	validation_0-rmse:2.39148	validation_1-rmse:3.65796
[1100]	validation_0-rmse:2.33578	validation_1-rmse:3.65729
[1200]	validation_0-rmse:2.2879	validation_1-rmse:3.65727
Stopping. Best iteration:
[1247]	validation_0-rmse:2.26605	validation_1-rmse:3.65695

[0]

In [14]:
# Show the cross-validation RMSE currently held in `score`.
print("score = %s" % (score,))

score = 3.656710827331684


In [15]:
# Save the XGBoost predictions (out-of-fold for train, fold-averaged for
# test) as level-1 stacking inputs; the CV score is part of the column and
# file names.
pred_col = 'oof_xgb_pred_%.5f'%score
stack_dir = './datasets/stacking/level1/'
# Make sure the target folder exists so the write cannot fail on a fresh checkout.
os.makedirs(stack_dir, exist_ok=True)

# Build throw-away two-column frames instead of adding the prediction column
# to df_train / df_test and dropping it again in place: same files on disk,
# no temporary mutation of the shared frames.
df_train[['card_id']].assign(**{pred_col: oof_train_pred}).to_csv(stack_dir+'df_xgb_train_pred_%.5f.csv'%score,index=False)
df_test[['card_id']].assign(**{pred_col: oof_test_pred}).to_csv(stack_dir+'df_xgb_test_pred_%.5f.csv'%score,index=False)