In [1]:
from tqdm import tqdm_notebook, tnrange
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from h2o.automl import H2OAutoML
from feature_selector import FeatureSelector
from h2o.estimators.xgboost import H2OXGBoostEstimator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import os
# Directory prefix of the input CSV files read by the cells below.
DATA_PATH = './datasets/'
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings raised by later cells - confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Base table plus two additional feature tables, all keyed by card_id.
# `str` replaces the `np.str` alias, which is the same builtin type but was
# removed from NumPy 1.24 onwards.
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left joins keep every row of df_data even when a feature table lacks the card.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Merge every feature file found in the feature2 directory.
path = './datasets/feature2/'
# os.listdir returns entries in arbitrary, filesystem-dependent order; sorting
# makes the merge order (and therefore the resulting column order) reproducible.
sublist = sorted(os.listdir(path))
for sub in sublist:
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which cannot be
    # parsed as CSV and would abort the whole load.
    if not os.path.isfile(path+sub):
        continue
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

In [3]:
# Card-level merchant statistics, merchant vectors and city statistics.
df_card_merchant_statics = pd.read_csv(DATA_PATH + 'df_card_merchant_statics.csv')
df_card_merchant_vec = pd.read_csv(DATA_PATH + 'df_card_merchant_vec.csv')
df_card_city_statics = pd.read_csv(DATA_PATH + 'df_card_city_statics.csv')

# Left-join the three tables onto df_data by card_id, in the order
# vec -> merchant statics -> city statics.
df_data = (
    df_data
    .merge(df_card_merchant_vec, on='card_id', how='left')
    .merge(df_card_merchant_statics, on='card_id', how='left')
    .merge(df_card_city_statics, on='card_id', how='left')
)

In [4]:
# Encode missing values first, then +/- infinity, with the sentinel -999.
df_data = df_data.fillna(-999).replace([np.inf, -np.inf], -999)

In [5]:
# Split the combined table on the is_test flag. The explicit .copy() makes the
# two frames independent of df_data, so adding columns to them below is a plain
# assignment rather than a write to a filtered slice.
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1].copy()
# Flag rows whose target is below -30; modelKFoldReg stratifies its folds on
# this column. `int` replaces the `np.int` alias removed in NumPy 1.24.
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
# Identifier, date, flag and target columns that must never be used as features.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Candidate features: every remaining column that is not of object dtype.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']
print(len(tr_features))

1505


In [None]:
import lightgbm as lgb

# Run the FeatureSelector checks on the candidate features, using the raw
# target values as labels.
fs = FeatureSelector(
    data=df_train[tr_features],
    labels=df_train['target'].values,
)

# NOTE(review): df_data was filled with -999 in an earlier cell, so this
# missing-value check presumably never flags anything (the recorded output
# reports 0 features) - confirm whether it should run before the fill.
fs.identify_missing(missing_threshold=0.12)
fs.identify_collinear(correlation_threshold=0.98)
# The low-importance check is kept directly after the zero-importance run,
# in the same order as before.
fs.identify_zero_importance(
    task='regression',
    eval_metric='rmse',
    n_iterations=10,
    early_stopping=True,
)
fs.identify_low_importance(cumulative_importance=0.99)
fs.identify_single_unique()

# Apply the removals from all methods and keep the surviving column names as
# the new feature list.
tr_removed = fs.remove(methods='all')

tr_features = tr_removed.columns.tolist()

len(tr_features)

0 features with greater than 0.12 missing values.

464 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[518]	valid_0's rmse: 3.74088	valid_0's l2: 13.9942
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[510]	valid_0's rmse: 3.7556	valid_0's l2: 14.1045
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[387]	valid_0's rmse: 3.75902	valid_0's l2: 14.1302
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[376]	valid_0's rmse: 3.63568	valid_0's l2: 13.2182
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[433]	valid_0's rmse: 3.75562	valid_0's l2: 14.1047
Training until validation scores don't improve for 100 rounds.


In [None]:
%%time

def modelKFoldReg(df_train,df_test,model,features=None,target=None,cv=None):
    """Fit ``model`` fold by fold and collect out-of-fold (OOF) predictions.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain the feature columns, the target column(s)
        and an ``is_outlier`` column, which is passed to the splitter as the
        stratification label.
    df_test : pandas.DataFrame
        Rows to predict; must contain the feature columns.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is re-fitted on every fold.
    features : list of str, optional
        Feature columns. Defaults to the notebook-level ``tr_features``.
    target : str or list of str, optional
        Target column(s). Defaults to the notebook-level ``label``.
    cv : object, optional
        Splitter exposing ``split(X, y)`` that yields ``(dev_index, val_index)``
        positional index arrays. Defaults to an unshuffled 5-fold
        ``StratifiedKFold``.

    Returns
    -------
    score : float
        RMSE between the OOF predictions and the target over all training rows.
    oof_test_pred : numpy.ndarray
        Test predictions averaged over the folds, shape ``(len(df_test),)``.
    oof_train_pred : numpy.ndarray
        OOF predictions for the training rows, shape ``(len(df_train),)``.
        Rows never placed in a validation fold keep the initial value 0.
    """
    if features is None:
        features = tr_features
    if target is None:
        target = label
    if cv is None:
        # The previous call also passed random_state=2018, which has no effect
        # without shuffling and makes recent scikit-learn releases raise a
        # ValueError. The folds stay unshuffled, so the split is unchanged.
        cv = StratifiedKFold(n_splits=5,shuffle=False)

    # Build the matrices once instead of re-slicing the wide frame per fold.
    x_train = df_train[features].values
    # Flatten to 1-D: estimators expect y of shape (n_samples,), not (n, 1).
    y_train = df_train[target].values.reshape(-1,)
    x_test = df_test[features].values

    ntrain = df_train.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    fold_test_preds = []
    for foldIndex,(dev_index,val_index) in enumerate(cv.split(df_train,df_train['is_outlier'])):
        # Progress message: "fold <n>".
        print("............第%s折..........."%(foldIndex+1))
        model.fit(x_train[dev_index], y_train[dev_index])
        fold_test_preds.append(model.predict(x_test).reshape(-1,))
        oof_train_pred[val_index] = model.predict(x_train[val_index]).reshape(-1,)
    # Average the per-fold test predictions.
    oof_test_pred = np.mean(fold_test_preds,axis=0)

    score = np.sqrt(np.sum(np.square(oof_train_pred - y_train))/ntrain)
    return score,oof_test_pred,oof_train_pred

# Hyper-parameters of the ExtraTrees regressor.
# The split criterion is deliberately not set: squared error is the default in
# every scikit-learn release, while the explicit spelling 'mse' is rejected by
# scikit-learn >= 1.2 (renamed to 'squared_error').
extr_params = {
    'n_estimators':5000,
    'max_depth':9,
    'min_samples_split':10,
    'min_samples_leaf':10,
    # oob_score requires bootstrap sampling to be enabled.
    'bootstrap':True,
    'oob_score':True,
    'verbose':1,
    'n_jobs':20,
    'random_state':34
}
model = ExtraTreesRegressor(**extr_params)
# Cross-validated fit: returns the OOF RMSE plus test and train predictions.
score,oof_test_pred,oof_train_pred = modelKFoldReg(df_train,df_test,model)

In [None]:
# Report the score returned by modelKFoldReg.
print("score = %s" % (score,))

In [None]:
# Persist the predictions, keyed by card_id, with the score embedded in both
# the column name and the file names.
pred_col = 'oof_extr_pred_%.5f'%score
out_dir = './datasets/stacking/level1/'
# Create the output directory up front so the result of the long training run
# is not lost to a missing folder.
os.makedirs(out_dir,exist_ok=True)

# Build the two output frames directly instead of adding a temporary column to
# df_train / df_test and dropping it again; the written CSVs are identical and
# the source frames are never mutated.
df_train[['card_id']].assign(**{pred_col:oof_train_pred}).to_csv(out_dir+'df_extr_train_pred_%.5f.csv'%score,index=False)
df_test[['card_id']].assign(**{pred_col:oof_test_pred}).to_csv(out_dir+'df_extr_test_pred_%.5f.csv'%score,index=False)