In [None]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Dropout,BatchNormalization
from keras.optimizers import RMSprop 
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from tqdm import tqdm_notebook, tnrange
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge,LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import keras
import datetime
import gc
import os
# Root directory of the CSV inputs loaded in the following cell.
DATA_PATH = './datasets/'

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / SettingWithCopy warnings.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the base table. `first_active_month` is forced to string so pandas does
# not try to infer a numeric/date dtype for it.
# (`np.str` was only an alias of the builtin `str` and was removed in NumPy 1.24.)
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left-join the extra feature tables onto the base table, keyed by card_id.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Merge every feature file found in the feature2 directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the merge order (and therefore the resulting column order) reproducible.
# Non-file entries such as `.ipynb_checkpoints` are skipped because read_csv
# cannot open a directory.
path = './datasets/feature2/'
sublist = sorted(
    name for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
)
for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

# Replace missing values, then +/- infinity, with the sentinel -999.
# Reassignment is used instead of inplace=True so the cell's data lineage is explicit.
df_data = df_data.fillna(-999)
df_data = df_data.replace([np.inf,-1*np.inf],-999)

In [None]:
# Split the combined table on the is_test flag.
# Explicit copies are taken because new columns are assigned onto these frames
# (here and when the predictions are stored); writing to a filtered frame
# otherwise triggers pandas' SettingWithCopy warning, which the global
# warnings filter would silently hide.
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1].copy()
# Flag rows whose target is below -30 as outliers (1) versus regular rows (0).
# (`np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.)
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
# Identifier, label and bookkeeping columns that must not be fed to the model.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Keep every remaining column that is not of object dtype.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']
print(len(tr_features))

In [None]:
# Fit the min-max scaler on the combined (train + test) feature matrix so both
# splits are mapped with the same per-column range.
# NOTE(review): missing values were filled with -999 upstream, so that sentinel
# likely becomes the column minimum for affected features - confirm this is intended.
scaler = MinMaxScaler().fit(df_data[tr_features].values)

train_matrix = df_train[tr_features].values
test_matrix = df_test[tr_features].values

X_train = scaler.transform(train_matrix)
X_test = scaler.transform(test_matrix)
y_train = df_train[label].values

# The combined frame and the unscaled matrices are no longer needed; drop the
# references and trigger a collection.
del train_matrix, test_matrix
del df_data
gc.collect()

In [None]:
# Sanity check: display the shapes of the scaled train matrix, the targets and the test matrix.
X_train.shape,y_train.shape,X_test.shape

In [None]:
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler,EarlyStopping

# Mini-batch size passed to model.fit inside modelKFoldReg.
batch_size = 5120
# Maximum number of epochs per fold passed to model.fit; the early-stopping
# callback supplied to fit may end training sooner.
epochs = 100
def modelKFoldReg(X_train,y_train,X_test,model):
    """Cross-validate a Keras-style regressor with 5 folds and collect out-of-fold predictions.

    The model is reset to its initial weights at the start of every fold, so
    each fold's validation rows are scored by a model that has never been
    trained on them.

    Relies on the module-level names ``batch_size``, ``epochs`` and
    ``earlyStopping`` being defined before the function is called.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training features; rows are selected with integer index arrays.
    y_train : numpy.ndarray
        Training targets, one per row of ``X_train`` (flattened for scoring).
    X_test : numpy.ndarray
        Test features, predicted once per fold.
    model : object
        Compiled model exposing ``fit``, ``predict``, ``get_weights`` and
        ``set_weights`` (e.g. a ``keras`` ``Sequential``).

    Returns
    -------
    model : object
        The same instance that was passed in, holding the last fold's weights.
    score : float
        RMSE of the out-of-fold predictions against ``y_train``.
    oof_test_pred : numpy.ndarray
        Test predictions averaged over the folds, shape ``(n_test,)``.
    oof_train_pred : numpy.ndarray
        Out-of-fold predictions for every training row, shape ``(n_train,)``.
    """
    NFOLDS = 5
    # random_state is intentionally not passed: with shuffle=False it has no
    # effect, and scikit-learn >= 0.24 raises ValueError for that combination.
    kfold = KFold(n_splits=NFOLDS,shuffle=False)

    ntrain = X_train.shape[0]
    ntest = X_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred = np.zeros((ntest,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    # Snapshot of the untrained weights, restored before each fold. Without the
    # reset, the model entering fold k has already been fitted on fold k's
    # validation rows during earlier folds, which leaks the target into the
    # out-of-fold predictions and makes the CV score optimistic.
    # NOTE: only the model weights are reset; optimizer state is left as-is.
    initial_weights = model.get_weights()

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(X_train)):
        print("............第%s折..........."%(foldIndex+1))
        model.set_weights(initial_weights)

        x_dev = X_train[dev_index]
        y_dev = y_train[dev_index]
        x_val = X_train[val_index]
        y_val = y_train[val_index]
        model.fit(x_dev, y_dev,batch_size=batch_size,
                  epochs=epochs,
                  verbose=1,
                  validation_data=(x_val, y_val),
                  callbacks=[earlyStopping]
                 )
        # One row of test predictions per fold; averaged after the loop.
        oof_test_pred_skf[foldIndex,:] = model.predict(X_test).reshape(-1,)
        oof_train_pred[val_index] = model.predict(x_val).reshape(-1,)

    oof_test_pred[:] = oof_test_pred_skf.mean(axis=0)
    # Root-mean-squared error over all out-of-fold predictions.
    score = np.sqrt((np.sum(np.square(oof_train_pred - y_train.reshape(-1,)))/ntrain))
    return model,score,oof_test_pred,oof_train_pred

def nn_model(input_dim=None):
    """Build and compile the fully connected regression network.

    Architecture: seven ReLU Dense layers (512, 256, 128, 64, 32, 16, 10
    units), each followed by BatchNormalization; the first two blocks also
    apply Dropout(0.2). A single linear output unit follows. The model is
    compiled with MSE loss, the RMSprop optimizer and MSE as metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the module-level
        ``X_train`` array is used, which is the original behaviour.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: infer the width from the global matrix.
        input_dim = X_train.shape[1]

    # (units, dropout rate or None) for each hidden block, in order.
    hidden_blocks = [
        (512, 0.2),
        (256, 0.2),
        (128, None),
        (64, None),
        (32, None),
        (16, None),
        (10, None),
    ]

    model = Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_blocks):
        if position == 0:
            # Only the first layer declares the input shape.
            model.add(Dense(units, activation='relu', input_shape=(input_dim,)))
        else:
            model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        if dropout_rate is not None:
            model.add(Dropout(dropout_rate))

    # Single linear output unit for regression.
    model.add(Dense(1))
    model.compile(loss='mse',optimizer=RMSprop(),metrics=['mse'])

    return model

# Callback handed to model.fit inside modelKFoldReg: it monitors val_loss with
# patience=10 and verbose=1.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
)

# Build the network and run the K-fold routine; `model` ends up bound to the
# instance returned by modelKFoldReg.
model, score, oof_test_pred, oof_train_pred = modelKFoldReg(
    X_train, y_train, X_test, nn_model()
)

print("score = %s" % score)

In [None]:
# Persist the out-of-fold (train) and fold-averaged (test) predictions so they
# can be used as stacking inputs. The CV score is embedded in both the column
# name and the file names.
pred_col = 'oof_nn_pred_%.5f'%score
stacking_dir = './datasets/stacking/'

# Create the output directory up front: to_csv does not create missing
# directories, and failing here would discard the results of the training run.
if not os.path.isdir(stacking_dir):
    os.makedirs(stacking_dir)

df_train[pred_col] = oof_train_pred
df_test[pred_col] = oof_test_pred

df_train[['card_id',pred_col]].to_csv(stacking_dir+'df_nn_train_pred_%.5f.csv'%score,index=False)
df_test[['card_id',pred_col]].to_csv(stacking_dir+'df_nn_test_pred_%.5f.csv'%score,index=False)