In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the four competition CSV files in one pass: labelled training set,
# the "A" test set, the example submission and the unlabelled pool.
train_data, test_data, submission, data_nolabel = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/dataTrain.csv',
        'data/dataA.csv',
        'data/submit_example_A.csv',
        'data/dataNoLabel.csv',
    )
)

# Quick sanity check on the row/column counts of each frame.
print(f'train_data.shape = {train_data.shape}')
print(f'test_data.shape  = {test_data.shape}')
print(f'data_nolabel.shape  = {data_nolabel.shape}')

train_data.shape = (59872, 48)
test_data.shape  = (49858, 47)
data_nolabel.shape  = (39884, 47)


In [3]:
# Derived feature f47 = 10 * f1 + f2, added identically to the train and
# test frames so both expose the same columns.
for frame in (train_data, test_data):
    frame['f47'] = frame['f1'] * 10 + frame['f2']

In [4]:
# Give the test rows a placeholder label (-1) so both frames share a schema,
# then stack train on top of test into one combined frame.
test_data['label'] = -1
data = pd.concat(objs=[train_data, test_data], axis=0)

In [5]:
# Names of the model inputs: f1 .. f47.
features = [f'f{idx}' for idx in range(1, 48)]

# For every feature, map each distinct value seen in the combined train+test
# frame to an integer code. Codes are 1-based and follow first-appearance order.
feature_map = {
    feature: {
        value: code
        for code, value in enumerate(data[feature].unique(), start=1)
    }
    for feature in features
}

# Encode from the combined `data` frame (which this cell never modifies) rather
# than from train_data/test_data themselves. Mapping the frames onto themselves
# is not idempotent: running the cell a second time would push already-encoded
# codes through the raw-value dictionaries and silently corrupt the features.
n_train = len(train_data)
assert len(data) == n_train + len(test_data), \
    "`data` must be train_data stacked on top of test_data"

for feature in features:
    codes = data[feature].map(feature_map[feature]).to_numpy()
    train_data[feature] = codes[:n_train]
    test_data[feature] = codes[n_train:]

In [12]:
# Use the first 50000 labelled rows as the modelling set; the remaining rows
# of train_data are not used by this cell.
train = train_data.loc[:, features].iloc[:50000]
label = train_data['label'].iloc[:50000]

# The full test set, restricted to the same feature columns.
test = test_data.loc[:, features]

print("train_data.shape = ", train.shape)
print("train_label.shape = ", label.shape)
print("test_data.shape = ", test.shape)

train_data.shape =  (50000, 47)
train_label.shape =  (50000,)
test_data.shape =  (49858, 47)


In [7]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Flatten, Input, Embedding, BatchNormalization
import warnings
warnings.filterwarnings("ignore")

In [8]:
def create_model(features, feature_map, use_embeddings=False):
    """Build the (uncompiled) feed-forward binary classifier.

    One scalar ``Input(shape=(1,))`` is created per feature, the per-feature
    branches are concatenated, and the result goes through two
    Dense -> Dropout -> BatchNormalization stages into a single sigmoid unit.

    Parameters
    ----------
    features : list of str
        Feature names; the model has one input per entry, in this order.
    feature_map : dict
        Maps a feature name to its value -> code dictionary. Only read when
        ``use_embeddings`` is True, where its size determines the embedding
        vocabulary and width.
    use_embeddings : bool, default False
        False reproduces the original behaviour: the integer codes are fed
        to the dense stack unchanged. True routes every input through its own
        ``Embedding`` layer (the path that used to be commented out).

    Returns
    -------
    tensorflow.keras.Model
        Model taking ``len(features)`` inputs and producing one probability.
    """
    inputs = []
    branches = []
    for feature in features:
        inp = layers.Input(shape=(1,))
        if use_embeddings:
            num = len(feature_map[feature])
            # Embedding width: half the number of distinct values, capped at 10.
            embed_dim = int(min(np.ceil(num / 2), 10))
            # `num + 1` rows so that a code equal to `num` is a valid index
            # (assumes the codes in feature_map run from 1 to num).
            out = layers.Embedding(num + 1, embed_dim, name=feature)(inp)
            out = layers.Reshape(target_shape=(embed_dim,))(out)
        else:
            out = inp
        inputs.append(inp)
        branches.append(out)

    x = layers.Concatenate()(branches)
    x = layers.BatchNormalization()(x)

    # Two hidden stages, each followed by dropout and batch normalisation.
    for units in (500, 300):
        x = layers.Dense(units, activation="relu")(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BatchNormalization()(x)

    y = layers.Dense(1, activation="sigmoid")(x)

    return Model(inputs=inputs, outputs=y)

In [9]:
# Build one model instance up front from the feature list and its value maps.
model = create_model(features=features, feature_map=feature_map)

In [10]:
# Peek at the first five encoded test rows.
test.head(5)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47
0,2,1,1,1,1,1,2,1,1,2,...,1,1,1,1,1,1,8,44,49,2
1,1,1,1,1,1,1,2,1,1,2,...,1,1,1,1,1,5,13,90,293,1
2,1,1,2,1,1,1,2,1,1,2,...,1,1,1,1,1,1,1,57,72,1
3,1,1,1,1,1,1,2,1,1,2,...,1,1,1,1,1,1,1,11,4,1
4,2,1,3,1,1,1,2,1,1,2,...,1,1,1,1,1,1,1,35,66,2


In [13]:
# Stratified 10-fold cross-validation of the dense network.
#   oof_preds  - out-of-fold prediction for every training row
#   test_preds - test-set prediction averaged over the fold models
N_SPLITS = 10

oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

# The model has one Input per feature, so it is fed a list with one column per
# feature. Keep that list under its own name: rebinding `test` would turn the
# DataFrame into a list and make this cell fail when it is run a second time.
test_inputs = [test[col] for col in features]

skf = StratifiedKFold(n_splits=N_SPLITS)
for k, (train_index, valid_index) in enumerate(skf.split(train, label)):
    # Drop the previous fold's graph and state before building a new model.
    K.clear_session()

    x_train, x_valid = train.iloc[train_index, :], train.iloc[valid_index, :]
    y_train, y_valid = label.iloc[train_index], label.iloc[valid_index]

    x_train = [x_train[col] for col in features]
    x_valid = [x_valid[col] for col in features]

    model = create_model(features, feature_map)
    adam = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    # Name the metric explicitly so that the 'val_auc' key monitored by the
    # callbacks below does not depend on Keras' automatic metric naming.
    model.compile(loss='binary_crossentropy', optimizer=adam,
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    lr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.1,
                                     patience=3, min_lr=1e-6, mode='max', verbose=1)
    es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)
    model.fit(x_train, y_train,
              validation_data=(x_valid, y_valid),
              verbose=1,
              batch_size=2048,
              callbacks=[lr, es],
              epochs=50
             )

    valid_fold_preds = model.predict(x_valid).ravel()
    test_fold_preds = model.predict(test_inputs).ravel()

    oof_preds[valid_index] = valid_fold_preds
    # Average across folds. A plain running sum would leave values in
    # [0, N_SPLITS] instead of probabilities in [0, 1].
    test_preds += test_fold_preds / N_SPLITS

    print("KFold = %d, auc = %.4f" % (k, roc_auc_score(y_valid, valid_fold_preds)))

# Release the last fold's graph as well.
K.clear_session()
print("Overall AUC={}".format(roc_auc_score(label.values, oof_preds)))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 19: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 20/50
Epoch 21/50
Epoch 21: early stopping
KFold = 0, auc = 0.8821
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 16: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 17/50
Epoch 18/50
Epoch 18: early stopping
KFold = 1, auc = 0.8868
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 17: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 18/50
Epoch 19/50
Epoch 19: early stopping
KFold = 2, auc = 0.8833
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 29: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Restoring model weights from the end of the best epoch: 24.
Epoch 29: early stopping
KFold = 3, auc = 0.8840
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 23: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 24/50
Epoch 25/50
Epoch 25: early stopping
KFold = 4, auc = 0.8804
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 24: early stopping
KFold = 5, auc = 0.8787
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 23: early stopping
KFold = 6, auc = 0.8746
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 18: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Restoring model weights from the end of the best epoch: 13.
Epoch 18: early stopping
KFold = 7, auc = 0.8878
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50


Epoch 22: early stopping
KFold = 8, auc = 0.8888
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 16: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 17/50
Epoch 18/50
Epoch 18: early stopping
KFold = 9, auc = 0.8791
Overall AUC=0.8766931912903252


In [12]:
# Re-report the out-of-fold AUC from whatever `label` / `oof_preds` currently
# hold in the kernel.
# NOTE(review): this cell's execution count is lower than the training cell's
# and its captured output differs from the AUC printed there, so the saved
# output looks like it came from an earlier run - re-run top to bottom to confirm.
overall_auc = roc_auc_score(label.values, oof_preds)
print("Overall AUC={}".format(overall_auc))

Overall AUC=0.770485921874636
