In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from dateutil.parser import parse
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential, load_model, save_model, Model
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf


In [2]:
random_state = 42

In [3]:
feature_set =  'feature_set_2_normalised'
X_train = pd.read_csv( F'./data/{feature_set}/X_train_full.csv')
y_train = pd.read_csv( F'./data/{feature_set}/y_train.csv').values.ravel()

X_val = pd.read_csv( F'./data/{feature_set}/X_valid_full.csv')
y_val = pd.read_csv( F'./data/{feature_set}/y_valid.csv').values.ravel()

In [4]:
# # outputing y value before over sampling.
counter = Counter(y_train)
print("Before Sampling: {}".format(counter))

# # Oversample using Adaptive Synthetic (ADASYN) algorithm.
# sm = ADASYN()
# # esample the dataset.
# xtrain_sm, ytrain_sm = sm.fit_resample(X_train, y_train)

# # outputing y value after over sampling.
# counter = Counter(ytrain_sm)
# print("After Sampling: {}".format(counter))

Before Sampling: Counter({0.0: 727158, 1.0: 19349})


In [5]:
pos = Counter(y_train).get(1)
neg = Counter(y_train).get(0)
total = neg+pos

In [6]:
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.51
Weight for class 1: 19.29


In [7]:
tf.random.set_seed(random_state)

# building keras model
def build_model():
    # Sequential groups a linear stack of layers
    model = Sequential([
        Dense(128, activation="relu", input_shape=(122,)),
        Dropout(0.1),
        Dense(512, activation="relu"),
        Dropout(0.2),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        # Dropout(0.2),
        Dense(1, activation="sigmoid")
    ])
    return model
model = build_model()

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy', tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()]
)

In [8]:
# X_train['TX_AMOUNT'] = np.asarray(X_train['TX_AMOUNT']).astype(np.float32)
# X_train['TRANSACTION_GOODS_AND_SERVICES_AMOUNT'] = np.asarray(X_train['TRANSACTION_GOODS_AND_SERVICES_AMOUNT']).astype(np.float32)
# X_train['TRANSACTION_CASHBACK_AMOUNT'] = np.asarray(X_train['TRANSACTION_CASHBACK_AMOUNT']).astype(np.float32)
# X_train['x_terminal_id'] = np.asarray(X_train['x_terminal_id']).astype(np.float32)
# X_train['y_terminal__id'] = np.asarray(X_train['y_terminal__id']).astype(np.float32)
# X_train['x_customer_id'] = np.asarray(X_train['x_customer_id']).astype(np.float32)
# X_train['y_customer_id'] = np.asarray(X_train['y_customer_id']).astype(np.float32)

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               15744     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               66048     
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               65664     
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 6

In [10]:
history = model.fit(
    X_train, y_train, epochs=100, verbose=1, validation_split=0.1,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=10,
                                               verbose=1, restore_best_weights=True)]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

In [None]:
train_pred = model.predict(X_train)



In [None]:
roc_auc_score(y_train,train_pred)

0.6452864137441614

In [None]:
roc_auc_score(y_val,model.predict(X_val))



0.5853634204633672