In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from dateutil.parser import parse
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential, load_model, save_model, Model
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf


In [2]:
random_state = 42

In [3]:
feature_set =  'feature_set_normalised'
X_train = pd.read_csv( F'./data/{feature_set}/X_train_full.csv')
y_train = pd.read_csv( F'./data/{feature_set}/y_train.csv').values.ravel()

X_val = pd.read_csv( F'./data/{feature_set}/X_valid_full.csv')
y_val = pd.read_csv( F'./data/{feature_set}/y_valid.csv').values.ravel()

In [4]:
# # outputing y value before over sampling.
counter = Counter(y_train)
print("Before Sampling: {}".format(counter))

# # Oversample using Adaptive Synthetic (ADASYN) algorithm.
# sm = ADASYN()
# # esample the dataset.
# xtrain_sm, ytrain_sm = sm.fit_resample(X_train, y_train)

# # outputing y value after over sampling.
# counter = Counter(ytrain_sm)
# print("After Sampling: {}".format(counter))

Before Sampling: Counter({0.0: 727216, 1.0: 19291})


In [5]:
pos = Counter(y_train).get(1)
neg = Counter(y_train).get(0)
total = neg+pos

In [6]:
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.51
Weight for class 1: 19.35


In [7]:
tf.random.set_seed(random_state)

# building keras model
def build_model():
    # Sequential groups a linear stack of layers
    model = Sequential([
        Dense(32, activation="relu", input_shape=(121,)),
        Dropout(0.1),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.2),
        Dense(1, activation="sigmoid")
    ])
    return model
model = build_model()

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=['accuracy', tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()]
)

In [8]:
history = model.fit(
    X_train, y_train, epochs=10, verbose=1, validation_split=0.1,class_weight=class_weight,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=10,
                                               verbose=1, restore_best_weights=True)]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
roc_auc_score(y_train,model.predict(X_train)[:, 0])



0.5

In [11]:
roc_auc_score(y_val,model.predict(X_val)[:, 0])



0.5

In [None]:
model.evaluate(X_val, y_val, verbose=1)



[0.22315308451652527,
 0.943131685256958,
 0.03479320928454399,
 0.044844675809144974]

In [None]:
model.evaluate(xtrain_sm, ytrain_sm, verbose=1)



[0.14457492530345917,
 0.9442444443702698,
 0.973075807094574,
 0.9141702055931091]

In [None]:
roc_auc_score(xtrain_sm,ytrain_sm)

ValueError: continuous-multioutput format is not supported

In [None]:
val_pred = model.predict(X_val)



In [None]:
roc_auc_score(y_val,val_pred)

0.5163811079247982

In [None]:
roc_auc_score(ytrain_sm,model.predict(xtrain_sm))



0.9870597612238718