In [89]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import xgboost
import numpy as np
import scipy.stats as stats
import math
from sklearn.compose import ColumnTransformer

### Functions

In [95]:
def clean_up(df):
    df = df.drop(columns=['Id'])
    df.columns = [x.strip() for x in df.columns]
    nulls = pd.DataFrame(df.isnull().sum(), columns=['nulls']).reset_index().rename(columns={'index': 'column'})
    nulls = nulls[nulls['nulls'] > 0]
    null_columns = nulls['column'].tolist()

    for x in null_columns:
        df.loc[df[x].isnull(), x] = df[x].mean()

    return df


def split_train(df):
    x = df['Class']
    y = df.drop(columns=['Class'])
    return x, y


def data_transform(trimmed_df):
    log_cols = [
        'DI',
        'EE'
    ]

    sqrt_cols = [
        'AF'
    ]

    boxcox_cols = [
        'AB',
        'BQ',
        'DE',
        'EB',
        'FE',
        'GB'
    ]

    yeo_cols = [
        'AM',
        'GF',
        'CF'
    ]

    trimmed_df[log_cols] = np.log1p(trimmed_df[log_cols])
    trimmed_df[sqrt_cols] = np.sqrt(trimmed_df[sqrt_cols])

    for col in boxcox_cols:
        if trimmed_df[col].value_counts().iloc[0] != len(trimmed_df[col]):
            trimmed_df[col], lmbda = stats.boxcox(trimmed_df[col])

    for col in yeo_cols:
        if trimmed_df[col].value_counts().iloc[0] != len(trimmed_df[col]):
            trimmed_df[col], lmbda = stats.yeojohnson(trimmed_df[col])

    return trimmed_df


def get_used_cols(clean_df):
    if 'Class' in clean_df.columns:
        used_cols = [
            'AF',
            'AB',
            'BQ',
            'DI',
            'FL',
            'AM',
            'CR',
            'FE',
            'DH',
            'DA',
            'BN',
            'CD',
            'BP',
            'DL',
            'EE',
            'GF',
            'DE',
            'BD',
            'CF',
            'AX',
            'FI',
            'EB',
            'GB',
            'CU',
            'EJ',
            'Class']
    else:
        used_cols = [
            'AF',
            'AB',
            'BQ',
            'DI',
            'FL',
            'AM',
            'CR',
            'FE',
            'DH',
            'DA',
            'BN',
            'CD',
            'BP',
            'DL',
            'EE',
            'GF',
            'DE',
            'BD',
            'CF',
            'AX',
            'FI',
            'EB',
            'GB',
            'CU',
            'EJ']

    trimmed_df = clean_df[used_cols]

    return trimmed_df


def prep_train(train_df):
    train_df = clean_up(train_df)
    train_df = get_used_cols(train_df)

    for col in train_df.columns:
        if col not in ['Id', 'EJ', 'Class']:
            z_scores = stats.zscore(train_df[col])
            threshold = 3  # Adjust the threshold as per your requirement
            outliers = np.abs(z_scores) > threshold
            train_df = train_df[~outliers]

    train_df = data_transform(train_df)

    return train_df


def prep_test(test_df):
    test_df = clean_up(test_df)
    test_df = get_used_cols(test_df)
    test_df = data_transform(test_df)

    return test_df

### Ingestion

In [166]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

In [167]:
train = prep_train(train)
test = prep_test(test)

### Transformation

In [168]:
train = pd.get_dummies(train, columns=['EJ'])
X = train.drop(columns=['Class'])
y = train[['Class']]

In [169]:
scaler = StandardScaler()
columns_no_ej = [x for x in X.columns if x not in ['EJ_A', 'EJ_B']]
columns_no_ej

X[columns_no_ej] = scaler.fit_transform(X[columns_no_ej])

In [170]:
X_arr = np.asarray(X)
y_arr = np.asarray(y)

In [171]:
from imblearn.over_sampling import SMOTE

In [172]:
BalancerSMOTE = SMOTE(random_state = 42)
X_bal, y_bal = BalancerSMOTE.fit_resample(X_arr, y_arr)

In [173]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, random_state = 42, shuffle=False, test_size=0.3)

In [174]:
#Function
def train_transform(train):
    train = prep_train(train)
    train = pd.get_dummies(train, columns=['EJ'])
    X = train.drop(columns=['Class'])
    y = train[['Class']]

    scaler = StandardScaler()
    columns_no_ej = [x for x in X.columns if x not in ['EJ_A', 'EJ_B']]
    columns_no_ej

    X[columns_no_ej] = scaler.fit_transform(X[columns_no_ej])

    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    BalancerSMOTE = SMOTE(random_state = 42)
    X_bal, y_bal = BalancerSMOTE.fit_resample(X_arr, y_arr)

    X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, random_state = 42, shuffle=False, test_size=0.3)

    return X_train, X_test, y_train, y_test, scaler

# Model

In [175]:
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error
import joblib

In [179]:
mc = ModelCheckpoint(f'../Models/ARC_Neural_Network.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=7,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True)

model = keras.Sequential([
    #keras.layers.InputLayer(78),
    keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(25, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    #keras.layers.Dense(10, activation='selu', kernel_initializer='lecun_normal'),
    #keras.layers.BatchNormalization(),
    keras.layers.Dense(3, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
])

In [180]:
optimizer = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [181]:
history = model.fit(
    X_train,
    y_train,
    batch_size=2000,
    epochs=5000,
    validation_data=[X_test, y_test],
    callbacks=[mc, early_stopping],
    shuffle=True,
    use_multiprocessing=True
)

Epoch 1/5000
Epoch 1: val_loss improved from inf to 0.73957, saving model to ../Models\ARC_Neural_Network.h5
Epoch 2/5000
Epoch 2: val_loss improved from 0.73957 to 0.73504, saving model to ../Models\ARC_Neural_Network.h5
Epoch 3/5000
Epoch 3: val_loss improved from 0.73504 to 0.73057, saving model to ../Models\ARC_Neural_Network.h5
Epoch 4/5000
Epoch 4: val_loss improved from 0.73057 to 0.72615, saving model to ../Models\ARC_Neural_Network.h5
Epoch 5/5000
Epoch 5: val_loss improved from 0.72615 to 0.72178, saving model to ../Models\ARC_Neural_Network.h5
Epoch 6/5000
Epoch 6: val_loss improved from 0.72178 to 0.71744, saving model to ../Models\ARC_Neural_Network.h5
Epoch 7/5000
Epoch 7: val_loss improved from 0.71744 to 0.71313, saving model to ../Models\ARC_Neural_Network.h5
Epoch 8/5000
Epoch 8: val_loss improved from 0.71313 to 0.70883, saving model to ../Models\ARC_Neural_Network.h5
Epoch 9/5000
Epoch 9: val_loss improved from 0.70883 to 0.70455, saving model to ../Models\ARC_Neura

# Validation

In [182]:
model = load_model('../Models/ARC_Neural_Network.h5')

In [183]:
predictions = model.predict(X_bal)



In [184]:
from sklearn.metrics import accuracy_score, log_loss

In [185]:
log_loss(y_bal, predictions)

0.033440213773038324

array([[0.64772487],
       [0.07259709],
       [0.46718428],
       [0.26168272],
       [0.9087073 ],
       [0.43605796],
       [0.11021875],
       [0.38849226],
       [0.33570352],
       [0.17482528],
       [0.90757143],
       [0.173222  ],
       [0.6419201 ],
       [0.16996188],
       [0.2529988 ],
       [0.16579981],
       [0.31226107],
       [0.3851592 ],
       [0.42193222],
       [0.106465  ],
       [0.08993846],
       [0.22208735],
       [0.6027017 ],
       [0.10591279],
       [0.14798681],
       [0.27806956],
       [0.7093658 ],
       [0.13827676],
       [0.9865981 ],
       [0.15566711],
       [0.16249189],
       [0.15264407],
       [0.57556444],
       [0.11564979],
       [0.39172843],
       [0.9319625 ],
       [0.09228712],
       [0.20536338],
       [0.09257989],
       [0.6885323 ],
       [0.10066949],
       [0.08616605],
       [0.46981955],
       [0.3684416 ],
       [0.3319858 ],
       [0.43365237],
       [0.11039667],
       [0.458

In [186]:
joblib.dump(scaler, '../Models/Scaler.h5')

['../Models/Scaler.h5']

In [84]:
ids = test[['Ids']]

KeyError: "None of [Index(['Ids'], dtype='object')] are in the [columns]"

In [88]:
model.predict(X_bal)



array([[0.64772487],
       [0.07259709],
       [0.46718428],
       [0.26168272],
       [0.9087073 ],
       [0.43605796],
       [0.11021875],
       [0.38849226],
       [0.33570352],
       [0.17482528],
       [0.90757143],
       [0.173222  ],
       [0.6419201 ],
       [0.16996188],
       [0.2529988 ],
       [0.16579981],
       [0.31226107],
       [0.3851592 ],
       [0.42193222],
       [0.106465  ],
       [0.08993846],
       [0.22208735],
       [0.6027017 ],
       [0.10591279],
       [0.14798681],
       [0.27806956],
       [0.7093658 ],
       [0.13827676],
       [0.9865981 ],
       [0.15566711],
       [0.16249189],
       [0.15264407],
       [0.57556444],
       [0.11564979],
       [0.39172843],
       [0.9319625 ],
       [0.09228712],
       [0.20536338],
       [0.09257989],
       [0.6885323 ],
       [0.10066949],
       [0.08616605],
       [0.46981955],
       [0.3684416 ],
       [0.3319858 ],
       [0.43365237],
       [0.11039667],
       [0.458

In [202]:
output = pd.DataFrame(predictions, columns=['class_1'])
output['class_0'] = 1-output['class_1']

In [203]:
output

Unnamed: 0,class_1,class_0
0,0.984657,0.015343
1,0.031510,0.968490
2,0.031805,0.968195
3,0.030841,0.969159
4,0.984114,0.015886
...,...,...
789,0.986072,0.013928
790,0.988224,0.011776
791,0.982946,0.017054
792,0.986846,0.013154


In [204]:
pd.merge(X, y, left_index=True, right_index=True)

Unnamed: 0,AF,AB,BQ,DI,FL,AM,CR,FE,DH,DA,...,BD,CF,AX,FI,EB,GB,CU,EJ_A,EJ_B,Class
0,0.065079,-0.959761,0.955569,-0.916492,0.691621,0.204464,-3.166089,0.429328,-0.831401,0.989511,...,-0.719886,-0.688688,-2.588895,-2.550514,-0.133199,-1.128985,-0.091060,0,1,1
1,-1.599695,-1.693387,-1.306626,-0.320354,-0.777574,0.837280,1.840596,-0.071049,-0.045764,1.086831,...,0.250855,-0.426218,-0.893044,0.156785,-1.397087,-1.545807,0.023960,1,0,0
2,-0.235954,0.606220,1.407821,-0.091423,0.776453,0.677333,-0.152624,0.293653,-1.562856,1.088042,...,-0.004980,-0.572777,0.900152,0.663440,0.069282,1.813755,-0.700666,0,1,0
3,0.476256,-0.593210,-1.521334,0.333462,0.449124,1.620987,-0.459521,0.750566,-0.750128,-0.248026,...,-0.689361,-1.371796,-0.862304,1.952106,-0.095955,-0.010441,-1.298771,0,1,0
4,0.428323,0.203467,0.931770,-0.658686,0.867904,-0.454474,-0.187002,1.351158,-1.589947,1.272225,...,0.415065,2.259911,-0.713725,1.478494,-0.110203,-0.299783,-1.080233,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,0.716308,1.004387,0.454542,1.598094,-0.777574,1.017479,-3.166089,1.194588,1.850600,-0.382668,...,0.408415,2.519347,0.674722,-2.550514,1.951503,-0.181081,0.230997,1,0,0
611,-0.254448,-1.315522,-0.637487,0.140350,-0.777574,-1.568162,0.112375,-1.483187,-0.330219,-0.192871,...,1.143805,0.266793,-0.662490,-0.283784,-0.282741,-0.228800,1.582483,1,0,0
612,0.077883,-1.634595,-0.807600,0.991171,-0.777574,-1.071394,-0.162651,1.435850,0.766964,-1.696058,...,-0.697857,-0.705027,-0.969895,-0.034862,0.142840,-1.618400,-0.574144,1,0,0
613,1.299493,0.463267,2.019258,1.227491,1.294767,1.100703,0.137084,1.540026,0.685691,-0.439035,...,0.362177,-0.310287,0.464661,0.377070,0.461536,1.721621,1.668748,0,1,0
