In [1]:
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Dropout, Flatten, Dense

from sklearn.model_selection import KFold
import tensorflow as tf

In [2]:
# Remove rows with missing values and record the class distribution of
# the rows that remain.
kept_rows = []
cleaned_p_count = 0
cleaned_n_count = 0

# NOTE(review): the input path uses 'Dataset/' while the output paths below
# use './dataset/'. These only name the same directory on a case-insensitive
# filesystem -- confirm the intended directory name.
with open('Dataset/data.csv', 'r') as f:
    data = f.readlines()

for row in data:
    # A '?' anywhere in the row marks a missing attribute value
    if '?' in row:
        continue
    kept_rows.append(row)

    # The class label is the last comma-separated field. Inspect only that
    # field so a '+' or '-' inside another attribute cannot be miscounted.
    label = row.rstrip('\r\n').rsplit(',', 1)[-1].strip()
    if label == '+':
        cleaned_p_count += 1
    elif label == '-':
        cleaned_n_count += 1

# Join once at the end instead of growing a string row by row
cleaned_data = "".join(kept_rows)
print(cleaned_data)

with open('./dataset/crx_clean.data.txt', 'w') as f:
    f.write(cleaned_data)

with open('./dataset/crx_clean.names.txt', 'w') as f:
    f.write("Class Distribution\n")
    f.write("+ Classes: %d\n" % cleaned_p_count)
    f.write("- Classes: %d\n" % cleaned_n_count)

b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
a,58.67,4.46,u,g,q,h,3.04,t,t,06,f,g,00043,560,+
a,24.50,0.5,u,g,q,h,1.5,t,f,0,f,g,00280,824,+
b,27.83,1.54,u,g,w,v,3.75,t,t,05,t,g,00100,3,+
b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
b,32.08,4,u,g,m,v,2.5,t,f,0,t,g,00360,0,+
b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,00164,31285,+
a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,00080,1349,+
b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,00180,314,+
b,42.50,4.915,y,p,w,v,3.165,t,f,0,t,g,00052,1442,+
b,22.08,0.83,u,g,c,h,2.165,f,f,0,t,g,00128,0,+
b,29.92,1.835,u,g,c,h,4.335,t,f,0,f,g,00260,200,+
a,38.25,6,u,g,k,v,1,t,f,0,t,g,00000,0,+
b,48.08,6.04,u,g,k,v,0.04,f,f,0,f,g,00000,2690,+
a,45.83,10.5,u,g,q,v,5,t,t,07,t,g,00000,0,+
b,36.67,4.415,y,p,k,v,0.25,t,t,10,t,g,00320,0,+
b,28.25,0.875,u,g,m,v,0.96,t,t,03,t,g,00396,0,+
a,23.25,5.875,u,g,q,v,3.17,t,t,10,f,g,00120,245,+
b,21.83,0.25,u,g,d,h,0.665,t,f,0,t,g,00000,0,+
a,19.17,8.585,u,g,cc,h,0.75,t,t,07,f,g,00096,0,+
b,25.00,11.25,u,g,c,v,2.5,t,t,17,f,g,00200,1208,+
b,23

In [3]:
def one_hot_encode_category(credit_data, label_col=15):
	"""
	Splits 'category' (text-valued) columns into one-hot columns.

	args
		credit_data: Dataframe that has already been cleaned, so numeric
			columns are not stored as text because of '?' values
		label_col: label of the class column, which is never encoded
			(defaults to 15, the value that used to be hard-coded)
	returns
		credit_data: new Dataframe in which every encoded column is replaced
			by one indicator column per category value
	"""
	is_object = pd.api.types.is_object_dtype
	is_string = pd.api.types.is_string_dtype

	# Iterate over the real column labels. Using enumerate() positions as
	# labels only works when the labels happen to be exactly 0..n-1.
	cat_columns = []
	for col in credit_data.columns:
		if col == label_col:
			continue
		column = credit_data[col]
		# Accept both 'object' columns and dedicated string dtypes, so text
		# columns are found however pandas chose to store them.
		if is_object(column) or is_string(column):
			cat_columns.append(col)

	# get_dummies() one-hot encodes data
	credit_data = pd.get_dummies(credit_data, columns=cat_columns)

	return credit_data

In [4]:
seed = 100

def import_data(url):
	"""
	Load the cleaned credit data and prepare it for modelling.

	args
		url: path or url string of the CLEANED csv data (read without a
			header row)
	returns
		credit_data: Dataframe with the class attribute moved to the first
			column and then passed through one_hot_encode_category()
	"""
	raw_frame = pd.read_csv(url, sep=',', header=None)

	# Rotate the column order so the last column (class attribute) leads
	column_order = raw_frame.columns.tolist()
	class_first = column_order[-1:] + column_order[:-1]
	reordered = raw_frame[class_first]
	print("Reordered Dataset: \n", reordered.head())

	encoded = one_hot_encode_category(reordered)
	print("Dataset length: ", len(encoded))
	print("Dataset shape: ", encoded.shape)
	print("One-hot Dataset: \n", encoded.head())
	return encoded

In [5]:
# Read the cleaned data, separate features from labels, and build the
# train/test partitions as float arrays.

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


def _encode_labels(labels):
    """Map the class symbols '+' -> 1.0 and '-' -> 0.0 as a float array.

    Any other value is left in place, so the final float conversion raises
    instead of silently mislabelling the row.
    """
    numeric = np.where(labels == '+', 1, labels)
    numeric = np.where(numeric == '-', 0, numeric)
    return numeric.astype(np.float64)


# Building Phase
data = import_data("dataset/crx_clean.data.txt")

# import_data() moves the class attribute to column 0; the rest are features
X = data.values[:, 1:]
Y = data.values[:, 0]

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Float conversion. The np.float alias was removed from NumPy; np.float64 is
# the type it used to resolve to.
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

Y_train = _encode_labels(Y_train)
Y_test = _encode_labels(Y_test)

Reordered Dataset: 
   15 0      1      2  3  4  5  6     7  8  9   10 11 12   13   14
0  +  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  202    0
1  +  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g   43  560
2  +  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  280  824
3  +  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  100    3
4  +  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  120    0
Dataset length:  653
Dataset shape:  (653, 47)
One-hot Dataset: 
   15      1      2     7  10   13   14  0_a  0_b  3_l  ...  6_z  8_f  8_t  \
0  +  30.83  0.000  1.25   1  202    0    0    1    0  ...    0    0    1   
1  +  58.67  4.460  3.04   6   43  560    1    0    0  ...    0    0    1   
2  +  24.50  0.500  1.50   0  280  824    1    0    0  ...    0    0    1   
3  +  27.83  1.540  3.75   5  100    3    0    1    0  ...    0    0    1   
4  +  20.17  5.625  1.71   0  120    0    0    1    0  ...    0    0    1   

   9_f  9_t  11_f  11_t  12_g  12_p  12_s  
0    0    

In [6]:
# Recombine the train and test partitions: stack the 2-D feature matrices
# row-wise and join the 1-D label vectors end to end.
inputs = np.vstack((X_train, X_test))
targets = np.hstack((Y_train, Y_test))

In [7]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, so the ratio
    is (rounded true positives) / (rounded actual positives). K.epsilon()
    in the denominator avoids division by zero when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, so the ratio
    is (rounded true positives) / (rounded predicted positives). K.epsilon()
    in the denominator avoids division by zero when nothing is predicted
    positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    predicted_count = K.sum(predicted_mask)
    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so the result is defined when
    precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [12]:
def get_model(optimizerF, lossF, activationF):
    """Build and compile a dense 46-20-1 network.

    The first layer expects 46 input features. Both hidden layers use
    `activationF`; the single output unit uses a sigmoid. The model is
    compiled with `lossF`, `optimizerF` and the f1_m metric.
    """
    layers = [
        Dense(46, input_dim=46, activation=activationF),
        Dense(20, activation=activationF),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer=optimizerF, loss=lossF, metrics=[f1_m])
    return network

In [13]:
def evaluateModel(optimizerF, lossF, activationF, n_splits=5, epochs=50,
                  batch_size=10, random_state=None):
    """Cross-validate the current get_model() and print per-fold scores.

    Reads the notebook-level `inputs` and `targets` arrays. For every fold a
    fresh model is built with get_model(optimizerF, lossF, activationF),
    trained on the training indices and evaluated on the held-out indices.

    args
        optimizerF, lossF, activationF: forwarded unchanged to get_model()
        n_splits: number of cross-validation folds
        epochs, batch_size: forwarded to model.fit()
        random_state: seed for the fold shuffle; None (the default) uses the
            notebook-level `seed`, so every configuration is scored on the
            same folds and a re-run reproduces the same split
    returns
        None; the loss and F1 of each fold and their summary are printed
    """
    if random_state is None:
        random_state = seed

    print(f'************************* {optimizerF} --- {lossF} --- {activationF}****************************')
    # Per-fold score containers
    f1_per_fold = []
    loss_per_fold = []

    # Without a fixed random_state the shuffled folds differ on every call,
    # which makes scores of different configurations incomparable.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train, test in kfold.split(inputs, targets):
        # Fresh, untrained model for every fold
        model = get_model(optimizerF, lossF, activationF)
        model.fit(inputs[train], targets[train], batch_size=batch_size, epochs=epochs, verbose=0)

        # evaluate() returns [loss, metric]; the only compiled metric is f1_m
        scores = model.evaluate(inputs[test], targets[test], verbose=0)
        loss_per_fold.append(scores[0])
        f1_per_fold.append(scores[1])

    # == Provide average scores ==
    print('------------------------------------------------------------------------')
    print('Score per fold')
    for fold_no, (fold_loss, fold_f1) in enumerate(zip(loss_per_fold, f1_per_fold), start=1):
        print('------------------------------------------------------------------------')
        print(f'> Fold {fold_no} - Loss: {fold_loss} - F1 score: {fold_f1}')

    print('------------------------------------------------------------------------')
    print('F1 scores for all folds:')
    print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
    print(f'> Loss: {np.mean(loss_per_fold)}')
    print('------------------------------------------------------------------------')

In [14]:
# Candidate values for the three arguments passed to evaluateModel().
optimzers = [
    'adam',
    'SGD',
    'RMSprop',
    'Adadelta',
    'Adagrad',
    'Adamax',
    'Nadam',
    'Ftrl',
]
lossFunctions = [
    'binary_crossentropy',
    'mean_squared_error',
]
activationFunctions = [
    'relu',
    'tanh',
]

In [16]:
# Evaluate every optimizer / loss / activation combination, optimizer-major
for opti, los, act in (
    (optimizer_name, loss_name, activation_name)
    for optimizer_name in optimzers
    for loss_name in lossFunctions
    for activation_name in activationFunctions
):
    evaluateModel(opti, los, act)

************************* adam --- binary_crossentropy --- relu****************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 1.2762517929077148 - F1 score: 0.8335363268852234
------------------------------------------------------------------------
> Fold 2 - Loss: 1.1187858581542969 - F1 score: 0.8314062356948853
------------------------------------------------------------------------
> Fold 3 - Loss: 1.9337730407714844 - F1 score: 0.7446077466011047
------------------------------------------------------------------------
> Fold 4 - Loss: 0.8145887851715088 - F1 score: 0.753416121006012
------------------------------------------------------------------------
> Fold 5 - Loss: 5.829875469207764 - F1 score: 0.5926430821418762
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.7511219024658203

In [17]:
def get_model(optimizerF, lossF, activationF):
    """Build and compile a dense 46-10-1 network.

    The first layer expects 46 input features. Both hidden layers use
    `activationF`; the single output unit uses a sigmoid. The model is
    compiled with `lossF`, `optimizerF` and the f1_m metric.
    """
    layers = [
        Dense(46, input_dim=46, activation=activationF),
        Dense(10, activation=activationF),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer=optimizerF, loss=lossF, metrics=[f1_m])
    return network

In [18]:
# Evaluate every optimizer / loss / activation combination, optimizer-major
for opti, los, act in (
    (optimizer_name, loss_name, activation_name)
    for optimizer_name in optimzers
    for loss_name in lossFunctions
    for activation_name in activationFunctions
):
    evaluateModel(opti, los, act)

************************* adam --- binary_crossentropy --- relu****************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.7526336908340454 - F1 score: 0.8317810297012329
------------------------------------------------------------------------
> Fold 2 - Loss: 0.37800106406211853 - F1 score: 0.6457648277282715
------------------------------------------------------------------------
> Fold 3 - Loss: 1.48969566822052 - F1 score: 0.6592065095901489
------------------------------------------------------------------------
> Fold 4 - Loss: 0.46871066093444824 - F1 score: 0.8255302309989929
------------------------------------------------------------------------
> Fold 5 - Loss: 5.904057502746582 - F1 score: 0.7098664045333862
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.734429800510406

In [19]:
def get_model(optimizerF, lossF, activationF):
    """Build and compile a dense 46-20-10-1 network.

    The first layer expects 46 input features. All three hidden layers use
    `activationF`; the single output unit uses a sigmoid. The model is
    compiled with `lossF`, `optimizerF` and the f1_m metric.
    """
    layers = [
        Dense(46, input_dim=46, activation=activationF),
        Dense(20, activation=activationF),
        Dense(10, activation=activationF),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer=optimizerF, loss=lossF, metrics=[f1_m])
    return network

In [20]:
# Evaluate every optimizer / loss / activation combination, optimizer-major
for opti, los, act in (
    (optimizer_name, loss_name, activation_name)
    for optimizer_name in optimzers
    for loss_name in lossFunctions
    for activation_name in activationFunctions
):
    evaluateModel(opti, los, act)

************************* adam --- binary_crossentropy --- relu****************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.6937194466590881 - F1 score: 0.5532951354980469
------------------------------------------------------------------------
> Fold 2 - Loss: 0.7249612212181091 - F1 score: 0.751072883605957
------------------------------------------------------------------------
> Fold 3 - Loss: 0.5055830478668213 - F1 score: 0.8266666531562805
------------------------------------------------------------------------
> Fold 4 - Loss: 0.6486217975616455 - F1 score: 0.7351187467575073
------------------------------------------------------------------------
> Fold 5 - Loss: 0.3218729794025421 - F1 score: 0.6884210109710693
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.710914885997772

In [33]:
from keras.regularizers import l2,l1

def get_model(optimizerF, lossF, activationF):
    """Build and compile a dense 46-10-1 network with an L1-regularised layer.

    The first layer expects 46 input features. Both hidden layers use
    `activationF`, so the architecture matches the activation that
    evaluateModel() prints in its banner. The 10-unit layer applies
    l1(0.002) to its kernel and its bias. The single output unit uses a
    sigmoid. The model is compiled with `lossF`, `optimizerF` and the f1_m
    metric.
    """
    model = Sequential()
    model.add(Dense(46, input_dim=46, activation=activationF))
    model.add(Dense(10, activation=activationF,
                    kernel_regularizer=l1(0.002), bias_regularizer=l1(0.002)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=lossF, optimizer=optimizerF, metrics=[f1_m])
    return model
evaluateModel('Adamax','binary_crossentropy','relu')

************************* Adamax --- binary_crossentropy --- relu****************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.433227002620697 - F1 score: 0.8801926374435425
------------------------------------------------------------------------
> Fold 2 - Loss: 0.5351843237876892 - F1 score: 0.8434548377990723
------------------------------------------------------------------------
> Fold 3 - Loss: 0.6507638692855835 - F1 score: 0.5497072339057922
------------------------------------------------------------------------
> Fold 4 - Loss: 0.4321388006210327 - F1 score: 0.6746543645858765
------------------------------------------------------------------------
> Fold 5 - Loss: 0.3839220404624939 - F1 score: 0.8766530752182007
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.7649324297904