In [342]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.activations import relu, sigmoid
from keras import backend as K
from keras.utils.np_utils import to_categorical
from keras.regularizers import l2,l1

## View the Data

In [343]:
# Define the missing values and replace with NaN
missing_values = ["n.a.","NA","n/a", "na", "?"]
data = pd.read_csv('crx.data', na_values = missing_values, names=["A1", "A2","A3", "A4","A5", "A6","A7", "A8","A9", "A10","A11", "A12","A13", "A14","A15", "A16"])
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [344]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  A16     690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


## Data Cleaning

In [345]:
# A1, A4, A5, A6 and A7 fields are categorical have got NaN 
# They are replaced with the mode of the field
data['A1'].fillna(data['A1'].mode(), inplace=True)
data['A4'].fillna(data['A4'].mode(), inplace=True)
data['A5'].fillna(data['A5'].mode(), inplace=True)
data['A6'].fillna(data['A6'].mode(), inplace=True)
data['A7'].fillna(data['A7'].mode(), inplace=True)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [346]:
# A2 and A14 fields are nominal have got NaN  
# They are replaced with the mean of the field
data['A2'].fillna(float(data['A2'].mean()), inplace=True)
data['A14'].fillna(int(data['A14'].mean()), inplace=True)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [347]:
# Label encoding is applied to the data
# All the categorical fields are encoded to numerical values
for col in ["A1","A4","A5", "A6","A7","A8","A9","A10","A12","A13"]:
    data[col] = data[col].astype('category')
    data[col] = data[col].cat.codes
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,30.83,0.000,1,0,12,7,30,1,1,1,0,0,202.0,0,+
1,0,58.67,4.460,1,0,10,3,65,1,1,6,0,0,43.0,560,+
2,0,24.50,0.500,1,0,10,3,36,1,0,0,0,0,280.0,824,+
3,1,27.83,1.540,1,0,12,7,73,1,1,5,1,0,100.0,3,+
4,1,20.17,5.625,1,0,12,7,41,1,0,0,0,2,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.085,2,2,4,3,30,0,0,0,0,0,260.0,0,-
686,0,22.67,0.750,1,0,1,7,46,0,1,2,1,0,200.0,394,-
687,0,25.25,13.500,2,2,5,2,46,0,1,1,1,0,200.0,1,-
688,1,17.92,0.205,1,0,0,7,1,0,0,0,0,0,280.0,750,-


In [348]:
# The class output is converted to 1 and 0
data['A16'].replace(to_replace='+', value=1, inplace=True)
data['A16'].replace(to_replace='-', value=0, inplace=True)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,30.83,0.000,1,0,12,7,30,1,1,1,0,0,202.0,0,1
1,0,58.67,4.460,1,0,10,3,65,1,1,6,0,0,43.0,560,1
2,0,24.50,0.500,1,0,10,3,36,1,0,0,0,0,280.0,824,1
3,1,27.83,1.540,1,0,12,7,73,1,1,5,1,0,100.0,3,1
4,1,20.17,5.625,1,0,12,7,41,1,0,0,0,2,120.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.085,2,2,4,3,30,0,0,0,0,0,260.0,0,0
686,0,22.67,0.750,1,0,1,7,46,0,1,2,1,0,200.0,394,0
687,0,25.25,13.500,2,2,5,2,46,0,1,1,1,0,200.0,1,0
688,1,17.92,0.205,1,0,0,7,1,0,0,0,0,0,280.0,750,0


## Split the Data

In [349]:
Y = data['A16']
data = data.drop('A16', axis=1)
X = data

# Scale X values
scaler = StandardScaler()
X = scaler.fit_transform(X)
X


array([[ 0.66649628, -0.0623209 , -0.95661321, ..., -0.31717105,
         0.10454595, -0.19541334],
       [-1.29881326,  2.28810134, -0.06005053, ..., -0.31717105,
        -0.81968741, -0.08785188],
       [-1.29881326, -0.59673802, -0.8561017 , ..., -0.31717105,
         0.55794344, -0.03714433],
       ...,
       [-1.29881326, -0.53341846,  1.7571976 , ..., -0.31717105,
         0.09292037, -0.19522126],
       [ 0.66649628, -1.15226167, -0.91540349, ..., -0.31717105,
         0.55794344, -0.05135781],
       [ 0.66649628,  0.28973588, -0.27816051, ..., -0.31717105,
        -1.06963731, -0.19541334]])

## Create Neural Network

In [350]:
# functions to calculate F1
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [351]:
# Calculate the average F1 value for 5 fold cross validation
kfold = StratifiedKFold(n_splits=5, shuffle=True)
cvscores = []

for train, test in kfold.split(X, Y):
  # create model
  model = Sequential()
  model.add(Dense(15, input_dim=15, activation='relu'))
  model.add(Dense(6, activation='relu', kernel_regularizer=l1(0.05)))
  model.add(Dense(1, activation='softmax'))
	
  # Compile model
  model.compile(loss='mean_squared_error', optimizer='adam', metrics=[f1_m])
	
  # Fit the model
  X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
  model.fit(X_train, Y_train, epochs=50, batch_size=10, verbose=0)
	
  # evaluate the model
  scores = model.evaluate(X[test], Y[test], verbose=0)
  print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
  cvscores.append(scores[1])

print("Average F1 measure is ", (np.mean(cvscores)))

f1_m: 47.08%
f1_m: 47.67%
f1_m: 48.47%
f1_m: 47.56%
f1_m: 46.26%
Average F1 measure is  0.47410001158714293
