In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix , classification_report

import matplotlib.pyplot as plt


# import data
dataset = pd.read_csv('D:\\cleanedData.csv')
print("Dataset Loaded ...")
print(dataset.shape)


#Split dataset into training and test 
train, test = train_test_split(dataset, test_size=0.2, shuffle=True)

xtrain = train.iloc[:, 0:-1].values
ytrain = train.iloc[:, -1].values
xtest  = test.iloc[:, 0:-1].values
ytest  = test.iloc[:, -1].values

##Data Scalling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
xtrain = sc.fit_transform(xtrain)
xtest  = sc.transform(xtest)

print("Training and Testing data are Scaled ..")


Dataset Loaded ...
(1013, 13)
Training and Testing data are Scaled ..


In [16]:
from sklearn.linear_model import LogisticRegression


## Logistic Regression 
LogReg = LogisticRegression()
LogReg.fit(xtrain, ytrain)
y_pred=LogReg.predict(xtest)
print('Logistic Regression:')
print(confusion_matrix(ytest, y_pred))
print(classification_report(ytest, y_pred))
# [[223  10]
# [ 33  46]]
print('------------------------------------')

## Weighted logistic Regression 
w = {0:0.20, 1:0.80}
#lg2 = LogisticRegression(random_state=13, class_weight=w)
lg2 = LogisticRegression(class_weight=w)
lg2.fit(xtrain, ytrain)
y_pred = lg2.predict(xtest)

print('Weighted Logistic Regression:')
print(confusion_matrix(ytest, y_pred))
print(classification_report(ytest, y_pred))
print('------------------------------------')


lg3 = LogisticRegression( C=3.787878787878788, multi_class= 'ovr', penalty= 'l1', random_state= 3, solver= 'liblinear')
lg3.fit(xtrain, ytrain)
y_pred = lg3.predict(xtest)

print('best Logistic Regression:')
print(confusion_matrix(ytest, y_pred))
print(classification_report(ytest, y_pred))

print('------------------------------------')


Logistic Regression:
[[153   6]
 [ 14  30]]
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       159
           1       0.83      0.68      0.75        44

    accuracy                           0.90       203
   macro avg       0.87      0.82      0.84       203
weighted avg       0.90      0.90      0.90       203

------------------------------------
Weighted Logistic Regression:
[[130  29]
 [  5  39]]
              precision    recall  f1-score   support

           0       0.96      0.82      0.88       159
           1       0.57      0.89      0.70        44

    accuracy                           0.83       203
   macro avg       0.77      0.85      0.79       203
weighted avg       0.88      0.83      0.84       203

------------------------------------
best Logistic Regression:
[[151   8]
 [ 13  31]]
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       159
           1     



In [11]:
## Decision trees and Random Forests

from sklearn.tree import DecisionTreeClassifier

      
## Create gini Decision tree
DecTree1 = DecisionTreeClassifier(criterion = "gini", random_state = 100,max_depth=10, min_samples_leaf=8)
DecTree1.fit(xtrain, ytrain)
y_pred1 = DecTree1.predict(xtest)
print("Gini Decision Tree: ")
print(confusion_matrix(ytest, y_pred1))
print(classification_report(ytest, y_pred1))
print('------------------------------------')

        
## Decision tree with entropy

## Best DecTree parameters 
##DecTree2 = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = 10, min_samples_leaf = 8)
## [[217  16]
## [ 31  48]]
DecTree2 = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = 8, min_samples_leaf = 8)
DecTree2.fit(xtrain, ytrain)
y_pred2 = DecTree2.predict(xtest)
print("Entropy Decision Tree: ")
print(confusion_matrix(ytest, y_pred2))
print(classification_report(ytest, y_pred2))
print('------------------------------------')


## Random Forest classifier 
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=30, max_depth=7, max_features='log2', random_state=3, n_jobs=2)
#forest = RandomForestClassifier(criterion='gini', n_estimators=20, random_state=4, n_jobs=8)
#[[227  12]
# [ 42  31]]
forest.fit(xtrain, ytrain)
y_pred3 = forest.predict(xtest)
print("Random Forest: ")
print(confusion_matrix(ytest, y_pred3))
print(classification_report(ytest, y_pred3))
print('------------------------------------')


Gini Decision Tree: 
[[144   9]
 [ 27  23]]
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       153
           1       0.72      0.46      0.56        50

    accuracy                           0.82       203
   macro avg       0.78      0.70      0.72       203
weighted avg       0.81      0.82      0.81       203

------------------------------------
Entropy Decision Tree: 
[[141  12]
 [ 25  25]]
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       153
           1       0.68      0.50      0.57        50

    accuracy                           0.82       203
   macro avg       0.76      0.71      0.73       203
weighted avg       0.81      0.82      0.81       203

------------------------------------
Random Forest: 
[[148   5]
 [ 28  22]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       153
           1       0.81      0.44

In [14]:
## Discriminative Analysis 

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


DiscModel = LinearDiscriminantAnalysis(solver='lsqr', shrinkage= 0.0)
DiscModel.fit(xtrain, ytrain)
y_pred = DiscModel.predict(xtest)
print("Discriminative Analysis: ")
print(confusion_matrix(ytest, y_pred))
print(classification_report(ytest, y_pred))
print('------------------------------------')

#[[225  14]
# [ 36  37]]


## The following is for tuning the model more 

# grid search solver for lda
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, n_redundant=0, random_state=1)
# define model
model = LinearDiscriminantAnalysis()
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)


# grid search shrinkage for lda
from numpy import arange
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, n_redundant=0, random_state=1)
# define model
model = LinearDiscriminantAnalysis(solver='lsqr')
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['shrinkage'] = arange(0, 1, 0.01)
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)


Discriminative Analysis: 
[[147   6]
 [ 21  29]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       153
           1       0.83      0.58      0.68        50

    accuracy                           0.87       203
   macro avg       0.85      0.77      0.80       203
weighted avg       0.86      0.87      0.86       203

------------------------------------
Mean Accuracy: 0.893
Config: {'solver': 'svd'}
Mean Accuracy: 0.893
Config: {'shrinkage': 0.0}


In [13]:
## KNN


from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier(n_neighbors=8, p=1, weights='distance', algorithm='auto')
knn.fit(xtrain, ytrain)
y_pred = knn.predict(xtest)
print("K-nearest neighbour  ")
print(confusion_matrix(ytest, y_pred2))
print(classification_report(ytest, y_pred))
print('------------------------------------')


## For later ------------------------------------------------------------------------------------------
# Create a pipeline
#pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
#param_grid = [{
#    'kneighborsclassifier__n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
#    'kneighborsclassifier__p': [1, 2],
#    'kneighborsclassifier__weights': ['uniform', 'distance'],
#    'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#}]

# Create a grid search instance
#gs = GridSearchCV(pipeline, param_grid = param_grid,
#                  scoring='accuracy',
#                  refit=True,
#                  cv=10,
#                  verbose=1,
#                  n_jobs=2)
#
# Fit the most optimal model
#gs.fit(X_train, y_train)
# Print the best model parameters and scores
#print('Best Score: %.3f' % gs.best_score_, '\nBest Parameters: ', gs.best_params_)
#print('Score: %.3f' % gs.score(X_test, y_test))


K-nearest neighbour  
[[141  12]
 [ 25  25]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       153
           1       0.79      0.38      0.51        50

    accuracy                           0.82       203
   macro avg       0.81      0.67      0.70       203
weighted avg       0.82      0.82      0.80       203

------------------------------------


In [1]:
## Neural Networks
from tensorflow import keras
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

NNmodel = Sequential()
#saved as model 6 - best one so far 
#NNmodel.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu', input_dim = 13)) 
#NNmodel.add(Dense(units = 8, kernel_initializer = 'uniform', activation = 'relu'))           

NNmodel.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu', input_dim = 16)) 
NNmodel.add(Dense(units = 9, kernel_initializer = 'uniform', activation = 'relu'))           
#NNmodel.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu'))           


NNmodel.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

opt = keras.optimizers.Adam(lr=0.01)
NNmodel.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])
print("NN created ..")

NNmodel.fit(xtrain, ytrain, batch_size = 10, epochs = 70)
print("End of training ...")

y_NNresults = NNmodel.predict(xtest)
y_NNresults = (y_NNresults > 0.5)

from sklearn.metrics import confusion_matrix , classification_report
print(confusion_matrix(ytest, y_NNresults))
print(classification_report(ytest, y_NNresults))
print("End of testing ...")

ModuleNotFoundError: No module named 'tensorflow'

In [47]:
import os
cwd = os.getcwd()
os.chdir('D:\\SML\\')

         
# serialize model to JSON
model_json = NNmodel.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
NNmodel.save_weights("model.h5")
print("Saved model to disk")
 


Saved model to disk


In [None]:
# later...
 
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))