In [1000]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [1001]:
#Splits the data into 3 parts: train,test,validation 
def split_dataset_to_train_test_validation(df_copy, train_sze, test_sze, validation_sze):
    """Split a dataframe row-wise into train, test and validation subsets.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Frame holding all columns (features and target together). Every
        returned subset keeps all of these columns.
    train_sze : float or int
        Forwarded to ``train_test_split`` as ``train_size`` for the first cut.
    test_sze, validation_sze : float
        Relative sizes of the test and validation parts. The rows left over
        after the first cut are divided in the ratio test_sze : validation_sze.

    Returns
    -------
    tuple
        ``(train, test, validation)`` dataframes.
    """
    # Kept so the function's effect on the global stdlib RNG is unchanged for
    # callers; the reproducibility of the split comes from random_state below.
    random.seed(0)

    # First cut: train rows versus everything else.
    train_part, remainder = train_test_split(df_copy, train_size=train_sze, random_state=50)

    # Second cut: divide the remainder between test and validation.
    test_share = test_sze / (test_sze + validation_sze)
    test_part, validation_part = train_test_split(remainder, train_size=test_share, random_state=50)

    return train_part, test_part, validation_part

#Normalize the data
def normalize(dataframe, normalize_columns):
    """Min-max scale the selected columns into the range [0, 1].

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    normalize_columns : list
        Labels of the columns to rescale. All other columns are copied as-is.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` in which each selected column ``x`` is replaced
        by ``(x - min) / (max - min)``. A constant column (max == min) is
        mapped to 0.0 instead of producing NaN from a 0/0 division.
    """
    result = dataframe.copy()
    for feature_name in dataframe[normalize_columns].columns:
        column = dataframe[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        # A zero range would give 0/0 = NaN for every row; dividing by 1
        # instead yields 0.0 while keeping genuine NaN entries as NaN.
        divisor = value_range if value_range != 0 else 1
        result[feature_name] = (column - min_value) / divisor
    return result

#Standardize the data
def standardize(dataframe, normalize_columns):
    """Rescale the selected columns to zero mean and unit standard deviation.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    normalize_columns : list
        Labels of the columns to standardize. All other columns are copied
        as-is.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` in which each selected column ``x`` is replaced
        by ``(x - mean) / std``, using the column's own ``mean()`` and
        ``std()``. A column whose standard deviation is zero is mapped to 0.0
        instead of producing NaN from a 0/0 division.
    """
    result = dataframe.copy()
    for feature_name in dataframe[normalize_columns].columns:
        column = dataframe[feature_name]
        mean_value = column.mean()
        std_value = column.std()
        # Zero spread means every deviation is 0; divide by 1 so the result is
        # 0.0 rather than NaN.
        divisor = std_value if std_value != 0 else 1
        result[feature_name] = (column - mean_value) / divisor
    return result

In [1002]:
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Load the dataset; the module is imported as `pd`, so `pandas.read_csv` would raise NameError.
df = pd.read_csv('BankNoteAuthentication.csv')
print('Numbers of NaN\'s in Dataframe: ' + str(df.isnull().sum().sum()))

# dropna() returns a new frame, so the result has to be assigned to take effect.
df = df.dropna()
df.describe().transpose()

Numbers of NaN's in Dataframe: 0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
variance,1372.0,0.433735,2.842763,-7.0421,-1.773,0.49618,2.821475,6.8248
skewness,1372.0,1.922353,5.869047,-13.7731,-1.7082,2.31965,6.814625,12.9516
curtosis,1372.0,1.397627,4.31003,-5.2861,-1.574975,0.61663,3.17925,17.9274
entropy,1372.0,-1.191657,2.101013,-8.5482,-2.41345,-0.58665,0.39481,2.4495
class,1372.0,0.444606,0.497103,0.0,0.0,0.0,1.0,1.0


In [1003]:
# Label column; every other column is used as a model input.
target_column = ['class']

# Keep the dataframe's own column order. Building the list through set() gave an
# order that could change between kernel restarts, which made the feature order
# (and anything depending on it) non-reproducible.
predictors = [column for column in df.columns if column not in target_column]

# NOTE(review): min/max are taken over the whole dataset before it is split, so
# test/validation rows influence the scaling - consider scaling after the split.
df = normalize(df, predictors) #Normalize the columns except the class column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
variance,1372.0,0.539114,0.205003,0.0,0.379977,0.543617,0.711304,1.0
skewness,1372.0,0.587301,0.219611,0.0,0.451451,0.602168,0.770363,1.0
curtosis,1372.0,0.287924,0.185669,0.0,0.159869,0.25428,0.364674,1.0
entropy,1372.0,0.668917,0.191041,0.0,0.557821,0.723929,0.813171,1.0
class,1372.0,0.444606,0.497103,0.0,0.0,0.0,1.0,1.0


In [1009]:
# Correlation matrix over every column of df (the four features and the class label),
# computed with DataFrame.corr() at its default settings.
print('Correlation between feautures: ')
corr = df.corr()

# Display the matrix as a styled table with a blue gradient background.
corr.style.background_gradient(cmap='Blues')

Correlation between feautures: 


Unnamed: 0,variance,skewness,curtosis,entropy,class
variance,1.0,0.264026,-0.38085,0.276817,-0.724843
skewness,0.264026,1.0,-0.786895,-0.526321,-0.444688
curtosis,-0.38085,-0.786895,1.0,0.318841,0.155883
entropy,0.276817,-0.526321,0.318841,1.0,-0.023424
class,-0.724843,-0.444688,0.155883,-0.023424,1.0


In [1005]:
# Split the dataframe into train / test / validation subsets (sizes 0.75 / 0.15 / 0.1).
train_set, test_set, validation_set = split_dataset_to_train_test_validation(df, 0.75, 0.15, 0.1)

# For each subset: predictor columns as X, the 'class' column (as a one-column frame) as Y.
X_train, Y_train = train_set[predictors], train_set[['class']]
X_test, Y_test = test_set[predictors], test_set[['class']]
X_validation, Y_validation = validation_set[predictors], validation_set[['class']]

In [1006]:
# Fit a cross-validated logistic regression (10 candidate C values, 10 folds).
# - random_state pins the data shuffling done by the stochastic 'saga' solver, so
#   re-running the notebook yields the same model.
# - Y_train is a one-column DataFrame; it is flattened to the 1-D target the
#   estimator expects.
# NOTE(review): max_iter=50 may be too low for 'saga' to converge, and warnings are
# silenced earlier in the notebook, so a convergence warning would go unnoticed - confirm.
lr = LogisticRegressionCV(Cs=10, cv=10, solver='saga', max_iter=50, random_state=50).fit(X_train, Y_train.values.ravel())

In [1007]:
# Predict class labels with the fitted model for each of the three subsets.
predict_train, predict_test, predict_validation = (
    lr.predict(features) for features in (X_train, X_test, X_validation)
)

In [1008]:
# Print the confusion matrix and classification report for each subset.
# confusion_matrix / classification_report come from sklearn.metrics, imported in
# the notebook's import cell. A blank line separates consecutive subsets.
evaluation_sets = [
    ('Train', Y_train, predict_train),
    ('Test', Y_test, predict_test),
    ('Validation', Y_validation, predict_validation),
]
for position, (split_name, actual, predicted) in enumerate(evaluation_sets):
    if position > 0:
        print()
    print('Confusion Matrix of ' + split_name + ' Set:')
    print(confusion_matrix(actual, predicted))
    print('Classification Report of ' + split_name + ' Set:')
    print(classification_report(actual, predicted))

Confusion Matrix of Train Set:
[[556   8]
 [  4 461]]
Classification Report of Train Set:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       564
           1       0.98      0.99      0.99       465

    accuracy                           0.99      1029
   macro avg       0.99      0.99      0.99      1029
weighted avg       0.99      0.99      0.99      1029

Confusion Matrix of Test Set:
[[118   2]
 [  0  85]]
Classification Report of Test Set:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       120
           1       0.98      1.00      0.99        85

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205

Confusion Matrix of Validation Set:
[[78  0]
 [ 0 60]]
Classification Report of Validation Set:
              precision    recall  f1-score   support

           0     