In [1]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import pickle
import matplotlib.pyplot as plt

In [3]:
def split_scaler(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    return x_train, x_test, y_train, y_test 

In [4]:
def pca(x_train, x_test,n):
    pca = PCA(n_components = n)
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.transform(x_test)
    explained_variance = pca.explained_variance_ratio_
    return x_train_pca, x_test_pca, explained_variance

In [5]:
def cm_prediction(classifier, x_test_pca):
    from sklearn.linear_model import LogisticRegression
   
    y_pred = classifier.predict(x_test_pca)
    # Making Confusion Matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, y_pred)
    
    from sklearn.metrics import classification_report
    report = classification_report(y_test, y_pred)
    return classifier, accuracy, report, x_test_pca, y_test, cm

In [6]:
def logistic(x_train_pca,y_train,x_test_pca):       
    # Fitting K-NN to the Training set
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train_pca, y_train)
    classifier,accuracy,report,x_test_pca,y_test,cm = cm_prediction(classifier,x_test_pca)
    return  classifier,accuracy,report,x_test_pca,y_test,cm   

In [7]:
def svm_linear(x_train_pca, y_train, x_test_pca):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(x_train_pca, y_train)
    classifier, accuracy, report, x_test_pca, y_test, cm = cm_prediction(classifier, x_test_pca) 
    return  classifier,accuracy,report,x_test_pca,y_test,cm    

In [8]:
def svm_nl(x_train_pca, y_train, x_test_pca):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(x_train_pca, y_train)
    classifier, accuracy, report, x_test_pca, y_test, cm = cm_prediction(classifier, x_test_pca) 
    return  classifier,accuracy,report,x_test_pca,y_test,cm    

In [9]:
def knn(x_train_pca, y_train, x_test_pca):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(x_train_pca, y_train)
    classifier, accuracy, report, x_test_pca, y_test, cm = cm_prediction(classifier, x_test_pca)
    return  classifier,accuracy,report,x_test_pca,y_test,cm   

In [10]:
def naive(x_train_pca, y_train, x_test_pca):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(x_train_pca, y_train)
    classifier, accuracy, report, x_test_pca, y_test, cm = cm_prediction(classifier, x_test_pca)
    return  classifier,accuracy,report,x_test_pca,y_test,cm   

In [11]:
def decision(x_train_pca, y_train, x_test_pca):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(x_train_pca, y_train)
    classifier, accuracy, report, x_test_pca, y_test, cm = cm_prediction(classifier, x_test_pca) 
    return  classifier,accuracy,report,x_test_pca,y_test,cm   

In [12]:
def random(x_train_pca, y_train, x_test_pca):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(x_train_pca, y_train)
    classifier, accuracy, report, x_test_pca, y_test, cm = cm_prediction(classifier, x_test_pca) 
    return  classifier,accuracy,report,x_test_pca,y_test,cm   

In [13]:
def pca_classification(acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand):
    dataframe = pd.DataFrame(index = ['PCA'], columns = ['Logistic', 'SVML', 'SVMNL', 'KNN', 'Naive', 'Decision', 'Random'])
    for number, item in enumerate(dataframe.index):       # Enumerate - it is a counter that adds index to each items in an iterable
        dataframe['Logistic'][item] = acclog[number]
        dataframe['SVML'][item] = accsvml[number]
        dataframe['SVMNL'][item] = accsvmnl[number]
        dataframe['KNN'][item] = accknn[number]
        dataframe['Naive'][item] = accnaive[number]
        dataframe['Decision'][item] = accdeci[number]
        dataframe['Random'][item] = accrand[number]
    return dataframe        

In [14]:
dataset = pd.read_csv('prep.csv', index_col = None)  # index_col - keep the default index not allowing the input to reset the index position
dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [15]:
df1 = dataset
df1 = pd.get_dummies(df1, dtype = int, drop_first = True)

In [16]:
df1

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [17]:
x = df1.iloc[:,0:28].values
y = df1.iloc[:,27].values

In [18]:
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnaive = []
accdeci = []
accrand = []

In [19]:
x_train, x_test, y_train, y_test = split_scaler(x, y)

In [20]:
x_train_pca, x_test_pca, explained_variance = pca(x_train, x_test, 1)

classifier, accuracy, report, x_test_pca, y_test,cm = logistic(x_train_pca,y_train,x_test_pca)
acclog.append(accuracy)

classifier, accuracy, report, x_test_pca, y_test, cm = svm_linear(x_train_pca, y_train, x_test_pca)
accsvml.append(accuracy)

classifier, accuracy, report, x_test_pca, y_test, cm = svm_nl(x_train_pca, y_train, x_test_pca)
accsvmnl.append(accuracy)

classifier, accuracy, report, x_test_pca, y_test, cm = knn(x_train_pca, y_train, x_test_pca)
accknn.append(accuracy)

classifier, accuracy, report, x_test_pca, y_test, cm = naive(x_train_pca, y_train, x_test_pca)
accnaive.append(accuracy)

classifier, accuracy, report, x_test_pca, y_test, cm = decision(x_train_pca, y_train, x_test_pca)
accdeci.append(accuracy)

classifier, accuracy, report, x_test_pca, y_test, cm = random(x_train_pca, y_train, x_test_pca)
accrand.append(accuracy)

In [21]:
result = pca_classification(acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand)

In [22]:

result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random
PCA,1.0,1.0,1.0,1.0,1.0,1.0,1.0
