In [60]:
import pandas as pd
import numpy as np
import time 
import warnings
warnings.filterwarnings('ignore')

In [61]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
import matplotlib.pyplot as plt

In [62]:
def rfefeature(indep_X, dep_Y, n):
    rfelist = []
    log_model = LogisticRegression(solver = 'lbfgs')
    RF = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    SVC_model = SVC(kernel = 'linear', random_state = 0)
    #NB = GaussianNB()
    #knn = KNeighborsClassifier()
    DT = DecisionTreeClassifier(criterion = 'gini', max_features = 'sqrt', splitter = 'best', random_state = 0)
    rfemodellist = [log_model, SVC_model, DT, RF]
    for i in rfemodellist:
        print(i)
        log_rfe = RFE(i,n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_transform = log_fit.transform(indep_X)
        rfelist.append(log_rfe_transform)
    return rfelist

In [63]:
def split_scaler(indep_X, dep_Y):
    X_train, X_test, Y_train, Y_test = train_test_split(indep_X, dep_Y, test_size = 0.3, random_state = 0) 
    # Feature Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, Y_train, Y_test

In [64]:
def cm_prediction(classifier, X_test):
    Y_pred = classifier.predict(X_test)
    # making Confusion Matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(Y_test, Y_pred)
    from sklearn.metrics import accuracy_score
    Accuracy = accuracy_score(Y_test, Y_pred)
    from sklearn.metrics import classification_report
    report = classification_report(Y_test, Y_pred)
    return cm, Accuracy, report, X_test, Y_test, classifier    

In [65]:
def logistic(X_train, Y_train, X_test):
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm 

In [66]:
def svm_linear(X_train, Y_train, X_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test) 
    return  classifier,Accuracy,report,X_test,Y_test,cm    

In [67]:
def svm_nl(X_train, Y_train, X_test):
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test) 
    return  classifier,Accuracy,report,X_test,Y_test,cm    

In [68]:
def knn(X_train, Y_train, X_test):
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test)
    return  classifier,Accuracy,report,X_test,Y_test,cm 

In [69]:
def naive(X_train, Y_train, X_test):
    classifier = GaussianNB()
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test) 
    return  classifier,Accuracy,report,X_test,Y_test,cm    

In [70]:
def decision(X_train, Y_train, X_test):
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test) 
    return  classifier,Accuracy,report,X_test,Y_test,cm    

In [71]:
def random(X_train, Y_train, X_test):
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test) 
    return  classifier,Accuracy,report,X_test,Y_test,cm  

In [72]:
def rfe_classification(acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand):
    rfedataframe = pd.DataFrame(index = ['Logistic', 'SVM', 'Decision','Random'], 
                             columns = ['Logistic', 'SVML', 'SVMNL', 'KNN', 'Naive', 'Decision', 'Random'])
    for number, item in enumerate(rfedataframe.index):
        rfedataframe['Logistic'][item] = acclog[number]
        rfedataframe['SVML'][item] = accsvml[number]
        rfedataframe['SVMNL'][item] = accsvmnl[number]
        rfedataframe['KNN'][item] = accknn[number]
        rfedataframe['Naive'][item] = accnaive[number]
        rfedataframe['Decision'][item] = accdeci[number]
        rfedataframe['Random'][item] = accrand[number]
    return rfedataframe        

In [73]:
dataset = pd.read_csv('prep.csv',index_col = None)
df1 = dataset

In [74]:
df1 = pd.get_dummies(df1, dtype = int, drop_first = True)
df1

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [75]:
indep_X = df1.drop('classification_yes', axis = 1)
dep_Y = df1['classification_yes']

In [88]:
rfelist = rfefeature(indep_X, dep_Y,7)
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnaive = []
accdeci = []
accrand = []

LogisticRegression()
SVC(kernel='linear', random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)


In [89]:
for i in rfelist:
    X_train, X_test, Y_train, Y_test = split_scaler(i, dep_Y)
        
    classifier,Accuracy,report,X_test,Y_test,cm=logistic(X_train,Y_train,X_test)
    acclog.append(Accuracy)

    classifier, Accuracy, report, X_test, Y_test, cm = svm_linear(X_train, Y_train, X_test)
    accsvml.append(Accuracy)

    classifier, Accuracy, report, X_test, Y_test, cm = svm_nl(X_train, Y_train, X_test)
    accsvmnl.append(Accuracy)

    classifier, Accuracy, report, X_test, Y_test, cm = naive(X_train, Y_train, X_test)
    accnaive.append(Accuracy)

    classifier, Accuracy, report, X_test, Y_test, cm = knn(X_train, Y_train, X_test)
    accknn.append(Accuracy)

    classifier, Accuracy, report, X_test, Y_test, cm = decision(X_train, Y_train, X_test)
    accdeci.append(Accuracy)

    classifier, Accuracy, report, X_test, Y_test, cm = random(X_train, Y_train, X_test)
    accrand.append(Accuracy)

result = rfe_classification(acclog,accsvml,accsvmnl,accnaive,accknn,accdeci,accrand)

In [78]:
# For K=3
#result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random
Logistic,0.941667,0.941667,0.941667,0.941667,0.941667,0.941667,0.941667
SVM,0.875,0.875,0.875,0.875,0.875,0.875,0.875
Decision,0.975,0.975,0.975,0.8,0.975,0.966667,0.966667
Random,0.941667,0.941667,0.941667,0.9,0.941667,0.908333,0.933333


In [81]:
# For K=4
#result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random
Logistic,0.95,0.95,0.95,0.95,0.95,0.95,0.95
SVM,0.958333,0.958333,0.958333,0.958333,0.958333,0.958333,0.958333
Decision,0.975,0.916667,0.916667,0.816667,0.975,0.975,0.975
Random,0.975,0.975,0.975,0.875,0.975,0.958333,0.966667


In [84]:
# For K=5
# result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random
Logistic,0.975,0.975,0.975,0.975,0.975,0.975,0.975
SVM,0.983333,0.983333,0.983333,0.983333,0.983333,0.983333,0.983333
Decision,0.95,0.983333,0.933333,0.858333,0.95,0.975,0.966667
Random,0.966667,0.966667,0.983333,0.916667,0.975,0.95,0.975


In [87]:
# For K=6
#result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random
Logistic,0.975,0.958333,0.975,0.975,0.975,0.975,0.966667
SVM,0.983333,0.983333,0.983333,0.958333,0.983333,0.983333,0.983333
Decision,0.966667,0.966667,0.975,0.858333,0.966667,0.975,0.966667
Random,0.975,0.983333,0.991667,0.925,0.966667,0.941667,0.975


In [90]:
# For K=7
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random
Logistic,0.975,0.958333,0.975,0.975,0.958333,0.975,0.975
SVM,0.983333,0.975,0.983333,0.958333,0.983333,0.983333,0.983333
Decision,0.958333,0.966667,0.966667,0.891667,0.933333,0.975,0.958333
Random,0.983333,0.983333,0.991667,0.925,0.983333,0.95,0.966667
