In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
#Recursive feature elimination (RFE) with four different base estimators
def rfefeature(indep_x,dep_y,n):
    """Run RFE once per base estimator and return the reduced feature sets.

    Parameters
    ----------
    indep_x : feature matrix handed to ``RFE.fit`` / ``RFE.transform``.
    dep_y : target values handed to ``RFE.fit``.
    n : forwarded to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    list
        One transformed feature matrix per estimator, in this order:
        logistic regression, linear SVC, random forest, decision tree.

    Each estimator is printed before its RFE run.
    NOTE(review): selection is fitted on all of indep_x/dep_y, i.e. before
    any train/test split performed by the caller -- confirm this is intended.
    """
    estimators = (
        LogisticRegression(solver='liblinear', max_iter=5000, random_state=0),
        SVC(kernel='linear', random_state=0),
        RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
        DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0),
    )
    selected_sets = []
    for estimator in estimators:
        print(estimator)
        selector = RFE(estimator=estimator, n_features_to_select=n)
        selected_sets.append(selector.fit(indep_x, dep_y).transform(indep_x))
    return selected_sets

In [3]:
#Train/test split followed by standardisation
def split_scaler(indep_x,dep_y):
    """Split into train/test sets and standardise the features.

    The split holds out 25% of the rows with ``random_state=0``. The
    ``StandardScaler`` is fitted on the training features only and then
    applied to both the training and the test features.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` with both feature sets scaled
        and the targets exactly as returned by ``train_test_split``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        indep_x, dep_y, test_size=0.25, random_state=0
    )
    scaler = StandardScaler().fit(x_train)
    return scaler.transform(x_train), scaler.transform(x_test), y_train, y_test

In [4]:
#confusion matrix prediction
def cm_prediction(classifier,x_test,y_test=None):
    """Predict on the test features and score the predictions.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    x_test : test features passed to ``classifier.predict``.
    y_test : true labels for ``x_test``. Optional for backward compatibility:
        when omitted, the notebook-level variable named ``y_test`` is used,
        which is what two-argument callers have always relied on. Prefer
        passing it explicitly so the score cannot silently be computed
        against labels left over from another split.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, x_test, y_test, cm)`` where
        ``Accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_test is None:
        # The parameter shadows the global name, so the notebook-level value
        # has to be fetched through globals() explicitly.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError(
                "cm_prediction needs y_test: pass it as the third argument "
                "or define a notebook-level y_test first"
            ) from None

    y_pred = classifier.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, x_test, y_test, cm

In [5]:
#Logistic Regression
def logistic(x_train,y_train,x_test):
    """Fit a LogisticRegression (random_state=0) and score it on x_test.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.
    """
    model = LogisticRegression(random_state=0).fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [6]:
#SVM with a linear kernel
def svml(x_train,y_train,x_test):
    """Fit an SVC with kernel='linear' (random_state=0) and score it on x_test.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.
    """
    model = SVC(kernel='linear', random_state=0).fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [7]:
#SVM with a non-linear (RBF) kernel
def svmnl(x_train,y_train,x_test):
    """Fit an SVC with kernel='rbf' (random_state=0) and score it on x_test.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.
    """
    model = SVC(kernel='rbf', random_state=0).fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [8]:
#Naive Bayes (the function name keeps its original spelling for existing callers)
def navie(x_train,y_train,x_test):
    """Fit a GaussianNB with default settings and score it on x_test.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.
    """
    # GaussianNB is not among the notebook's top-level imports, so it is
    # imported here.
    from sklearn.naive_bayes import GaussianNB

    model = GaussianNB().fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [9]:
#K-nearest neighbours
def knn(x_train,y_train,x_test):
    """Fit a 5-neighbour KNeighborsClassifier and score it on x_test.

    The classifier uses metric='minkowski' with p=2.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.
    """
    # KNeighborsClassifier is not among the notebook's top-level imports, so
    # it is imported here.
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [10]:
#Decision tree
def decision(x_train,y_train,x_test):
    """Fit an entropy-criterion DecisionTreeClassifier and score it on x_test.

    The tree is built with random_state=0.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.
    """
    model = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [11]:
#Random forest
def random(x_train,y_train,x_test):
    """Fit a 10-tree, entropy-criterion RandomForestClassifier and score it.

    The forest is built with random_state=0 and evaluated on x_test.

    Returns the 6-tuple ``(classifier, Accuracy, report, x_test, y_test, cm)``
    built from cm_prediction's result, with the fitted model first. The true
    test labels are not an argument here; y_test is whatever cm_prediction
    hands back.

    NOTE(review): this function's name would shadow the standard-library
    ``random`` module if that were imported into the same namespace.
    """
    # RandomForestClassifier comes from the notebook's top-level imports.
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model = model.fit(x_train, y_train)
    _, accuracy, report, x_test, y_test, cm = cm_prediction(model, x_test)
    return model, accuracy, report, x_test, y_test, cm

In [21]:
#RFE Classification
def rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf):
    """Tabulate accuracies: one row per RFE selector, one column per classifier.

    Parameters
    ----------
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf :
        Accuracy sequences for logistic regression, linear SVM, non-linear
        SVM, KNN, naive Bayes, decision tree and random forest. Each must
        hold exactly four values, ordered like the rows of the result:
        'Logistic Regression', 'SVC', 'Random Forest', 'Decision Tree'.

    Returns
    -------
    pandas.DataFrame
        4 x 7 table built in one step, so numeric scores get a numeric dtype
        rather than the object dtype produced by filling an empty frame cell
        by cell.

    Raises
    ------
    ValueError
        If any sequence does not have exactly four entries. Extra entries
        usually mean the lists were appended to by more than one run, in
        which case the leading values are stale; they are no longer ignored
        silently.
    """
    selectors = ['Logistic Regression', 'SVC', 'Random Forest', 'Decision Tree']
    # Column labels are kept byte-for-byte (including 'Navie Bayes') so that
    # existing lookups on the returned frame keep working.
    labelled_scores = {
        'Logistic Regression': acclog,
        'SVM Linear': accsvml,
        'SVM NonLinear': accsvmnl,
        'KNN': accknn,
        'Navie Bayes': accnav,
        'Decision Tree': accdes,
        'Random Forest': accrf,
    }

    columns = {}
    for label, scores in labelled_scores.items():
        # Copy into a plain list: a pandas Series would otherwise be aligned
        # on its own index instead of being placed positionally.
        values = list(scores)
        if len(values) != len(selectors):
            raise ValueError(
                f"{label!r}: expected {len(selectors)} accuracies "
                f"(one per RFE selector), got {len(values)}"
            )
        columns[label] = values

    return pd.DataFrame(columns, index=selectors)

In [13]:
# Read prep.csv with the default integer row index.
# df2 is a second name for the same frame object as dataset, not a copy.
dataset = pd.read_csv("prep.csv")
df2 = dataset
df2

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [14]:
# One-hot encode the non-numeric columns: the first level of each is dropped
# (drop_first=True) and the indicator columns are stored as ints.
# Only df2 is rebound to the encoded frame; `dataset` still names the frame
# as it was read from the CSV.
df2=pd.get_dummies(df2,drop_first=True, dtype=int)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [15]:
# Target: the encoded classification indicator. Features: every other column.
dep_y = df2['classification_yes']
indep_x = df2.drop(columns=['classification_yes'])


In [32]:
# Run RFE with each base estimator, asking for 5 features per run.
rfelist = rfefeature(indep_x, dep_y, 5)

# One empty accuracy list per classifier; each is a distinct list object.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = ([] for _ in range(7))



LogisticRegression(max_iter=5000, random_state=0, solver='liblinear')
SVC(kernel='linear', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)


In [33]:
# Start from empty score lists on every run of this cell. Appending to lists
# left over from a previous execution would leave stale or surplus scores in
# the lists that feed the results table.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = ([] for _ in range(7))

# Each model helper is paired with the list that collects its accuracies.
model_runs = [
    (logistic, acclog),
    (svml, accsvml),
    (svmnl, accsvmnl),
    (knn, accknn),
    (navie, accnav),
    (decision, accdes),
    (random, accrf),
]

for selected_x in rfelist:
    # y_test must remain a notebook-level name: the model helpers do not take
    # the labels as an argument, and cm_prediction looks y_test up globally.
    x_train, x_test, y_train, y_test = split_scaler(selected_x, dep_y)
    for run_model, scores in model_runs:
        classifier, Accuracy, report, x_test, y_test, cm = run_model(x_train, y_train, x_test)
        scores.append(Accuracy)

In [34]:
# Collect the seven accuracy lists into one table
# (rows: RFE selector estimator, columns: classifier that was evaluated).
result=rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)

In [23]:
#k=3 -- presumably k is the feature count passed as n to rfefeature; TODO confirm
# NOTE(review): this cell only displays the current `result`. Its saved output
# comes from an earlier execution; the rfefeature call in this notebook passes
# 5, so a fresh Restart & Run All will not reproduce a k=3 table here.
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM NonLinear,KNN,Navie Bayes,Decision Tree,Random Forest
Logistic Regression,0.94,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.87,0.87,0.87,0.87,0.87,0.87,0.87
Random Forest,0.91,0.92,0.93,0.93,0.86,0.91,0.94
Decision Tree,0.93,0.93,0.94,0.95,0.74,0.95,0.97


In [27]:
#k=4 -- presumably k is the feature count passed as n to rfefeature; TODO confirm
# NOTE(review): this cell only displays the current `result`. Its saved output
# comes from an earlier execution; the rfefeature call in this notebook passes
# 5, so a fresh Restart & Run All will not reproduce a k=4 table here.
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM NonLinear,KNN,Navie Bayes,Decision Tree,Random Forest
Logistic Regression,0.95,0.95,0.95,0.95,0.95,0.95,0.95
SVC,0.96,0.96,0.96,0.96,0.96,0.96,0.96
Random Forest,0.93,0.93,0.94,0.93,0.91,0.91,0.94
Decision Tree,0.97,0.97,0.97,0.96,0.84,0.96,0.96


In [31]:
#k=5 -- presumably k is the feature count passed as n to rfefeature; TODO confirm
# NOTE(review): this cell only displays the current `result`; its saved output
# comes from an execution earlier than the selector/evaluation cells above.
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM NonLinear,KNN,Navie Bayes,Decision Tree,Random Forest
Logistic Regression,0.98,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.99,0.99,0.99,0.99,0.99,0.99,0.99
Random Forest,0.97,0.97,0.97,0.96,0.87,0.93,0.97
Decision Tree,0.97,0.98,0.98,0.98,0.91,0.96,0.98


In [35]:
#k=6
# NOTE(review): the label looks stale. The rfefeature call in this notebook
# passes 5, and the saved table below is identical to the one shown for k=5.
# Re-run feature selection with n=6 before reading this as a k=6 result.
result

Unnamed: 0,Logistic Regression,SVM Linear,SVM NonLinear,KNN,Navie Bayes,Decision Tree,Random Forest
Logistic Regression,0.98,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.99,0.99,0.99,0.99,0.99,0.99,0.99
Random Forest,0.97,0.97,0.97,0.96,0.87,0.93,0.97
Decision Tree,0.97,0.98,0.98,0.98,0.91,0.96,0.98
