In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from Univariate import Preprocess
from sklearn.preprocessing import LabelEncoder

In [2]:
def RFE_Alogrithm(X,y,n):
    RFE_list=[]
    log_model=LogisticRegression(solver='lbfgs',random_state=0)
    svm_model=SVC(kernel='linear',random_state=0)
    dt_model=DecisionTreeClassifier(criterion='gini',splitter='best')
    rf_model=RandomForestClassifier(n_estimators=10,criterion='gini')
    models_list=[log_model,svm_model,dt_model,rf_model]
    for i in models_list:
        print(i)
        RFE_alg=RFE(i,n_features_to_select=n)
        fit1=RFE_alg.fit(X,y)
        trans=fit1.transform(X)
        RFE_list.append(trans)
    return RFE_list

In [3]:
def traintestsplit(X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,y_train,y_test

In [4]:
def Classification_Test_Report(classifier,X_test,y_test):
    y_pred=classifier.predict(X_test)
    CM=confusion_matrix(y_test,y_pred)
    CR=classification_report(y_test,y_pred)
    Accuracy=accuracy_score(y_test,y_pred)
    return CM,CR,Accuracy

In [5]:
def LR(X_train,X_test,y_train,y_test):
    classifier=LogisticRegression(random_state=0)
    classifier.fit(X_train,y_train)
    CM,CR,Accuracy=Classification_Test_Report(classifier,X_test,y_test)
    return CM,CR,Accuracy
    

In [6]:
def KNN(X_train,X_test,y_train,y_test):
    classifier=KNeighborsClassifier()
    classifier.fit(X_train,y_train)
    CM,CR,Accuracy=Classification_Test_Report(classifier,X_test,y_test)
    return CM,CR,Accuracy

In [7]:
def SVM(X_train,X_test,y_train,y_test):
    classifier=SVC()
    classifier.fit(X_train,y_train)
    CM,CR,Accuracy=Classification_Test_Report(classifier,X_test,y_test)
    return CM,CR,Accuracy

In [8]:
def DT(X_train,X_test,y_train,y_test):
    classifier=DecisionTreeClassifier()
    classifier.fit(X_train,y_train)
    CM,CR,Accuracy=Classification_Test_Report(classifier,X_test,y_test)
    return CM,CR,Accuracy

In [9]:
def RF(X_train,X_test,y_train,y_test):
    classifier=RandomForestClassifier()
    classifier.fit(X_train,y_train)
    CM,CR,Accuracy=Classification_Test_Report(classifier,X_test,y_test)
    return CM,CR,Accuracy

In [10]:
df=pd.read_csv("pre-kidney_disease.csv")

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,0,0.0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,...,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,1,1.0,7.0,50.0,1.02,4.0,0.0,99.0,18.0,0.8,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,2,2.0,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,...,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,3,3.0,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,...,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,4,4.0,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd


In [12]:
quan,qual=Preprocess.QuanQual(df)

In [13]:
quan

['Unnamed: 0',
 'id',
 'age',
 'bp',
 'sg',
 'al',
 'su',
 'bgr',
 'bu',
 'sc',
 'sod',
 'pot',
 'hemo',
 'pcv',
 'wc',
 'rc']

In [14]:
df=df.drop('Unnamed: 0',axis=1)

In [15]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,0.0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,135.0,...,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,1.0,7.0,50.0,1.02,4.0,0.0,99.0,18.0,0.8,135.0,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,2.0,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,135.0,...,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,3.0,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,4.0,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,135.0,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd


In [16]:
l=LabelEncoder()
for i in qual:
    df[i]=l.fit_transform(df[i])

In [17]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
0,0.0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,135.0,...,1,0,0,1,4,1,0,0,0,0
1,1.0,7.0,50.0,1.02,4.0,0.0,99.0,18.0,0.8,135.0,...,1,0,0,0,3,1,0,0,0,0
2,2.0,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,135.0,...,1,0,0,0,4,1,1,0,1,0
3,3.0,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,0,1,0,1,3,1,1,1,1,0
4,4.0,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,135.0,...,1,0,0,0,3,1,0,0,0,0


In [18]:
X=df.drop("classification",axis=1)
y=df["classification"]

In [19]:
RFE_use=RFE_Alogrithm(X,y,5)
RFE_use

LogisticRegression(random_state=0)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

SVC(kernel='linear', random_state=0)
DecisionTreeClassifier()
RandomForestClassifier(n_estimators=10)


[array([[1. , 1.2, 1. , 0. , 0. ],
        [4. , 0.8, 0. , 0. , 0. ],
        [2. , 1.8, 0. , 1. , 0. ],
        ...,
        [0. , 0.6, 0. , 0. , 0. ],
        [0. , 1. , 0. , 0. , 0. ],
        [0. , 1.1, 0. , 0. , 0. ]]),
 array([[  0. ,  15.4,   1. ,   0. ,   0. ],
        [  1. ,  11.3,   0. ,   0. ,   0. ],
        [  2. ,   9.6,   0. ,   1. ,   1. ],
        ...,
        [397. ,  15.8,   0. ,   0. ,   0. ],
        [398. ,  14.2,   0. ,   0. ,   0. ],
        [399. ,  15.8,   0. ,   0. ,   0. ]]),
 array([[  0. ,  36. ,   5.2,   4. ,   1. ],
        [  1. ,  18. ,   5.2,   3. ,   1. ],
        [  2. ,  53. ,   5.2,   4. ,   1. ],
        ...,
        [397. ,  26. ,   5.4,   3. ,   1. ],
        [398. ,  50. ,   5.9,   3. ,   1. ],
        [399. ,  18. ,   6.1,   3. ,   1. ]]),
 array([[  0.   ,   1.02 ,   1.2  ,  15.4  ,  44.   ],
        [  1.   ,   1.02 ,   0.8  ,  11.3  ,  38.   ],
        [  2.   ,   1.01 ,   1.8  ,   9.6  ,  31.   ],
        ...,
        [397.   ,   1.02 , 

In [20]:
log=[]
knn=[]
svm=[]
dt=[]
rf=[]

In [21]:
for i in RFE_use:
    X_train,X_test,y_train,y_test=traintestsplit(i,y)
    
    CM,CR,Accuracy=LR(X_train,X_test,y_train,y_test)
    log.append(Accuracy)
    
    CM,CR,Accuracy=KNN(X_train,X_test,y_train,y_test)
    knn.append(Accuracy)
    
    CM,CR,Accuracy=SVM(X_train,X_test,y_train,y_test)
    svm.append(Accuracy)
    
    CM,CR,Accuracy=DT(X_train,X_test,y_train,y_test)
    dt.append(Accuracy)
    
    CM,CR,Accuracy=RF(X_train,X_test,y_train,y_test)
    rf.append(Accuracy)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
data=pd.DataFrame(columns=["Logistic_Reg","K_Neighbor","Suport_V_M","DecisionTree","RandomForest"],
                 index=["LR","SVM","DT","RF"])
for i,j, in enumerate(data.index):
    data["Logistic_Reg"][j]=log[i]
    data["K_Neighbor"][j]=knn[i]
    data["Suport_V_M"][j]=svm[i]
    data["DecisionTree"][j]=dt[i]
    data["RandomForest"][j]=rf[i]

In [26]:
data

Unnamed: 0,Logistic_Reg,K_Neighbor,Suport_V_M,DecisionTree,RandomForest
LR,0.95,0.958333,0.95,0.958333,0.958333
SVM,0.991667,0.983333,0.991667,0.975,0.991667
DT,0.975,0.983333,0.975,0.975,0.991667
RF,0.983333,0.983333,0.983333,0.975,0.991667
