In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest, chi2, RFE

import pickle
import time 

In [2]:
#1.Loading Original Dataset : 
dataset=pd.read_csv("Pre-processed_CKD_Data.csv",index_col=None)
print(dataset.shape)
dataset.head()

(399, 25)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [3]:
#2.Duplicating the Original Dataset
dataset2 = dataset

#3.Classifying the Nominal Columns in Dataset : 
dataset2 = pd.get_dummies(dataset2, drop_first=True)
print(dataset2.shape)
dataset2.head()

(399, 28)


Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,1,0,0,0,0,0,1,0,0,1
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,1,0,0,0,0,0,1,0,0,1
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,1,0,0,0,0,0,1,0,1,1
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,1,0,0,0,0,0,1,0,0,1


In [4]:
#4.Assigning Variables (Independent/Dependent) : 

indep_X = dataset2.drop('classification_yes', 1)
print(indep_X.shape)

dep_Y = dataset2['classification_yes']
print(dep_Y.shape)

(399, 27)
(399,)


In [5]:
# Creating Function(s) :

def train_test_split_and_StandardScaler(indep_X,dep_Y):
    
    X_train, X_test, Y_train, Y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
    
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, Y_train, Y_test

def RFE_Features_Classification(indep_X, dep_Y, n):
   
    RFE_List = []

    #logistic_Regression = LogisticRegression(solver='lbfgs')
    logistic_Regression = LogisticRegression(max_iter=1000, solver='lbfgs')
    svc_Linear = SVC(kernel = 'linear', random_state = 0)
    svc_NonLinear = SVC(kernel = 'rbf', random_state = 0)
    gaussianNB = GaussianNB()
    kNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    decisionTree = DecisionTreeClassifier(criterion = 'gini',max_features = 'sqrt',splitter = 'best', random_state = 0)
    randomForest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
        
    RFE_Model_List = [logistic_Regression, svc_Linear, svc_NonLinear, gaussianNB, kNN, decisionTree, randomForest]

    for model in RFE_Model_List:
        print(model)
        #logistic_RFE = RFE(i, n)
        logistic_RFE = RFE(estimator = model, n_features_to_select=n)
        logistic_RFE_Fit = logistic_RFE.fit(indep_X, dep_Y)
        logistic_RFE_Feature = logistic_RFE.transform(indep_X)
        RFE_List.append(logistic_RFE_Feature)
    return RFE_List

RFE_List = RFE_Features_Classification(indep_X, dep_Y, 6)

def Confusion_Matrix(classifier, X_test, Y_test):
    y_pred = classifier.predict(X_test)

    from sklearn.metrics import confusion_matrix
    ConfusionMatrix = confusion_matrix(Y_test, y_pred)

    from sklearn.metrics import classification_report 
    ClassificationReport = classification_report(Y_test, y_pred)

    from sklearn.metrics import accuracy_score
    AccuracyScore=accuracy_score(Y_test, y_pred)         

    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore 

def Logistic_Regression(X_train,Y_train,X_test):       
    # Fitting K-NN to the Training set
    from sklearn.linear_model import LogisticRegression
    #classifier = LogisticRegression(random_state = 0) 
    classifier = LogisticRegression(solver='lbfgs', max_iter= 5000)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore   

def SVM_Linear(X_train,Y_train,X_test):

    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore

def SVM_Non_Linear(X_train,Y_train,X_test):

    from sklearn.svm import SVC
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore

def Naive_Bayes(X_train,Y_train,X_test):       

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore

def KNN(X_train,Y_train,X_test):

    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore

def DecisionTree(X_train,Y_train,X_test):

    # Fitting K-NN to the Training set
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore

def RandomForest(X_train,Y_train,X_test):

    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, Y_train)

    # Calling a Created Function - Confusion_Matrix(classifier,X_test) which returns - classifier, X_test, Y_test, ConfusionMatrix, report, accuracy_score
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Confusion_Matrix(classifier,X_test)
    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore


def RFE_Classification(accuracy_LogisticRegression, accuracy_SVM_Linear, accuracy_SVM_NonLinear, 
                           accuracy_KNN, accuracy_NaiveBayes, accuracy_DecisionTree, accuracy_RandomForest): 

    dataframe=pd.DataFrame(index=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'],
                           columns=['Logistic Regression','SVM Linear','SVM Non Linear','KNN','Naive Bayes',
                                    'Decision Tree','Random Forest'])
    
    #Function - enumerate() acts as a Counter which Iterates index starting from 0 (by default) and their item(s) from the iterable
    #Use enumerate() when We need both Position in the loop (number) and its value from the iterable (idex)
    
    for indexCount,indexValue in enumerate(dataframe.index):      
        dataframe['Logistic Regression'][indexValue]=accuracy_LogisticRegression[indexCount]       
        dataframe['SVM Linear'][indexValue]=accuracy_SVM_Linear[indexCount]
        dataframe['SVM Non Linear'][indexValue]=accuracy_SVM_NonLinear[indexCount]
        dataframe['KNN'][indexValue]=accuracy_KNN[indexCount]
        dataframe['Naive Bayes'][indexValue]=accuracy_NaiveBayes[indexCount]
        dataframe['Decision Tree'][indexValue]=accuracy_DecisionTree[indexCount]
        dataframe['Random Forest'][indexValue]=accuracy_RandomForest[indexCount]
    return dataframe

LogisticRegression(max_iter=1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


SVC(kernel='linear', random_state=0)
SVC(random_state=0)


ValueError: when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [None]:
#Creating Empty Lists
accuracy_LogisticRegression = []
accuracy_SVM_Linear = []
accuracy_SVM_NonLinear = []
accuracy_KNN = []
accuracy_NaiveBayes = []
accuracy_DecisionTree = []
accuracy_RandomForest = []

for i in RFE_List:
    
    #6.Calling a Created Function - train_test_split_and_StandardScaler: which returns - X_train, X_test, Y_train, Y_test
    #Hence, Passing (i, dep_Y) along with Selected Number of Features instead of K_Best (k_Best, dep_Y)
    X_train, X_test, Y_train, Y_test = train_test_split_and_StandardScaler(i, dep_Y)   

    #Creating Various Models as follows :

    #7.Calling a Created Function - LogisticRegression(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = LogisticRegression(X_train,Y_train,X_test)
    accuracy_LogisticRegression.append(accuracy_score)

    #8.Calling a Created Function - SVM_Linear(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = SVM_Linear(X_train,Y_train,X_test)  
    accuracy_SVM_Linear.append(accuracy_score)

    #9.Calling a Created Function - SVM_Non_Linear(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = SVM_Non_Linear(X_train,Y_train,X_test)  
    accuracy_SVM_NonLinear.append(accuracy_score)

    #10.Calling a Created Function - KNN(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = KNN(X_train,Y_train,X_test)  
    accuracy_KNN.append(accuracy_score)

    #11.Calling a Created Function - Naive_Bayes(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = Naive_Bayes(X_train,Y_train,X_test)  
    accuracy_NaiveBayes.append(accuracy_score)

    #12.Calling a Created Function - DecisionTree(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = DecisionTree(X_train,Y_train,X_test)  
    accuracy_DecisionTree.append(accuracy_score)

    #13.Calling a Created Function - RandomForest(X_train,Y_train,X_test): which returns - classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore
    classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore = RandomForest(X_train,Y_train,X_test)  
    accuracy_RandomForest.append(accuracy_score)

#14.Calling a Created Function - RFE_Classification(With Below Parameters): which returns - dataframe    
result=RFE_Classification(accuracy_LogisticRegression, accuracy_SVM_Linear, accuracy_SVM_NonLinear, 
                               accuracy_KNN, accuracy_NaiveBayes, accuracy_DecisionTree, accuracy_RandomForest)

result

In [None]:
# Calling the Final Results by Selecting Top 6 Features
#result

In [None]:
# Calling the Final Results by Selecting Top 5 Features
#result

In [None]:
# Calling the Final Results by Selecting Top 4 Features
#result

In [None]:
# Calling the Final Results by Selecting Top 3 Features
#result

In [None]:
# Calling the Final Results by Selecting Top 2 Features
#result

In [None]:
# Calling the Final Results by Selecting Top 1 Feature
#result