In [1]:
# ✅ Importing Libraries :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

import pickle
import time 

In [2]:
# ✅ 1. Load the pre-processed CKD table (read_csv's default index handling, same as index_col=None)
# NOTE(review): the preview shows an 'Unnamed: 0' column that looks like a saved row index — confirm.
CKD_CSV_PATH = "Pre-processed_CKD_Data.csv"
dataset = pd.read_csv(CKD_CSV_PATH)
print(dataset.shape)
dataset.head()

(399, 25)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [3]:
#✅ 2-3. One-hot encode the nominal columns into a new frame; `dataset` itself is not modified.
# drop_first=True drops the first indicator column of every encoded category
# (the label, for instance, ends up as the single column `classification_yes`).
dataset2 = pd.get_dummies(dataset, drop_first=True)
print(dataset2.shape)
dataset2.head()

(399, 28)


Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,True,False,False,False,False,False,True,False,False,True
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,True,False,False,False,False,False,True,False,False,True
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,True,False,False,False,False,False,True,False,True,True
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,True,False,False,False,False,False,True,False,False,True


In [4]:
#✅ 4.Assigning Variables (Independent/Dependent) :

# Features = every column except the target.
# 'Unnamed: 0' is removed as well: in the preview it simply counts rows (0, 1, 2, ...),
# i.e. it looks like the index written out with the CSV rather than a measurement, and a
# row counter must not be fed to the models. errors='ignore' keeps the cell working if
# the column is absent. Re-running changes the printed feature count and downstream scores.
indep_X = (
    dataset2
    .drop(columns='classification_yes')
    .drop(columns='Unnamed: 0', errors='ignore')
)
print(indep_X.shape)

# Target: the one-hot encoded class label.
dep_Y = dataset2['classification_yes']
print(dep_Y.shape)

(399, 27)
(399,)


In [5]:
#✅ 5.Creating Function(s) :

def train_test_split_and_StandardScaler(indep_X, dep_Y, test_size=0.25, random_state=0):
    """Split features/labels into train and test parts and standardise the features.

    Parameters
    ----------
    indep_X : feature table or array handed to ``train_test_split``.
    dep_Y : labels aligned with ``indep_X``.
    test_size : forwarded to ``train_test_split``; defaults to 0.25, the value
        that used to be hard-coded here.
    random_state : seed forwarded to ``train_test_split``; defaults to 0, the
        value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``. The two feature parts are the
        outputs of ``StandardScaler``; the label parts are returned as split.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        indep_X, dep_Y, test_size=test_size, random_state=random_state
    )

    scaler = StandardScaler()
    # The scaler is fitted on the training part only and then re-used for the
    # test part, so test rows never influence the scaling statistics.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, Y_train, Y_test

def PCA_Features_Classification(X_train, X_test, Number_Of_Components, return_details=False):
    """Fit PCA on ``X_train`` and project both splits onto its components.

    Parameters
    ----------
    X_train : training features; the PCA is fitted on these only.
    X_test : test features; projected with the PCA fitted on ``X_train``.
    Number_Of_Components : passed to ``PCA(n_components=...)``.
    return_details : when False (default) only the projected training matrix
        is returned, exactly as before. When True the projected test matrix
        and the explained-variance ratio are returned too; they were already
        being computed here but were previously thrown away.

    Returns
    -------
    The projected ``X_train`` when ``return_details`` is False, otherwise the
    tuple ``(X_train_pca, X_test_pca, explained_variance_ratio)``.
    """
    pca = PCA(n_components=Number_Of_Components)
    X_train_pca = pca.fit_transform(X_train)
    # Always projected (as before), so an incompatible X_test still fails here
    # even when the caller only wants the training projection.
    X_test_pca = pca.transform(X_test)

    if return_details:
        return X_train_pca, X_test_pca, pca.explained_variance_ratio_
    return X_train_pca

def Confusion_Matrix(classifier, X_test, Y_test=None):
    """Predict on ``X_test`` with a fitted classifier and compute its test metrics.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_test : feature matrix to predict on.
    Y_test : true labels for ``X_test``. Optional for backward compatibility:
        when omitted, the module-level ``Y_test`` is used, which is what this
        function always read before the parameter existed. Prefer passing it
        explicitly so the result does not depend on notebook state.

    Returns
    -------
    tuple
        ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport,
        AccuracyScore)``.

    Raises
    ------
    NameError
        If ``Y_test`` is omitted and no module-level ``Y_test`` exists.
    """
    # Deliberately imported inside the function: a later notebook cell rebinds the
    # module-level names ``confusion_matrix``, ``classification_report`` and
    # ``accuracy_score`` to result values, which would shadow top-level imports.
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if Y_test is None:
        # Legacy path: fall back to the notebook-global labels.
        try:
            Y_test = globals()['Y_test']
        except KeyError:
            raise NameError("name 'Y_test' is not defined") from None

    y_pred = classifier.predict(X_test)

    ConfusionMatrix = confusion_matrix(Y_test, y_pred)
    ClassificationReport = classification_report(Y_test, y_pred)
    AccuracyScore = accuracy_score(Y_test, y_pred)

    return classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore

def Logistic_Regression(X_train, Y_train, X_test):
    """Fit a logistic-regression classifier and score it on ``X_test``.

    The model uses the ``lbfgs`` solver with an iteration budget of 10000.

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(solver='lbfgs', max_iter=10000)
    model.fit(X_train, Y_train)

    # The true labels are not passed along: Confusion_Matrix picks up Y_test from module scope.
    return Confusion_Matrix(model, X_test)

def SVM_Linear(X_train, Y_train, X_test):
    """Fit a linear-kernel support vector classifier (seed 0) and score it on ``X_test``.

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.svm import SVC

    linear_svc = SVC(kernel='linear', random_state=0)
    linear_svc.fit(X_train, Y_train)

    # Evaluation (prediction + metrics) is delegated; Y_test comes from module scope there.
    return Confusion_Matrix(linear_svc, X_test)

def SVM_Non_Linear(X_train, Y_train, X_test):
    """Fit an RBF-kernel support vector classifier (seed 0) and score it on ``X_test``.

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.svm import SVC

    rbf_svc = SVC(kernel='rbf', random_state=0)
    rbf_svc.fit(X_train, Y_train)

    # Evaluation (prediction + metrics) is delegated; Y_test comes from module scope there.
    return Confusion_Matrix(rbf_svc, X_test)

def Naive_Bayes(X_train, Y_train, X_test):
    """Fit a Gaussian naive Bayes classifier and score it on ``X_test``.

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.naive_bayes import GaussianNB

    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X_train, Y_train)

    # Evaluation (prediction + metrics) is delegated; Y_test comes from module scope there.
    return Confusion_Matrix(gaussian_nb, X_test)

def KNN(X_train, Y_train, X_test):
    """Fit a k-nearest-neighbours classifier and score it on ``X_test``.

    Uses 5 neighbours and the Minkowski metric with ``p=2`` (Euclidean distance).

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    knn_model.fit(X_train, Y_train)

    # Evaluation (prediction + metrics) is delegated; Y_test comes from module scope there.
    return Confusion_Matrix(knn_model, X_test)

def DecisionTree(X_train, Y_train, X_test):
    """Fit an entropy-criterion decision tree (seed 0) and score it on ``X_test``.

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    tree_model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    tree_model.fit(X_train, Y_train)

    # Evaluation (prediction + metrics) is delegated; Y_test comes from module scope there.
    return Confusion_Matrix(tree_model, X_test)

def RandomForest(X_train, Y_train, X_test):
    """Fit a 10-tree, entropy-criterion random forest (seed 0) and score it on ``X_test``.

    Returns the tuple produced by ``Confusion_Matrix``:
    ``(classifier, X_test, Y_test, ConfusionMatrix, ClassificationReport, AccuracyScore)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest_model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    forest_model.fit(X_train, Y_train)

    # Evaluation (prediction + metrics) is delegated; Y_test comes from module scope there.
    return Confusion_Matrix(forest_model, X_test)

def PCA_Classification(Number_Of_Components, accuracy_LogisticRegression, accuracy_SVM_Linear, accuracy_SVM_NonLinear,
                       accuracy_KNN, accuracy_NaiveBayes, accuracy_DecisionTree, accuracy_RandomForest):
    """Build the one-row accuracy summary table for a PCA run.

    Parameters
    ----------
    Number_Of_Components : label of the single row (the PCA component count).
    accuracy_* : indexable sequences of accuracy scores, one per model; the
        entry at position 0 of each sequence fills the row.

    Returns
    -------
    pandas.DataFrame
        Index named 'Number of PCA Components' holding ``Number_Of_Components``;
        one column per model, in the fixed order listed below.
    """
    # Column label -> score sequence; dict order fixes the column order.
    scores_by_model = {
        'Logistic Regression': accuracy_LogisticRegression,
        'SVM Linear': accuracy_SVM_Linear,
        'SVM Non Linear': accuracy_SVM_NonLinear,
        'KNN': accuracy_KNN,
        'Naive Bayes': accuracy_NaiveBayes,
        'Decision Tree': accuracy_DecisionTree,
        'Random Forest': accuracy_RandomForest,
    }

    summary = pd.DataFrame(index=[Number_Of_Components], columns=list(scores_by_model))
    summary.index.name = 'Number of PCA Components'

    # Row position selects the entry in each score sequence; row label addresses the cell.
    for row_position, row_label in enumerate(summary.index):
        for model_name, scores in scores_by_model.items():
            summary.loc[row_label, model_name] = scores[row_position]
    return summary

In [6]:
#✅ 6. One fresh, independent accuracy list per model.
# Step 10 appends to these, so this cell resets them when it is re-run.
(
    accuracy_LogisticRegression,
    accuracy_SVM_Linear,
    accuracy_SVM_NonLinear,
    accuracy_KNN,
    accuracy_NaiveBayes,
    accuracy_DecisionTree,
    accuracy_RandomForest,
) = ([] for _ in range(7))

In [7]:
#✅ 7. First split: hold out a test part of the encoded data and standardise both parts.
#   X_train / X_test from this call are the inputs to PCA_Features_Classification in step 8.
X_train, X_test, Y_train, Y_test = train_test_split_and_StandardScaler(indep_X=indep_X, dep_Y=dep_Y)

In [8]:
#✅ 8. Project the standardised training features onto the leading principal components.
Number_Of_Components = 6

# Only the projected training matrix is captured from the helper here.
PCA_X_train = PCA_Features_Classification(
    X_train=X_train,
    X_test=X_test,
    Number_Of_Components=Number_Of_Components,
)
PCA_X_train

array([[ 0.25128676,  0.37856611,  1.04857645,  0.49305423,  0.73411699,
        -0.86044099],
       [ 1.33765785,  0.50472567, -0.76069328, -0.41795485,  0.50012094,
        -0.05512791],
       [ 2.57133641,  0.22793673,  0.16633715,  0.06674013, -0.83886036,
        -0.08568646],
       ...,
       [ 0.44615013, -0.46194954, -1.40340525, -0.77930438,  0.16864546,
         0.0521787 ],
       [ 2.70323719,  0.01339849, -0.23856626,  0.35622769,  0.2581307 ,
        -0.04835053],
       [ 2.70561725, -0.13090682,  0.12676888,  0.42533048,  0.0144668 ,
         0.09058553]])

In [9]:
#✅ 9. Second split: the PCA-projected *training* rows are split (and standardised) again
#   to obtain the train/test parts used by the models in step 10.
# NOTE(review): this rebinds X_test / Y_test, so the hold-out produced in step 7 is never
#   scored; the "test" rows used from here on were part of the data the PCA was fitted on.
# NOTE(review): Y_train is both an input and an output of this line, so the cell cannot be
#   re-run on its own — re-run from step 7 instead.
X_train, X_test, Y_train, Y_test = train_test_split_and_StandardScaler(indep_X=PCA_X_train, dep_Y=Y_train)

In [10]:
#✅ 10. Fit every model on the step-9 split and record its accuracy in the matching list.

# (training function, list that collects its accuracy) — evaluated in this order.
model_runs = (
    (Logistic_Regression, accuracy_LogisticRegression),
    (SVM_Linear, accuracy_SVM_Linear),
    (SVM_Non_Linear, accuracy_SVM_NonLinear),
    (KNN, accuracy_KNN),
    (Naive_Bayes, accuracy_NaiveBayes),
    (DecisionTree, accuracy_DecisionTree),
    (RandomForest, accuracy_RandomForest),
)

for fit_and_score, accuracy_log in model_runs:
    # The result names are rebound on every pass; after the loop they hold the
    # values of the last model in the sequence (RandomForest).
    classifier, X_test, Y_test, confusion_matrix, classification_report, accuracy_score = fit_and_score(X_train, Y_train, X_test)
    accuracy_log.append(accuracy_score)

In [11]:
#✅ 11. Assemble the accuracy summary table (one row for the current PCA component count).
result = PCA_Classification(
    Number_Of_Components=Number_Of_Components,
    accuracy_LogisticRegression=accuracy_LogisticRegression,
    accuracy_SVM_Linear=accuracy_SVM_Linear,
    accuracy_SVM_NonLinear=accuracy_SVM_NonLinear,
    accuracy_KNN=accuracy_KNN,
    accuracy_NaiveBayes=accuracy_NaiveBayes,
    accuracy_DecisionTree=accuracy_DecisionTree,
    accuracy_RandomForest=accuracy_RandomForest,
)

In [12]:
#✅ 12. Display the accuracy table: a single row for the 6-component PCA run, one column per model.
#   (These are 6 principal components, not 6 selected original features.)
result

Unnamed: 0_level_0,Logistic Regression,SVM Linear,SVM Non Linear,KNN,Naive Bayes,Decision Tree,Random Forest
Number of PCA Components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,0.946667,0.973333,0.986667,0.946667,0.973333,0.973333,0.973333
