In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, RFE

import pickle
import time 

In [2]:
# Creating Function(s) :

def SelectKBest_Regression(indep_X,dep_Y,n):
    """Reduce ``indep_X`` to its ``n`` best features according to the chi-square test.

    Parameters
    ----------
    indep_X : feature matrix passed to ``SelectKBest.fit`` and ``SelectKBest.transform``.
    dep_Y : target values the chi-square scores are computed against.
    n : number of features to keep (forwarded as ``k`` to ``SelectKBest``).

    Returns
    -------
    The output of ``SelectKBest.transform(indep_X)``: ``indep_X`` restricted to
    the selected columns.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    print(f"Selecting top {n} features using Chi-Square test")

    # The scores are learned from, and then applied to, the same matrix.
    selector.fit(indep_X, dep_Y)
    return selector.transform(indep_X)

def train_test_split_and_StandardScaler(indep_X,dep_Y):
    """Split the data 75 % / 25 % into train and test parts and standardise the features.

    Parameters
    ----------
    indep_X : feature matrix to split and scale.
    dep_Y : target values, split alongside ``indep_X`` (not scaled).

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` where both feature parts have
        been transformed by a ``StandardScaler`` fitted on the training part.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    scaler = StandardScaler()
    # The scaler is fitted on the training part only; the test part is
    # transformed with those already-fitted statistics.
    return scaler.fit_transform(X_train), scaler.transform(X_test), Y_train, Y_test
    
def R2_Prediction(regressor,X_test,Y_test):
    """Score a fitted model on the test data with the R2 metric.

    Parameters
    ----------
    regressor : fitted model exposing ``predict``.
    X_test : features handed to ``regressor.predict``.
    Y_test : true target values the predictions are compared with.

    Returns
    -------
    The value of ``sklearn.metrics.r2_score(Y_test, predictions)``.
    """
    predictions = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    return r2_score(Y_test, predictions)

def LinearRegression(X_train,Y_train,X_test,Y_test=None):
    """Fit a linear regression model and return its R2 score on the test data.

    Parameters
    ----------
    X_train, Y_train : training features and targets used to fit the model.
    X_test : test features to predict on.
    Y_test : true test targets. Optional for backward compatibility: when
        omitted, the notebook-level global ``Y_test`` is used, which is what
        this function implicitly relied on before the parameter existed.
        Prefer passing it explicitly so the function does not depend on
        hidden kernel state.

    Returns
    -------
    The R2 score computed by ``R2_Prediction``.

    Raises
    ------
    NameError
        If ``Y_test`` is neither passed nor defined as a global.
    """
    # Aliased so the scikit-learn class is not confused with this function,
    # which carries the same name.
    from sklearn.linear_model import LinearRegression as SklearnLinearRegression

    if Y_test is None:
        Y_test = globals().get("Y_test")
        if Y_test is None:
            raise NameError("Y_test was not passed and no global 'Y_test' is defined")

    regressor = SklearnLinearRegression()
    regressor.fit(X_train, Y_train)

    # Delegate scoring to the shared helper.
    return R2_Prediction(regressor, X_test, Y_test)

def SVM_Linear(X_train,Y_train,X_test,Y_test=None):
    """Fit a support vector regressor with a linear kernel and return its R2 score.

    Parameters
    ----------
    X_train, Y_train : training features and targets used to fit the model.
    X_test : test features to predict on.
    Y_test : true test targets. Optional for backward compatibility: when
        omitted, the notebook-level global ``Y_test`` is used, which is what
        this function implicitly relied on before the parameter existed.

    Returns
    -------
    The R2 score computed by ``R2_Prediction``.

    Raises
    ------
    NameError
        If ``Y_test`` is neither passed nor defined as a global.
    """
    from sklearn.svm import SVR

    if Y_test is None:
        Y_test = globals().get("Y_test")
        if Y_test is None:
            raise NameError("Y_test was not passed and no global 'Y_test' is defined")

    regressor = SVR(kernel='linear')
    regressor.fit(X_train, Y_train)

    # Delegate scoring to the shared helper.
    return R2_Prediction(regressor, X_test, Y_test)

def SVM_Non_Linear(X_train,Y_train,X_test,Y_test=None):
    """Fit a support vector regressor with an RBF kernel and return its R2 score.

    Parameters
    ----------
    X_train, Y_train : training features and targets used to fit the model.
    X_test : test features to predict on.
    Y_test : true test targets. Optional for backward compatibility: when
        omitted, the notebook-level global ``Y_test`` is used, which is what
        this function implicitly relied on before the parameter existed.

    Returns
    -------
    The R2 score computed by ``R2_Prediction``.

    Raises
    ------
    NameError
        If ``Y_test`` is neither passed nor defined as a global.
    """
    from sklearn.svm import SVR

    if Y_test is None:
        Y_test = globals().get("Y_test")
        if Y_test is None:
            raise NameError("Y_test was not passed and no global 'Y_test' is defined")

    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, Y_train)

    # Delegate scoring to the shared helper.
    return R2_Prediction(regressor, X_test, Y_test)

def DecisionTree(X_train,Y_train,X_test,Y_test=None):
    """Fit a decision tree regressor (``random_state=0``) and return its R2 score.

    Parameters
    ----------
    X_train, Y_train : training features and targets used to fit the model.
    X_test : test features to predict on.
    Y_test : true test targets. Optional for backward compatibility: when
        omitted, the notebook-level global ``Y_test`` is used, which is what
        this function implicitly relied on before the parameter existed.

    Returns
    -------
    The R2 score computed by ``R2_Prediction``.

    Raises
    ------
    NameError
        If ``Y_test`` is neither passed nor defined as a global.
    """
    from sklearn.tree import DecisionTreeRegressor

    if Y_test is None:
        Y_test = globals().get("Y_test")
        if Y_test is None:
            raise NameError("Y_test was not passed and no global 'Y_test' is defined")

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, Y_train)

    # Delegate scoring to the shared helper.
    return R2_Prediction(regressor, X_test, Y_test)

def RandomForest(X_train,Y_train,X_test,Y_test=None):
    """Fit a 10-tree random forest regressor (``random_state=0``) and return its R2 score.

    Parameters
    ----------
    X_train, Y_train : training features and targets used to fit the model.
    X_test : test features to predict on.
    Y_test : true test targets. Optional for backward compatibility: when
        omitted, the notebook-level global ``Y_test`` is used, which is what
        this function implicitly relied on before the parameter existed.

    Returns
    -------
    The R2 score computed by ``R2_Prediction``.

    Raises
    ------
    NameError
        If ``Y_test`` is neither passed nor defined as a global.
    """
    from sklearn.ensemble import RandomForestRegressor

    if Y_test is None:
        Y_test = globals().get("Y_test")
        if Y_test is None:
            raise NameError("Y_test was not passed and no global 'Y_test' is defined")

    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, Y_train)

    # Delegate scoring to the shared helper.
    return R2_Prediction(regressor, X_test, Y_test)

def SelectK_Regression(R2_LinearRegression, R2_SVM_Linear, R2_SVM_NonLinear, R2_DecisionTree, R2_RandomForest):
    """Collect the R2 scores of the five models into a one-row summary table.

    Parameters
    ----------
    R2_LinearRegression, R2_SVM_Linear, R2_SVM_NonLinear, R2_DecisionTree, R2_RandomForest :
        indexable sequences of scores, one sequence per model. The entry at
        position ``i`` fills row ``i`` of the table; the table has the single
        row ``'ChiSquare'``, so only position 0 of each sequence is read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``['ChiSquare']`` with one column per model.

    Raises
    ------
    IndexError
        If any of the score sequences is empty.
    """
    scores_by_model = {
        'Linear Regression': R2_LinearRegression,
        'SVM Linear': R2_SVM_Linear,
        'SVM Non Linear': R2_SVM_NonLinear,
        'Decision Tree': R2_DecisionTree,
        'Random Forest': R2_RandomForest,
    }
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_model))

    # enumerate() yields the row position (used to pick the score) together
    # with the row label (used to address the cell).
    for row_position, row_label in enumerate(dataframe.index):
        for model_name, scores in scores_by_model.items():
            # A single .loc assignment writes into the frame itself. The
            # chained form frame[col][row] = value may write to a temporary
            # copy and is a silent no-op under pandas Copy-on-Write.
            dataframe.loc[row_label, model_name] = scores[row_position]
    return dataframe

In [3]:
# 1. Load the pre-processed CKD dataset.
#    No CSV column is used as the index (read_csv's default), so the frame
#    gets pandas' default integer index.
dataset = pd.read_csv("Pre-processed_CKD_Data.csv")
print(dataset.shape)
dataset.head()

(399, 25)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [4]:
# 2./3. One-hot encode the nominal (non-numeric) columns of the loaded dataset.
#       get_dummies returns a new frame, so `dataset` itself stays untouched;
#       drop_first=True drops the first level of every encoded column.
dataset2 = pd.get_dummies(dataset, drop_first=True)
print(dataset2.shape)
dataset2.head()

(399, 28)


Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,1,0,0,0,0,0,1,0,0,1
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,1,0,0,0,0,0,1,0,0,1
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,1,0,0,0,0,0,1,0,1,1
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,1,0,0,0,0,0,1,0,0,1


In [15]:
# 4. Separate the independent variables (features) from the dependent variable (target).

# The column is named via the `columns=` keyword: passing the axis positionally
# (`drop(label, 1)`) is deprecated and rejected by pandas 2.0 and later.
indep_X = dataset2.drop(columns='classification_yes')
print(indep_X.shape)

dep_Y = dataset2['classification_yes']
print(dep_Y.shape)

(399, 27)
(399,)


In [40]:
# 5. Reduce the feature matrix with SelectKBest_Regression(indep_X, dep_Y, n),
#    which returns the transformed array of selected features.
#    n = 7 here, i.e. the 7 best-scoring features are kept as model inputs.
k_Best = SelectKBest_Regression(indep_X, dep_Y, n=7)

# Start from five fresh, independent score lists (one per model) so that
# re-running this cell does not accumulate scores from earlier runs.
R2_LinearRegression, R2_SVM_Linear, R2_SVM_NonLinear, R2_DecisionTree, R2_RandomForest = (
    [], [], [], [], []
)

k_Best

Selecting top 7 features using Chi-Square test


array([[3.00000000e+00, 1.48112676e+02, 5.74821053e+01, ...,
        1.25181556e+01, 3.88689024e+01, 8.40819113e+03],
       [2.00000000e+00, 1.48112676e+02, 2.20000000e+01, ...,
        1.07000000e+01, 3.40000000e+01, 1.23000000e+04],
       [1.00000000e+00, 9.90000000e+01, 2.30000000e+01, ...,
        1.20000000e+01, 3.40000000e+01, 8.40819113e+03],
       ...,
       [3.00000000e+00, 1.10000000e+02, 1.15000000e+02, ...,
        9.10000000e+00, 2.60000000e+01, 9.20000000e+03],
       [0.00000000e+00, 2.07000000e+02, 8.00000000e+01, ...,
        8.50000000e+00, 3.88689024e+01, 8.40819113e+03],
       [0.00000000e+00, 1.00000000e+02, 4.90000000e+01, ...,
        1.63000000e+01, 5.30000000e+01, 8.50000000e+03]])

In [41]:
# 6. Split and scale the data with train_test_split_and_StandardScaler, which
#    returns X_train, X_test, Y_train, Y_test.
#    The reduced matrix k_Best is passed instead of the full indep_X, so the
#    models only see the selected features.
# NOTE(review): k_Best was selected using all rows, i.e. before this split;
#    confirm that selecting features on data that includes the test rows is intended.
X_train, X_test, Y_train, Y_test = train_test_split_and_StandardScaler(k_Best, dep_Y)

# 7.-11. Fit every model in turn and record its R2 score in the matching list.
#        Each helper takes (X_train, Y_train, X_test) and returns the R2 score.
for fit_and_score, score_list in (
    (LinearRegression, R2_LinearRegression),
    (SVM_Linear, R2_SVM_Linear),
    (SVM_Non_Linear, R2_SVM_NonLinear),
    (DecisionTree, R2_DecisionTree),
    (RandomForest, R2_RandomForest),
):
    R2_score = fit_and_score(X_train, Y_train, X_test)
    score_list.append(R2_score)

# 12. Assemble the one-row summary table with SelectK_Regression, which returns a DataFrame.
result = SelectK_Regression(R2_LinearRegression, R2_SVM_Linear, R2_SVM_NonLinear, R2_DecisionTree, R2_RandomForest)

  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


In [33]:
# R2 scores of the five models when the top 10 chi-square features are selected.
# NOTE(review): this cell's execution count (33) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=10 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.644735,0.597726,0.919312,0.869792,0.922743


In [36]:
# R2 scores of the five models when the top 9 chi-square features are selected.
# NOTE(review): this cell's execution count (36) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=9 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.646123,0.602462,0.901819,0.869792,0.919705


In [39]:
# R2 scores of the five models when the top 8 chi-square features are selected.
# NOTE(review): this cell's execution count (39) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=8 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.646457,0.612199,0.891274,0.869792,0.898872


In [42]:
# R2 scores of the five models when the top 7 chi-square features are selected.
# This cell's execution count (42) directly follows the selection cell (40, n=7)
# and the model-fitting cell (41), so it matches the code as shown.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.657035,0.641906,0.893007,0.826389,0.916233


In [18]:
# R2 scores of the five models when the top 6 chi-square features are selected.
# NOTE(review): this cell's execution count (18) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=6 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.599041,0.586446,0.838962,0.869792,0.897569


In [21]:
# R2 scores of the five models when the top 5 chi-square features are selected.
# NOTE(review): this cell's execution count (21) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=5 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.551985,0.545395,0.749654,0.696181,0.836806


In [24]:
# R2 scores of the five models when the top 4 chi-square features are selected.
# NOTE(review): this cell's execution count (24) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=4 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.304963,0.256858,0.430795,0.479167,0.599392


In [27]:
# R2 scores of the five models when the top 3 chi-square features are selected.
# NOTE(review): this cell's execution count (27) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=3 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.287968,0.255063,0.3335,0.262153,0.528212


In [30]:
# R2 scores of the five models when the top 2 chi-square features are selected.
# NOTE(review): this cell's execution count (30) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=2 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.218813,-0.11115,0.287555,-0.030816,0.155949


In [13]:
# R2 scores of the five models when only the top 1 chi-square feature is selected.
# NOTE(review): this cell's execution count (13) is lower than that of the selection
# cell shown with n=7 (40), so the output presumably comes from an earlier run with
# n=1 — confirm; a fresh Run All with n=7 would not reproduce it.
result

Unnamed: 0,Linear Regression,SVM Linear,SVM Non Linear,Decision Tree,Random Forest
ChiSquare,0.0616371,-0.293404,-0.105923,-0.169763,-0.0920854
