In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
def selectkbest(indep_X,dep_Y,n):
    """Reduce ``indep_X`` to the ``n`` columns that score best under chi-squared.

    Parameters
    ----------
    indep_X : feature matrix the selector is fitted on and then applied to.
    dep_Y : target labels used to score each feature.
    n : number of features to keep (forwarded to ``SelectKBest`` as ``k``).

    Returns
    -------
    The result of transforming ``indep_X`` with the fitted selector, i.e. only
    the selected feature columns.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    selector.fit(indep_X, dep_Y)
    return selector.transform(indep_X)

In [None]:
'''
SelectKBest(score_func=chi2, k=n): This initializes the SelectKBest feature selector. 
It uses the chi-squared (chi2) statistic to select the n best features.

fit1 = test.fit(indep_X, dep_Y): This fits the SelectKBest model to the data (indep_X and dep_Y). It learns the scores of the features.

selectk_features = fit1.transform(indep_X): This transforms indep_X to include only the top n features selected by SelectKBest.

return selectk_features: This returns the transformed dataset containing only the selected features.
'''

In [3]:
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 and standardise the features.

    The split uses ``test_size=0.25`` and ``random_state=0``. The
    ``StandardScaler`` is fitted on the training features only; the test
    features are transformed with that same fitted scaler.

    Parameters
    ----------
    indep_X : feature matrix.
    dep_Y : target labels aligned with ``indep_X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both feature sets have
        been scaled and the label sets are returned as produced by the split.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    scaler = StandardScaler()
    scaler.fit(features_train)
    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        labels_train,
        labels_test,
    )

In [None]:
'''
Splitting the Dataset:

train_test_split: This function splits indep_X and dep_Y into training and testing sets.
test_size=0.25: This parameter specifies that 25% of the data should be used for testing, and 75% for training.
random_state=0: This ensures that the split is reproducible. The same split will occur every time you run the function with this seed.
Standardizing the Features:

StandardScaler(): This scales the features so that they have a mean of 0 and a standard deviation of 1.
fit_transform(X_train): This method computes the mean and standard deviation of the training set and then scales the training data.
transform(X_test): This scales the test data using the same mean and standard deviation as calculated from the training set,
ensuring that the test data is scaled consistently with the training data.
Returning the Results:

The function returns the scaled training and testing datasets (X_train, X_test) along with their corresponding labels (y_train, y_test).
'''

In [4]:
def cm_prediction(classifier,X_test,y_true=None):
    """Evaluate a fitted classifier on the test features.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_test : test features passed to ``classifier.predict``.
    y_true : true labels for ``X_test``. Optional for backward compatibility:
        when omitted, the notebook-level variable ``y_test`` is used, which is
        what this function always did before the parameter existed.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` where
        ``Accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report``, ``cm`` from ``confusion_matrix`` and
        ``y_test`` is the set of true labels that was used.

    Raises
    ------
    NameError
        If ``y_true`` is not given and no notebook-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_true is None:
        # Legacy path: existing callers pass only (classifier, X_test) and rely
        # on the global created by the split cell. Prefer passing y_true.
        try:
            y_true = y_test
        except NameError:
            raise NameError(
                "cm_prediction needs the true labels: pass y_true, or run the "
                "split_scalar cell first so that a notebook-level y_test exists"
            ) from None

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_true, y_pred)
    Accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return classifier, Accuracy, report, X_test, y_true, cm

In [None]:
'''
Predictions:

y_pred = classifier.predict(X_test): The classifier makes predictions on the test set (X_test).
Confusion Matrix:

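cm = confusion_matrix(y_test, y_pred): A confusion matrix is created to summarize the performance 
of the classification model by comparing the actual labels (y_test) with the predicted labels (y_pred).
Note: the model functions that call cm_prediction do not pass y_test; it is the notebook-level variable
created by the split_scalar cell, so that cell must be run before any model is evaluated.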
Accuracy:

Accuracy = accuracy_score(y_test, y_pred): Calculates the accuracy, which is the ratio of 
correctly predicted instances to the total instances.
Classification Report:

report = classification_report(y_test, y_pred): Generates a report showing precision, recall, F1-score, and support for each class.
Returning Results:

The function returns the classifier, accuracy, classification report, test data, actual labels, and the confusion matrix.
'''

In [5]:
def logistic(X_train,y_train,X_test):
    """Fit a logistic regression model and evaluate it on ``X_test``.

    The model is created with ``random_state=0``, fitted on
    ``(X_train, y_train)`` and handed to ``cm_prediction``, which reads the
    true test labels from the notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the Logistic Regression Model:

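LogisticRegression(random_state=0): This initializes the logistic regression model. 
random_state=0 fixes the seed for the solvers that shuffle the data ('liblinear', 'sag', 'saga');
with scikit-learn's default 'lbfgs' solver, which is what this call uses, it has no effect on the result.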
classifier.fit(X_train, y_train): The logistic regression model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training the model, 
it is evaluated on the test data using the cm_prediction function. 
This function returns the classifier, accuracy, classification report, test data, actual labels, and the confusion matrix.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and confusion matrix.
'''

In [6]:
def svm_linear(X_train,y_train,X_test):
    """Fit a linear-kernel support vector classifier and evaluate it on ``X_test``.

    The model is ``SVC(kernel='linear', random_state=0)``, fitted on
    ``(X_train, y_train)`` and handed to ``cm_prediction``, which reads the
    true test labels from the notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the SVM Model:

SVC(kernel='linear', random_state=0): This initializes the SVM classifier with a linear kernel.
random_state=0 only seeds the data shuffling used for probability estimates; because probability=True is not set here,
it does not change the fitted model.
classifier.fit(X_train, y_train): The SVM model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training, the model is evaluated using the cm_prediction function,
which returns the classifier, accuracy, classification report, test data, actual labels, and the confusion matrix.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and confusion matrix.
'''

In [7]:
def svm_NL(X_train,y_train,X_test):
    """Fit an RBF-kernel (non-linear) support vector classifier and evaluate it.

    The model is ``SVC(kernel='rbf', random_state=0)``, fitted on
    ``(X_train, y_train)`` and handed to ``cm_prediction``, which reads the
    true test labels from the notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='rbf', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the SVM Model:

SVC(kernel='rbf', random_state=0): This initializes the SVM classifier with a Radial Basis Function (RBF) kernel,
which is commonly used for non-linear classification tasks. random_state=0 only seeds the data shuffling used for
probability estimates; because probability=True is not set here, it does not change the fitted model.
classifier.fit(X_train, y_train): The SVM model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training, the model is evaluated on the test data 
using the cm_prediction function, which calculates accuracy, creates a confusion matrix, and generates a classification report.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and the confusion matrix.
'''

In [8]:
def Navie(X_train,y_train,X_test):
    """Fit a Gaussian Naive Bayes classifier and evaluate it on ``X_test``.

    The model is ``GaussianNB()`` with default settings, fitted on
    ``(X_train, y_train)`` and handed to ``cm_prediction``, which reads the
    true test labels from the notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.naive_bayes import GaussianNB

    model = GaussianNB()
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the Naive Bayes Model:

GaussianNB(): This initializes the Gaussian Naive Bayes classifier.
classifier.fit(X_train, y_train): The model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training, the model is evaluated on 
the test data using the cm_prediction function, which calculates accuracy, 
creates a confusion matrix, and generates a classification report.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and confusion matrix.
'''

In [9]:
def knn(X_train,y_train,X_test):
    """Fit a k-nearest-neighbours classifier and evaluate it on ``X_test``.

    The model uses 5 neighbours and the Minkowski metric with ``p=2``
    (Euclidean distance). It is fitted on ``(X_train, y_train)`` and handed to
    ``cm_prediction``, which reads the true test labels from the
    notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the KNN Model:

KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2): 
Initializes the KNN classifier with 5 neighbors. The Minkowski distance with p=2 is equivalent to the Euclidean distance.
classifier.fit(X_train, y_train): The model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training, the model is evaluated 
on the test data using the cm_prediction function, which calculates accuracy, creates a confusion matrix, and generates a classification report.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and confusion matrix.
'''

In [10]:
def Decision(X_train,y_train,X_test):
    """Fit a decision tree classifier and evaluate it on ``X_test``.

    The model is ``DecisionTreeClassifier(criterion='entropy',
    random_state=0)``, fitted on ``(X_train, y_train)`` and handed to
    ``cm_prediction``, which reads the true test labels from the
    notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the Decision Tree Model:

DecisionTreeClassifier(criterion='entropy', random_state=0): Initializes the Decision Tree classifier
with the entropy criterion for measuring the quality of splits.
classifier.fit(X_train, y_train): The model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training, the model is evaluated on the test data 
using the cm_prediction function, which calculates accuracy, creates a confusion matrix, and generates a classification report.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and confusion matrix.
'''

In [11]:
def random(X_train,y_train,X_test):
    """Fit a random forest classifier and evaluate it on ``X_test``.

    The model is ``RandomForestClassifier(n_estimators=10,
    criterion='entropy', random_state=0)``, fitted on ``(X_train, y_train)``
    and handed to ``cm_prediction``, which reads the true test labels from the
    notebook-level ``y_test``.

    NOTE(review): this function's name would shadow the standard-library
    ``random`` module if that module were ever imported in this notebook.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` exactly as
        returned by ``cm_prediction``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [None]:
'''
Training the Random Forest Model:

RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0): 
Initializes the Random Forest classifier with 10 decision trees (n_estimators)
and uses the entropy criterion to measure the quality of splits.
classifier.fit(X_train, y_train): The model is trained on the training data (X_train, y_train).
Evaluating the Model:

cm_prediction(classifier, X_test): After training, the model is evaluated 
on the test data using the cm_prediction function, which calculates accuracy, 
creates a confusion matrix, and generates a classification report.
Returning the Results:

The function returns the trained classifier, accuracy, classification report, test data, actual labels, and confusion matrix.
'''

In [12]:
def selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf): 
    """Tabulate the per-model accuracies for the chi-squared feature selection run.

    Each argument is a sequence of accuracies for one model; element ``i`` of
    every sequence fills row ``i`` of the table. The table has a single row,
    ``'ChiSquare'``, so only element 0 of each sequence is read.

    Parameters
    ----------
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf :
        Accuracy sequences for the 'Logistic', 'SVMl', 'SVMnl', 'KNN',
        'Navie', 'Decision' and 'Random' columns respectively.

    Returns
    -------
    pandas.DataFrame
        One row (``'ChiSquare'``) and one column per model, in the order
        listed above.

    Raises
    ------
    IndexError
        If any accuracy sequence is empty.
    """
    scores_by_model = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
    }
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_model))
    for number, idex in enumerate(dataframe.index):
        for model_name, scores in scores_by_model.items():
            # Single .loc assignment instead of dataframe[col][row] = value:
            # chained assignment writes to a temporary under pandas
            # Copy-on-Write and would leave the table unfilled.
            dataframe.loc[idex, model_name] = scores[number]
    return dataframe

In [None]:
'''
Creating the DataFrame:

pd.DataFrame(index=['ChiSquare'], columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random']): 
Initializes a DataFrame with a single row labeled 'ChiSquare' and columns for each classification model.
Populating the DataFrame:

Loop through dataframe.index: Since there is only one index label ('ChiSquare'), the loop runs once, with number = 0.
For each model, the entry at position number of its accuracy list is written into that model's column of the
'ChiSquare' row; for example, acclog[number] goes into the 'Logistic' column.
Returning the DataFrame:

return dataframe: Returns the populated DataFrame.
'''

In [13]:
# Load prep.csv with pandas' default integer row index, then display it.
dataset1 = pd.read_csv("prep.csv")
dataset1

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [15]:
# One-hot encode the non-numeric columns of dataset1. drop_first=True drops the
# first level of each encoded column, leaving indicators such as `htn_yes`.
df2 = pd.get_dummies(dataset1, drop_first=True)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [17]:
# Target is the encoded `classification_yes` column; every other column is a predictor.
dep_Y = df2['classification_yes']
indep_X = df2.drop(columns='classification_yes')

In [30]:
# Keep the 10 best features by chi-squared score.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split below, so the test rows influence which features are
# chosen. Consider selecting on the training split only.
kbest = selectkbest(indep_X, dep_Y, 10)

# One accuracy list per model; selectk_Classification reads element 0 of each.
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# Train and evaluate every model on the same split. Each trainer returns
# (classifier, Accuracy, report, X_test, y_test, cm); only Accuracy is kept,
# the other names are left holding the values from the last model, as before.
for trainer, scores in (
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
):
    classifier, Accuracy, report, X_test, y_test, cm = trainer(X_train, y_train, X_test)
    scores.append(Accuracy)

result = selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.99,0.98,1.0,0.99,0.92,0.96,0.97
