In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier   
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

Function Explanation
Model Initialization:

The function initializes four models: LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, and SVC (with a linear kernel).
These models are stored in a list rfemodellist.
RFE Application:

The function iterates over each model in the rfemodellist.
For each model, RFE is applied to select n features.
The selected features (i.e., the transformed data) are appended to the rfelist.
Return:

The function returns a list of transformed feature sets for each model.

In [24]:
def rfeFeature(indep_X, dep_Y, n):
    """Run recursive feature elimination once per base estimator.

    Four estimators are tried in this order: logistic regression, linear-kernel
    SVC, random forest, decision tree. Each one drives its own RFE selector,
    which is fitted on ``indep_X``/``dep_Y`` and then used to reduce
    ``indep_X`` to ``n`` columns.

    Parameters
    ----------
    indep_X : feature matrix handed to ``RFE.fit`` and ``RFE.transform``.
    dep_Y : target values handed to ``RFE.fit``.
    n : number of features each selector keeps (``n_features_to_select``).

    Returns
    -------
    list
        One transformed feature set per estimator, in the order listed above.
    """
    estimators = (
        LogisticRegression(solver='lbfgs'),
        SVC(kernel='linear', random_state=0),
        RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
        DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0),
    )

    reduced_sets = []
    for estimator in estimators:
        # Echo the estimator so the notebook output shows which one is running.
        print(estimator)
        selector = RFE(estimator=estimator, n_features_to_select=n)
        fitted_selector = selector.fit(indep_X, dep_Y)
        reduced_sets.append(fitted_selector.transform(indep_X))

    return reduced_sets

Function Explanation
Data Splitting:

The function uses train_test_split from sklearn.model_selection to split the input dataset indep_X and target variable dep_Y into training and testing sets.
The test_size=0.25 parameter means that 25% of the data will be used for testing, and the remaining 75% will be used for training.
The random_state=0 ensures that the split is reproducible.
Feature Scaling:

After splitting, the StandardScaler is used to normalize the features by removing the mean and scaling to unit variance.
sc.fit_transform(X_train) scales the training data, and sc.transform(X_test) scales the test data using the same parameters learned from the training data.
Return Values:

The function returns the scaled training and testing sets (X_train, X_test) along with the corresponding target variables (y_train, y_test).

In [3]:
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 and standardize the features.

    The scaler is fitted on the training portion only and then applied to the
    test portion.

    Parameters
    ----------
    indep_X : feature matrix to split.
    dep_Y : target values to split alongside ``indep_X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature sets are
        the scaled outputs and the two label sets are returned unscaled.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(features_train)
    scaled_test = scaler.transform(features_test)

    return scaled_train, scaled_test, labels_train, labels_test

Prediction:

y_pred = classifier.predict(X_test) uses the trained classifier to make predictions on the test set.
Confusion Matrix:
cm = confusion_matrix(y_test, y_pred) generates the confusion matrix, which shows the number of true positive, true negative, false positive, and false negative predictions.

Accuracy:
accuracy = accuracy_score(y_test, y_pred) calculates the accuracy of the model, which is the ratio of correctly predicted observations to the total observations.

Classification Report:
report = classification_report(y_test, y_pred) provides detailed metrics such as precision, recall, F1-score, and support for each class.

Return:
The function returns the classifier, accuracy, classification report, test features, test labels, and confusion matrix.

In [4]:
def cm_prediction(classifier, X_test, y_test=None):
    """Predict on ``X_test`` and compute evaluation metrics against the true labels.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_test : features passed to ``classifier.predict``.
    y_test : true labels for ``X_test``. Optional for backward compatibility:
        when omitted, the module-level variable named ``y_test`` is used, which
        is how the two-argument callers in this notebook have always worked.
        Passing it explicitly removes that dependency on notebook state.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)``: the classifier
        as given, the accuracy score, the text classification report, the test
        features and labels used, and the confusion matrix.

    Raises
    ------
    NameError
        If ``y_test`` is not supplied and no module-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_test is None:
        # Legacy path: older call sites pass only (classifier, X_test) and rely
        # on a notebook-level y_test being defined before the call.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError(
                "name 'y_test' is not defined; pass y_test to cm_prediction explicitly"
            ) from None

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return classifier, Accuracy, report, X_test, y_test, cm

Model Initialization and Training:

The LogisticRegression model is initialized with random_state=0 to ensure reproducibility.
The model is then trained using classifier.fit(X_train, y_train).
Prediction and Evaluation:
The cm_prediction function is called to make predictions on the X_test dataset and evaluate the model's performance. 
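Note that y_test is not passed to cm_prediction here: it is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test before this function runs.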

Return Values:
The function returns the trained classifier, accuracy, report, X_test, y_test, and the cm (confusion matrix).

In [5]:

def logistic(X_train,y_train,X_test):
    """Fit a logistic-regression classifier and evaluate it via ``cm_prediction``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here.
    """
    from sklearn.linear_model import LogisticRegression

    # Fit logistic regression on the training set.
    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

Model Initialization and Training:

The SVC model with a linear kernel is initialized. The random_state=0 ensures that the results are reproducible.
The model is trained using the fit method with X_train and y_train.

Prediction and Evaluation:
The cm_prediction function is called to predict the labels for X_test and evaluate the model's performance against y_test. Note that y_test is not passed as an argument: cm_prediction is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test.

Return Values:
The function returns the trained classifier, the accuracy score, the classification report, the test data (X_test, y_test), and the confusion matrix (cm).

In [6]:

def svm_linear(X_train,y_train,X_test):
    """Fit a linear-kernel SVC and evaluate it via ``cm_prediction``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here.
    """
    from sklearn.svm import SVC

    # Fit the linear support-vector classifier on the training set.
    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

Model Initialization and Training:
The SVC model with an rbf (Radial Basis Function) kernel is initialized. The random_state=0 ensures that the results are reproducible.
The model is trained using the fit method with X_train and y_train.

Prediction and Evaluation:
The cm_prediction function is called to predict the labels for X_test and evaluate the model's performance against y_test. Note that y_test is not passed as an argument: cm_prediction is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test.

Return Values:
The function returns the trained classifier, the accuracy score, the classification report, the test data (X_test, y_test), and the confusion matrix (cm).

In [7]:
def svm_NL(X_train,y_train,X_test):
    """Fit an RBF-kernel (non-linear) SVC and evaluate it via ``cm_prediction``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here.
    """
    from sklearn.svm import SVC

    # Fit the RBF support-vector classifier on the training set.
    model = SVC(kernel='rbf', random_state=0)
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

Model Initialization and Training:
The GaussianNB model is initialized, and the fit method is used to train the model on X_train and y_train.

Prediction and Evaluation:
The cm_prediction function is called to predict the labels for X_test and evaluate the model's performance against y_test. Note that y_test is not passed as an argument: cm_prediction is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test.

Return Values:
The function returns the trained classifier, the accuracy score, the classification report, the test data (X_test, y_test), and the confusion matrix (cm).

In [8]:
def Navie(X_train,y_train,X_test):
    """Fit a Gaussian naive Bayes classifier and evaluate it via ``cm_prediction``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here.
    """
    from sklearn.naive_bayes import GaussianNB

    # Fit Gaussian naive Bayes on the training set.
    model = GaussianNB()
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

Model Initialization and Training:
The KNeighborsClassifier is initialized with n_neighbors=5, using the Minkowski distance metric (with p=2, which is equivalent to the Euclidean distance).
The model is trained on the training data using fit(X_train, y_train).

Prediction and Evaluation:
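The cm_prediction function is called to predict the labels for X_test and evaluate the model's performance against y_test. Note that y_test is not passed as an argument: cm_prediction is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test.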

Return Values:
The function returns the trained classifier, the accuracy score, the classification report, the test data (X_test, y_test), and the confusion matrix (cm).

In [9]:
def knn(X_train,y_train,X_test):
    """Fit a 5-nearest-neighbours classifier and evaluate it via ``cm_prediction``.

    The classifier uses the Minkowski metric with ``p=2``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here.
    """
    from sklearn.neighbors import KNeighborsClassifier

    # Fit K-NN on the training set.
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

Model Initialization and Training:
The DecisionTreeClassifier is initialized with criterion='entropy', which means the tree is built using information gain to determine splits.
The model is trained with classifier.fit(X_train, y_train).

Prediction and Evaluation:
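The cm_prediction function is used to make predictions on X_test and evaluate the model's performance against y_test. Note that y_test is not passed as an argument: cm_prediction is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test.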

Return Values:
The function returns the trained classifier, the accuracy score, the classification report, the test data (X_test, y_test), and the confusion matrix (cm).

In [10]:
def Decision(X_train,y_train,X_test):
    """Fit an entropy-criterion decision tree and evaluate it via ``cm_prediction``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Fit the decision tree on the training set.
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

Model Initialization and Training:
The RandomForestClassifier is initialized with n_estimators=10, which means the forest will consist of 10 trees. 
The criterion='entropy' parameter specifies that the model uses information gain to build the trees.
The model is trained using classifier.fit(X_train, y_train).

Prediction and Evaluation:
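The cm_prediction function is used to predict labels for X_test and evaluate the model's performance against y_test. Note that y_test is not passed as an argument: cm_prediction is called with only the classifier and X_test, so the true labels must already be available as the notebook-level variable y_test.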

Return Values:
The function returns the trained classifier, the accuracy score, the classification report, the test data (X_test, y_test), and the confusion matrix (cm).

In [11]:
def random(X_train,y_train,X_test):
    """Fit a 10-tree, entropy-criterion random forest and evaluate it via ``cm_prediction``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : test features forwarded to ``cm_prediction``.

    Returns
    -------
    tuple
        The six values unpacked from ``cm_prediction``, in order:
        classifier, accuracy, report, X_test, y_test, confusion matrix.

    Notes
    -----
    Only the fitted model and ``X_test`` are forwarded to ``cm_prediction``;
    no labels are passed from here. The function name is the same as the
    standard-library ``random`` module, so importing that module under its
    default name in this notebook would rebind this function.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Fit the random forest on the training set.
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)

    fitted, score, summary, test_features, test_labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, test_features, test_labels, matrix

In [12]:

def rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf):
    """Arrange per-classifier accuracy lists into a selector-by-classifier table.

    Rows are the RFE base estimators ('Logistic', 'SVC', 'Random',
    'DecisionTree'); columns are the evaluated classifiers. Entry
    ``[row, column]`` is taken from position ``k`` of that column's accuracy
    list, where ``k`` is the row's position in the index.

    Parameters
    ----------
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf : sequences
        Accuracy values for the 'Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie',
        'Decision' and 'Random' columns respectively. Each must hold at least
        four entries; entries beyond the fourth are ignored.

    Returns
    -------
    pandas.DataFrame
        A 4x7 table of the supplied accuracies.

    Raises
    ------
    IndexError
        If any list-like input has fewer than four entries.
    """
    scores_by_column = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
    }

    rfedataframe = pd.DataFrame(
        index=['Logistic', 'SVC', 'Random', 'DecisionTree'],
        columns=list(scores_by_column),
    )

    for number, idex in enumerate(rfedataframe.index):
        for column, scores in scores_by_column.items():
            # Single .loc assignment instead of chained df[col][row] = value:
            # chained assignment writes to a temporary under pandas
            # Copy-on-Write and would leave the table full of NaN.
            rfedataframe.loc[idex, column] = scores[number]

    return rfedataframe

In [14]:
# Load the preprocessed data and one-hot encode categorical columns,
# dropping the first level of each so the dummies are not redundant.
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [16]:
# Feature matrix: every column of df2 except the target column.
indep_X = df2.drop(columns='classification_yes')
indep_X

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,True,False,False,False,False,False,True,False,False
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,True,False,False,False,False,False,True,False,False
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,True,False,False,False,False,False,True,False,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,True,False,False,False,False,False,True,False,False
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,True,False,False,True,True,False,True,False,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,True,False,False,True,True,False,False,False,False
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,True,False,False,True,True,False,True,False,True


In [17]:
# Target vector: the 'classification_yes' dummy column.
dep_Y = df2['classification_yes']
dep_Y

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [29]:
# Keep 3 features per RFE base estimator, then score every classifier on each reduced set.
rfelist = rfeFeature(indep_X, dep_Y, 3)

acclog, accsvml, accsvmnl = [], [], []
accknn, accnav, accdes, accrf = [], [], [], []

# Each training helper is paired with the list that collects its accuracy.
trainers = [
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
]

for i in rfelist:
    # y_test is deliberately bound at notebook level: the helpers call
    # cm_prediction without labels, so it must exist before they run.
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)

    for trainer, scores in trainers:
        classifier, Accuracy, report, X_test, y_test, cm = trainer(X_train, y_train, X_test)
        scores.append(Accuracy)

result = rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

print ('3 RFE classification result is:\n',result)

LogisticRegression()
SVC(kernel='linear', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)
3 RFE classification result is:
              Logistic  SVMl SVMnl   KNN Navie Decision Random
Logistic         0.94  0.94  0.94  0.94  0.94     0.94   0.94
SVC              0.87  0.87  0.87  0.87  0.87     0.87   0.87
Random           0.91  0.92  0.93  0.93  0.86     0.91   0.94
DecisionTree     0.93  0.93  0.94  0.95  0.74     0.95   0.97


In [30]:
# Keep 4 features per RFE base estimator, then score every classifier on each reduced set.
rfelist = rfeFeature(indep_X, dep_Y, 4)

acclog, accsvml, accsvmnl = [], [], []
accknn, accnav, accdes, accrf = [], [], [], []

# Each training helper is paired with the list that collects its accuracy.
trainers = [
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
]

for i in rfelist:
    # y_test is deliberately bound at notebook level: the helpers call
    # cm_prediction without labels, so it must exist before they run.
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)

    for trainer, scores in trainers:
        classifier, Accuracy, report, X_test, y_test, cm = trainer(X_train, y_train, X_test)
        scores.append(Accuracy)

result = rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

print ('4 RFE classification result is:\n',result)

LogisticRegression()
SVC(kernel='linear', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)
4 RFE classification result is:
              Logistic  SVMl SVMnl   KNN Navie Decision Random
Logistic         0.95  0.95  0.95  0.95  0.95     0.95   0.95
SVC              0.96  0.96  0.96  0.96  0.96     0.96   0.96
Random           0.93  0.93  0.94  0.93  0.91     0.91   0.94
DecisionTree     0.97  0.97  0.97  0.96  0.84     0.96   0.96


In [31]:
# Keep 5 features per RFE base estimator, then score every classifier on each reduced set.
rfelist = rfeFeature(indep_X, dep_Y, 5)

acclog, accsvml, accsvmnl = [], [], []
accknn, accnav, accdes, accrf = [], [], [], []

# Each training helper is paired with the list that collects its accuracy.
trainers = [
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
]

for i in rfelist:
    # y_test is deliberately bound at notebook level: the helpers call
    # cm_prediction without labels, so it must exist before they run.
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)

    for trainer, scores in trainers:
        classifier, Accuracy, report, X_test, y_test, cm = trainer(X_train, y_train, X_test)
        scores.append(Accuracy)

result = rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

print ('5 RFE classification result is:\n',result)

LogisticRegression()
SVC(kernel='linear', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)
5 RFE classification result is:
              Logistic  SVMl SVMnl   KNN Navie Decision Random
Logistic         0.98  0.98  0.98  0.98  0.98     0.98   0.98
SVC              0.99  0.99  0.99  0.99  0.99     0.99   0.99
Random           0.97  0.97  0.97  0.96  0.87     0.93   0.97
DecisionTree     0.97  0.98  0.98  0.98  0.91     0.96   0.98
