In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import  LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import pickle
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

'''
Recursive Feature Elimination (RFE) is a feature selection technique used in machine learning to identify 
and select the most important features for building predictive models. It works by recursively considering 
smaller and smaller sets of features, starting with the full set and gradually eliminating the least important ones.

How RFE Works:
Model Training: RFE begins by training a model (e.g., linear regression, support vector machine, decision tree) 
on the entire set of features.

Feature Ranking: The model evaluates the importance of each feature. The importance can be measured in 
various ways depending on the model used, such as coefficients for linear models or feature importance scores for tree-based models.

Feature Elimination: The least important feature(s) are removed from the feature set.

Recursive Process: Steps 1-3 are repeated on the remaining set of features until a predefined number of 
features (or all features) have been eliminated, or the desired number of features has been selected.

Selection of Features: Finally, the remaining features are considered the most relevant for predicting the target variable.
'''

In [2]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
        #X_train, X_test, y_train, y_test = train_test_split(indep_X,dep_Y, test_size = 0.25, random_state = 0)
        
        #Feature Scaling
        #from sklearn.preprocessing import StandardScaler
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test

'''
split_scalar function is designed to split your dataset into training and testing sets, and then apply feature scaling to the input features.


Explanation of the Function:
Data Splitting:

The function uses train_test_split from scikit-learn to split the dataset into training and testing subsets.
By default, 25% of the data is allocated to the test set, and 75% is used for training. 
The random_state=0 ensures that the splitting process is reproducible.
Feature Scaling:

The function scales the features using StandardScaler from scikit-learn.
This scaler standardizes the features by removing the mean and scaling to unit variance.
It is important to fit the scaler on the training data (X_train) and then apply the same transformation 
to both the training and test sets to avoid data leakage.
'''

'''
r2_prediction function is designed to evaluate the performance of a regression model using the 
ùëÖ2(coefficient of determination) metric.
Prediction:

The function takes in a trained regressor (model), test features (X_test), and test labels (y_test). 
It then uses the model to predict the labels for the test set with regressor.predict(X_test).
R-squared Calculation:

The function calculates the ùëÖ2score using r2_score from scikit-learn, 
which compares the predicted labels (y_pred) to the true labels (y_test). The 
ùëÖ2score indicates how well the regression model explains the variability of the target variable.
Return:

Finally, the function returns the calculated ùëÖ2score.
'''

In [3]:
def r2_prediction(regressor,X_test,y_test):
     y_pred = regressor.predict(X_test)
     from sklearn.metrics import r2_score
     r2=r2_score(y_test,y_pred)
     return r2

In [4]:
def Linear(X_train,y_train,X_test):       
        # Fitting K-NN to the Training set
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [5]:
def svm_linear(X_train,y_train,X_test):        
        from sklearn.svm import SVR
        regressor = SVR(kernel = 'linear')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

'''
SVR with RBF Kernel:

The function initializes an SVR model with an RBF kernel, which is commonly used for non-linear relationships in the data.
It then fits the model using the training data (X_train and y_train).
Prediction and R-squared Calculation:

After fitting the model, the function uses the r2_prediction function to calculate the ùëÖ2
  score using the test data (X_test and y_test).
Return:

Finally, the function returns the ùëÖ2
  score, giving you a measure of how well the non-linear SVR model fits the test data.
'''

In [6]:
def svm_NL(X_train,y_train,X_test):      
        from sklearn.svm import SVR
        regressor = SVR(kernel = 'rbf')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [7]:
def Decision(X_train,y_train,X_test):
        
        # Fitting K-NN to the Training setC
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [8]:
def random(X_train,y_train,X_test):       
        # Fitting K-NN to the Training set
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

'''

rfeFeature is designed to perform Recursive Feature Elimination (RFE) using different regression models
and return the transformed feature sets based on the selected features for each model.

indep_X: Input features (independent variables).
dep_Y: Target values (dependent variable).
n: Number of features to select.

rfelist will store the feature sets selected by RFE for each model.

Four different regression models are initialized:
lin: Linear Regression.
SVRl: Support Vector Regression with a linear kernel.
dec: Decision Tree Regressor.
rf: Random Forest Regressor with 10 estimators.

rfemodellist contains all the models to be used with RFE.
Iterates over each model in rfemodellist.
Initializes RFE with the current model and the number of features to select (n).
Fits the RFE model to the input data (indep_X and dep_Y).
Transforms the input data to retain only the selected features based on RFE.
Appends the transformed feature set to rfelist.
Returns rfelist, which contains the feature sets selected by RFE for each model.
'''

In [10]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    
    # Define the models
    lin = LinearRegression()
    SVRl = SVR(kernel='linear')
    dec = DecisionTreeRegressor(random_state=0)
    rf = RandomForestRegressor(n_estimators=10, random_state=0)
    
    # List of models to use in RFE
    rfemodellist = [lin, SVRl, dec, rf]
    
    # Apply RFE for each model
    for model in rfemodellist:
        print(model)  # To display the current model being processed
        
        # Correct RFE initialization
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        
        # Fit the RFE model
        log_fit = log_rfe.fit(indep_X, dep_Y)
        
        # Transform the features based on the RFE selection
        log_rfe_feature = log_fit.transform(indep_X)
        
        # Append the selected features to the list
        rfelist.append(log_rfe_feature)
    
    return rfelist

'''
he rfe_regression function creates a DataFrame that organizes and presents the results of Recursive Feature Elimination (RFE)
from different regression models

rfe_regression takes four lists of accuracy scores from different models and places them into a pandas DataFrame, organized by model types
acclog: List of accuracy scores for the Linear Regression model.
accsvml: List of accuracy scores for the Support Vector Regression (linear kernel) model.
accdes: List of accuracy scores for the Decision Tree Regressor model.
accrf: List of accuracy scores for the Random Forest Regressor model.
Creates an empty DataFrame rfedataframe with rows representing different models ('Linear', 'SVC', 'Random', 'DecisionTree')
and columns representing the accuracy scores for each model type.
Iterates over the index of the DataFrame and populates each cell with corresponding accuracy scores from the input lists:
acclog[number] for 'Linear'.
accsvml[number] for 'SVMl'.
accdes[number] for 'Decision'.
accrf[number] for 'Random'.
Returns the populated DataFrame.
'''

In [11]:
def rfe_regression(acclog,accsvml,accdes,accrf): 
    
    rfedataframe=pd.DataFrame(index=['Linear','SVC','Random','DecisionTree'],columns=['Linear','SVMl',
                                                                                        'Decision','Random'])

    for number,idex in enumerate(rfedataframe.index):
        
        rfedataframe['Linear'][idex]=acclog[number]       
        rfedataframe['SVMl'][idex]=accsvml[number]
        rfedataframe['Decision'][idex]=accdes[number]
        rfedataframe['Random'][idex]=accrf[number]
    return rfedataframe

In [12]:
dataset1=pd.read_csv("prep.csv",index_col=None)
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [13]:
indep_X=df2.drop('classification_yes', axis=1)
indep_X

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,True,False,False,False,False,False,True,False,False
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,True,False,False,False,False,False,True,False,False
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,True,False,False,False,False,False,True,False,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,True,False,False,False,False,False,True,False,False
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,True,False,False,True,True,False,True,False,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,True,False,False,True,True,False,False,False,False
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,True,False,False,True,True,False,True,False,True


In [14]:
dep_Y=df2['classification_yes']
dep_Y

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

'''
Extracts Features with RFE:

rfeFeature is used to perform Recursive Feature Elimination (RFE) and get a list of transformed feature sets (rfelist).
Train-Test Split and Model Evaluation:

For each transformed feature set, you split the data into training and testing sets.
Train and evaluate various regression models (Linear Regression, SVM with linear kernel, 
SVM with non-linear kernel, Decision Tree, Random Forest) and store their performance metrics (e.g., R¬≤ scores) in lists.
Compile Results into DataFrame:

rfe_regression is used to compile the performance metrics into a DataFrame.
'''

In [27]:
rfelist=rfeFeature(indep_X,dep_Y,5)       

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]


for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf)
print ('5 feature result is:\n',result)


LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)
5 feature result is:
                 Linear      SVMl  Decision    Random
Linear        0.620124  0.457136   0.77924  0.780135
SVC           0.604508  0.456871  0.776474  0.776745
Random        0.674403  0.628206  0.696181  0.815538
DecisionTree  0.686361  0.643365  0.836806  0.845303


In [28]:
rfelist=rfeFeature(indep_X,dep_Y,4)       

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]


for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf)
print ('4 feature result is:\n',result)

LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)
4 feature result is:
                 Linear      SVMl  Decision    Random
Linear         0.60401  0.457046  0.776711  0.776492
SVC            0.60401  0.457046  0.776711  0.776492
Random        0.671727  0.628963  0.835247    0.8403
DecisionTree  0.681563  0.614992   0.96711  0.923559


In [29]:
rfelist=rfeFeature(indep_X,dep_Y,6)       

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]


for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf)
print ('6 feature result is:\n',result)

LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)
6 feature result is:
                 Linear      SVMl  Decision    Random
Linear        0.624738  0.456874   0.81723  0.814741
SVC           0.610294  0.530043  0.806415  0.807916
Random        0.697365  0.665248  0.782986  0.829427
DecisionTree  0.705126  0.670093  0.839675  0.875221


In [30]:
rfelist=rfeFeature(indep_X,dep_Y,7)       

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]


for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf)
print ('7 feature result is:\n',result)

LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)
7 feature result is:
                 Linear      SVMl  Decision    Random
Linear        0.622757    0.5373  0.813952  0.814557
SVC           0.623155    0.5296   0.81284    0.8134
Random        0.697704  0.666684  0.913194  0.940972
DecisionTree  0.705879  0.667997  0.797454  0.850957


In [32]:
rfelist=rfeFeature(indep_X,dep_Y,8)       

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]


for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf)
print ('8 feature result is:\n',result)

LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)
8 feature result is:
                 Linear      SVMl  Decision    Random
Linear        0.709204  0.684292  0.952168  0.932773
SVC           0.701052  0.679964   0.82978  0.922139
Random        0.703917  0.673437  0.782986  0.918403
DecisionTree  0.712812  0.671713  0.913194  0.945312
