In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [2]:
def selectkbest(indep_X,dep_Y,n):
    """Keep the ``n`` features of ``indep_X`` with the highest chi-squared score.

    Parameters
    ----------
    indep_X : feature matrix handed to ``SelectKBest.fit``.
    dep_Y : target values the features are scored against.
    n : number of features to keep (the ``k`` argument of ``SelectKBest``).

    Returns
    -------
    The result of ``SelectKBest.transform(indep_X)``, i.e. ``indep_X``
    reduced to the selected columns.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    # Fit the scorer and immediately project the same data onto the kept columns.
    return selector.fit(indep_X, dep_Y).transform(indep_X)
    

In [None]:
'''
selectkbest function is using SelectKBest from sklearn.feature_selection with chi2 as the scoring function.
This function reduces the dimensionality of indep_X by selecting the top n features based on the highest scores according to chi2.
The function returns the transformed feature set selectk_features, which includes only the n best features.
'''

In [3]:
def split_scalar(indep_X, dep_Y, test_size=0.25, random_state=0):
    """Split the data into train/test parts and standardise the features.

    Parameters
    ----------
    indep_X : feature matrix.
    dep_Y : target values aligned with ``indep_X``.
    test_size : forwarded to ``train_test_split``; defaults to 0.25, the
        value that used to be hard-coded.
    random_state : forwarded to ``train_test_split``; defaults to 0, the
        value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both feature parts have
        been transformed by the same ``StandardScaler``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=test_size, random_state=random_state
    )
    scaler = StandardScaler()
    # The scaler is fitted on the training part only and then reused
    # unchanged for the test part.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [None]:
'''
split_scalar function is designed to split the dataset into training and testing sets and then scale the features 
using StandardScaler from sklearn.preprocessing.
indep_X: The independent variables (features).
dep_Y: The dependent variable (target).
train_test_split: Splits the dataset into training and testing sets. The test size is set to 25% of the data (test_size=0.25),
and random_state=0 ensures reproducibility.
StandardScaler: Scales the features so that they have a mean of 0 and a standard deviation of 1, which is important for algorithms
like SVM, KNN, and neural networks.
Return Value: The function returns four values: X_train, X_test, y_train, and y_test.
'''

In [4]:
def r2_prediction(regressor,X_test,y_test):
    """Score an already fitted model on a test split with R-squared.

    Parameters
    ----------
    regressor : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true target values for ``X_test``.

    Returns
    -------
    The value of ``sklearn.metrics.r2_score(y_test, predictions)``.
    """
    predictions = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    return r2_score(y_test, predictions)

In [None]:
'''
Input Parameters:

regressor: The trained regression model.
X_test: The test set features.
y_test: The actual values of the target variable for the test set.
Functionality:

regressor.predict(X_test): Uses the trained model to predict the target variable based on the test features.
r2_score(y_test, y_pred): Computes the R-squared value, which measures the proportion of variance 
in the dependent variable that is predictable from the independent variables.
Return Value: The function returns the R-squared score (r2), a value between 0 and 1 
(though it can be negative if the model is very poor), where 1 indicates perfect prediction.
'''

In [5]:
def Linear(X_train, y_train, X_test, y_test=None):
    """Fit a ``LinearRegression`` model and return its R-squared on the test split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : test features to score on.
    y_test : optional true targets for ``X_test``. When omitted, the
        module-level ``y_test`` is used, which is what the earlier
        three-argument form always read implicitly.

    Returns
    -------
    The score returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Backward compatibility: existing callers pass three arguments and
        # rely on a global ``y_test`` created by the cell that did the split.
        if "y_test" not in globals():
            raise NameError("name 'y_test' is not defined")
        y_test = globals()["y_test"]

    # Fit ordinary linear regression on the training split.
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
'''
Linear function is designed to train a linear regression model using scikit-learn
and then evaluate its performance using the R-squared metric.
Input Parameters:

X_train: Training set features.
y_train: Training set target variable.
X_test: Test set features.
Functionality:

Model Training: A LinearRegression model is created and trained on the training data (X_train, y_train).
R-squared Calculation: The model is then evaluated using the r2_prediction function,
which calculates the R-squared score by comparing the model's predictions on X_test with the actual y_test values.
Return Value: The function returns the R-squared score (r2), which measures the goodness of fit of the model.
'''

In [6]:
def svm_linear(X_train, y_train, X_test, y_test=None):
    """Fit an ``SVR`` with a linear kernel and return its R-squared on the test split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : test features to score on.
    y_test : optional true targets for ``X_test``. When omitted, the
        module-level ``y_test`` is used, which is what the earlier
        three-argument form always read implicitly.

    Returns
    -------
    The score returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    from sklearn.svm import SVR

    if y_test is None:
        # Backward compatibility: existing callers pass three arguments and
        # rely on a global ``y_test`` created by the cell that did the split.
        if "y_test" not in globals():
            raise NameError("name 'y_test' is not defined")
        y_test = globals()["y_test"]

    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
'''
svm_linear function is designed to train a Support Vector Regression (SVR) model 
with a linear kernel and evaluate its performance using the R-squared metric. 
Input Parameters:

X_train: Training set features.
y_train: Training set target variable.
X_test: Test set features.
Functionality:

Model Training: An SVR model with a linear kernel is created and trained on the training data (X_train, y_train).
R-squared Calculation: The model is then evaluated using the r2_prediction function, 
which calculates the R-squared score by comparing the model's predictions on X_test with the actual y_test values.
Return Value: The function returns the R-squared score (r2), indicating the model's performance.
'''

In [7]:
def svm_NL(X_train, y_train, X_test, y_test=None):
    """Fit an ``SVR`` with an RBF kernel and return its R-squared on the test split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : test features to score on.
    y_test : optional true targets for ``X_test``. When omitted, the
        module-level ``y_test`` is used, which is what the earlier
        three-argument form always read implicitly.

    Returns
    -------
    The score returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    from sklearn.svm import SVR

    if y_test is None:
        # Backward compatibility: existing callers pass three arguments and
        # rely on a global ``y_test`` created by the cell that did the split.
        if "y_test" not in globals():
            raise NameError("name 'y_test' is not defined")
        y_test = globals()["y_test"]

    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
'''
svm_NL function is designed to train a Support Vector Regression (SVR) model with a nonlinear kernel 
(specifically, the Radial Basis Function (RBF) kernel) and evaluate its performance using the R-squared metric.
Input Parameters:

X_train: Training set features.
y_train: Training set target variable.
X_test: Test set features.
Functionality:

Model Training: An SVR model with an RBF kernel is created and trained on the training data (X_train, y_train). 
The RBF kernel is useful for capturing nonlinear relationships between features and the target variable.
R-squared Calculation: The model is then evaluated using the r2_prediction function,
which calculates the R-squared score by comparing the model's predictions on X_test with the actual y_test values.
Return Value: The function returns the R-squared score (r2), which measures the goodness of fit of the model.
'''

In [8]:
def Decision(X_train, y_train, X_test, y_test=None):
    """Fit a ``DecisionTreeRegressor`` and return its R-squared on the test split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : test features to score on.
    y_test : optional true targets for ``X_test``. When omitted, the
        module-level ``y_test`` is used, which is what the earlier
        three-argument form always read implicitly.

    Returns
    -------
    The score returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    from sklearn.tree import DecisionTreeRegressor

    if y_test is None:
        # Backward compatibility: existing callers pass three arguments and
        # rely on a global ``y_test`` created by the cell that did the split.
        if "y_test" not in globals():
            raise NameError("name 'y_test' is not defined")
        y_test = globals()["y_test"]

    # Fit a decision tree; the estimator is seeded with random_state=0.
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
'''
Decision function is designed to train a Decision Tree Regression model and evaluate 
its performance using the R-squared metric.
Input Parameters:

X_train: Training set features.
y_train: Training set target variable.
X_test: Test set features.
Functionality:

Model Training: A DecisionTreeRegressor model is created and trained on the training data (X_train, y_train). 
The random_state=0 ensures reproducibility.
R-squared Calculation: The model is then evaluated using the r2_prediction function, which calculates 
the R-squared score by comparing the model's predictions on X_test with the actual y_test values.
Return Value: The function returns the R-squared score (r2), indicating how well the model fits the data.

'''

In [9]:
def random(X_train, y_train, X_test, y_test=None):
    """Fit a 10-tree ``RandomForestRegressor`` and return its R-squared on the test split.

    NOTE(review): the name ``random`` would shadow the standard-library
    module of the same name if that were ever imported in this notebook.
    It is kept because callers use it.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : test features to score on.
    y_test : optional true targets for ``X_test``. When omitted, the
        module-level ``y_test`` is used, which is what the earlier
        three-argument form always read implicitly.

    Returns
    -------
    The score returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    from sklearn.ensemble import RandomForestRegressor

    if y_test is None:
        # Backward compatibility: existing callers pass three arguments and
        # rely on a global ``y_test`` created by the cell that did the split.
        if "y_test" not in globals():
            raise NameError("name 'y_test' is not defined")
        y_test = globals()["y_test"]

    # Fit a random forest; the estimator is seeded with random_state=0.
    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
'''
random function is designed to train a Random Forest Regression model and evaluate its performance using the R-squared metric.
Input Parameters:

X_train: Training set features.
y_train: Training set target variable.
X_test: Test set features.
Functionality:

Model Training: A RandomForestRegressor model is created and trained on the training data (X_train, y_train). 
The n_estimators=10 parameter specifies that the model should use 10 decision trees, and random_state=0 ensures reproducibility.
R-squared Calculation: The model is then evaluated using the r2_prediction function,
which calculates the R-squared score by comparing the model's predictions on X_test with the actual y_test values.
Return Value: The function returns the R-squared score (r2), which measures the goodness of fit of the model.
'''

In [97]:
def selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf):
    """Build a one-row summary table of R-squared scores, one column per model.

    The table has a single row labelled ``'ChiSquare'``; for that row only
    the first element (position 0) of every score list is read.

    Parameters
    ----------
    acclin, accsvml, accsvmnl, accdes, accrf : indexable sequences of scores
        for the 'Linear', 'SVMl', 'SVMnl', 'Decision' and 'Random' columns
        respectively.

    Returns
    -------
    pandas.DataFrame
        Index ``['ChiSquare']``, columns
        ``['Linear', 'SVMl', 'SVMnl', 'Decision', 'Random']``.

    Raises
    ------
    IndexError
        If any of the score sequences is empty.
    """
    scores_by_model = {
        'Linear': acclin,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'Decision': accdes,
        'Random': accrf,
    }
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_model))

    for number, idex in enumerate(dataframe.index):
        for model, scores in scores_by_model.items():
            # Use a single .loc assignment: chained indexing such as
            # dataframe[model][idex] = ... assigns into a temporary object and
            # is not guaranteed to update the frame (it does nothing under
            # pandas Copy-on-Write).
            dataframe.loc[idex, model] = scores[number]
    return dataframe

In [None]:
'''
selectk_regression function is intended to create a DataFrame summarizing the R-squared scores for 
different regression models based on the number of selected features (k). However, there are a few limitations:

Single Row DataFrame: The DataFrame is designed to hold only one row, but number will increase 
with each call to the function.
Storing Values: The function overwrites the dataframe contents with the latest values in each loop.
'''

In [20]:
# Read the prepared CSV; `df2` is a second name for the same DataFrame object
# (no copy is made), and the frame is displayed as the cell output.
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = dataset1
df2

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [23]:
# Replace categorical columns with indicator columns. drop_first=True omits
# the first level of each encoded column (e.g. only `htn_yes` is kept).
df2 = pd.get_dummies(data=df2, drop_first=True)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [24]:
# Feature matrix: every encoded column except the target indicator.
indep_X = df2.drop(columns='classification_yes')
indep_X

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,True,False,False,False,False,False,True,False,False
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,True,False,False,False,False,False,True,False,False
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,True,False,False,False,False,False,True,False,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,True,False,False,False,False,False,True,False,False
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,True,False,False,True,True,False,True,False,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,True,False,False,True,True,False,False,False,False
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,True,False,False,True,True,False,True,False,True


In [26]:
# Target: the boolean `classification_yes` indicator column.
# NOTE(review): this target is boolean while the models fitted later are
# regressors scored with R-squared - confirm that is intended.
dep_Y = df2.loc[:, 'classification_yes']
dep_Y

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [81]:
# Reduce the feature matrix to the 10 best chi-squared-scored features.
kbest = selectkbest(indep_X, dep_Y, 10)

# One empty score accumulator per model (linear, SVR linear, SVR RBF,
# decision tree, random forest).
acclin, accsvml, accsvmnl, accdes, accrf = [], [], [], [], []

In [82]:
# Display the selected-feature matrix returned by selectkbest.
kbest

array([[2.00000000e+00, 3.00000000e+00, 0.00000000e+00, ...,
        3.88689024e+01, 8.40819113e+03, 0.00000000e+00],
       [3.00000000e+00, 2.00000000e+00, 0.00000000e+00, ...,
        3.40000000e+01, 1.23000000e+04, 0.00000000e+00],
       [4.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        3.40000000e+01, 8.40819113e+03, 0.00000000e+00],
       ...,
       [5.14923077e+01, 3.00000000e+00, 0.00000000e+00, ...,
        2.60000000e+01, 9.20000000e+03, 1.00000000e+00],
       [5.14923077e+01, 0.00000000e+00, 0.00000000e+00, ...,
        3.88689024e+01, 8.40819113e+03, 1.00000000e+00],
       [5.14923077e+01, 0.00000000e+00, 0.00000000e+00, ...,
        5.30000000e+01, 8.50000000e+03, 0.00000000e+00]])

In [98]:
# Split and scale the selected features. This also (re)binds the module-level
# `y_test` that the model helpers read when scoring.
X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# Fit and score each model exactly once. The previous version wrapped these
# calls in `for i in kbest:`, which iterated once per data row without using
# `i`, so the same five models were refitted on the same split on every pass.
r2_lin = Linear(X_train, y_train, X_test)
r2_sl = svm_linear(X_train, y_train, X_test)
r2_NL = svm_NL(X_train, y_train, X_test)
r2_d = Decision(X_train, y_train, X_test)
r2_r = random(X_train, y_train, X_test)

# Start from fresh one-element lists instead of appending to lists created in
# another cell: selectk_regression only reads element 0, so leftovers from an
# earlier run (for example with a different k) would otherwise be reported.
acclin = [r2_lin]
accsvml = [r2_sl]
accsvmnl = [r2_NL]
accdes = [r2_d]
accrf = [r2_r]

result = selectk_regression(acclin, accsvml, accsvmnl, accdes, accrf)
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.304963,0.256858,0.430795,0.479167,0.599392


In [40]:
# Print the most recent R-squared score of each model, one per line.
# The author's note marks this run as K=6.
print(
    'Linear_Reg       :', r2_lin, '\n',
    'SVM_Linear_Reg:', r2_sl, '\n',
    'SVM_NonLinear:', r2_NL, '\n',
    'DecisionTree_Reg:', r2_d, '\n',
    'Random_Forest_Reg:', r2_r,
)

Linear_Reg       : 0.5990411236352542 
 SVM_Linear_Reg    : 0.5864458332532442 
 SVM_NonLinear   : 0.8389618256518051 
 DecisionTree_Reg   : 0.8697916666666666 
 Random_Forest_Reg: 0.8975694444444444


In [43]:
# Print the most recent R-squared score of each model, one per line.
# The author's note marks this run as K=4.
print(
    'Linear_Reg       :', r2_lin, '\n',
    'SVM_Linear_Reg    :', r2_sl, '\n',
    'SVM_NonLinear   :', r2_NL, '\n',
    'DecisionTree_Reg   :', r2_d, '\n',
    'Random_Forest_Reg:', r2_r,
)

Linear_Reg       : 0.3049629003156724 
 SVM_Linear_Reg    : 0.2568580432938199 
 SVM_NonLinear   : 0.43079491056929564 
 DecisionTree_Reg   : 0.47916666666666663 
 Random_Forest_Reg: 0.5993923611111112


In [46]:
# Print the most recent R-squared score of each model, one per line.
# The author's note marks this run as K=7.
print(
    'Linear_Reg       :', r2_lin, '\n',
    'SVM_Linear_Reg    :', r2_sl, '\n',
    'SVM_NonLinear   :', r2_NL, '\n',
    'DecisionTree_Reg   :', r2_d, '\n',
    'Random_Forest_Reg:', r2_r,
)

Linear_Reg       : 0.6570348953434938 
 SVM_Linear_Reg    : 0.641906258562652 
 SVM_NonLinear   : 0.8930067943773491 
 DecisionTree_Reg   : 0.8263888888888888 
 Random_Forest_Reg: 0.9162326388888888


In [49]:
# Print the most recent R-squared score of each model, one per line.
# The author's note marks this run as K=8.
print(
    'Linear_Reg       :', r2_lin, '\n',
    'SVM_Linear_Reg    :', r2_sl, '\n',
    'SVM_NonLinear   :', r2_NL, '\n',
    'DecisionTree_Reg   :', r2_d, '\n',
    'Random_Forest_Reg:', r2_r,
)

Linear_Reg       : 0.6464573628315706 
 SVM_Linear_Reg    : 0.6121989307372211 
 SVM_NonLinear   : 0.8912737232140092 
 DecisionTree_Reg   : 0.8697916666666666 
 Random_Forest_Reg: 0.8988715277777778


In [52]:
# Print the most recent R-squared score of each model, one per line.
# The author's note marks this run as K=9.
print(
    'Linear_Reg       :', r2_lin, '\n',
    'SVM_Linear_Reg    :', r2_sl, '\n',
    'SVM_NonLinear   :', r2_NL, '\n',
    'DecisionTree_Reg   :', r2_d, '\n',
    'Random_Forest_Reg:', r2_r,
)

Linear_Reg       : 0.64612252708107 
 SVM_Linear_Reg    : 0.6024624378302159 
 SVM_NonLinear   : 0.9018193344671044 
 DecisionTree_Reg   : 0.8697916666666666 
 Random_Forest_Reg: 0.9197048611111112


In [55]:
# Print the most recent R-squared score of each model, one per line.
# The author's note marks this run as K=10.
print(
    'Linear_Reg       :', r2_lin, '\n',
    'SVM_Linear_Reg    :', r2_sl, '\n',
    'SVM_NonLinear   :', r2_NL, '\n',
    'DecisionTree_Reg   :', r2_d, '\n',
    'Random_Forest_Reg:', r2_r,
)

Linear_Reg       : 0.6447345534966664 
 SVM_Linear_Reg    : 0.5977261965387739 
 SVM_NonLinear   : 0.9193122662291634 
 DecisionTree_Reg   : 0.8697916666666666 
 Random_Forest_Reg: 0.9227430555555556


In [124]:
def selectk_regression(featureCount,acclin, accsvml, accsvmnl, accdes, accrf): 
    """Tabulate R-squared scores per model, one row per feature count.

    NOTE(review): this redefines ``selectk_regression`` with a different
    signature from the earlier five-argument version in this notebook.
    Whichever definition ran last is the one in effect.

    Parameters
    ----------
    featureCount : sequence of feature counts, stored in the 'Kbest No' column.
    acclin, accsvml, accsvmnl, accdes, accrf : score sequences for the
        'Linear', 'SVMl', 'SVMnl', 'Decision' and 'Random' columns. All six
        sequences are passed to the ``pd.DataFrame`` constructor as columns.

    Returns
    -------
    pandas.DataFrame
        Frame with the six columns above and a default positional index
        whose name is set to 'Index'.
    """
    column_names = ('Kbest No', 'Linear', 'SVMl', 'SVMnl', 'Decision', 'Random')
    column_values = (featureCount, acclin, accsvml, accsvmnl, accdes, accrf)
    summary = pd.DataFrame(dict(zip(column_names, column_values)))

    # Only the index is labelled; the feature counts stay an ordinary column.
    summary.index.name = 'Index'
    return summary
# Feature counts (k) to compare: 4 through 10 inclusive.
featureCount = list(range(4, 11))

# One score list per model; position i holds the score for featureCount[i].
acclin, accsvml, accsvmnl, accdes, accrf = [], [], [], [], []
result = []

for x in featureCount:
    # NOTE(review): the selector is fitted on all rows before the train/test
    # split, so test rows take part in feature selection - confirm intended.
    kbest = selectkbest(indep_X, dep_Y, x)

    # Rebinds the module-level `y_test` that the model helpers score against.
    X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

    # Fit each model on this split and record its R-squared.
    r2_lin = Linear(X_train, y_train, X_test)
    acclin.append(r2_lin)

    r2_sl = svm_linear(X_train, y_train, X_test)
    accsvml.append(r2_sl)

    r2_NL = svm_NL(X_train, y_train, X_test)
    accsvmnl.append(r2_NL)

    r2_d = Decision(X_train, y_train, X_test)
    accdes.append(r2_d)

    r2_r = random(X_train, y_train, X_test)
    accrf.append(r2_r)

# Collect all scores into one table, keep it in `result`, and print it.
best_model_result = selectk_regression(featureCount, acclin, accsvml, accsvmnl, accdes, accrf)
result.append(best_model_result)
print(best_model_result)


       Kbest No    Linear      SVMl     SVMnl  Decision    Random
Index                                                            
0             4  0.304963  0.256858  0.430795  0.479167  0.599392
1             5  0.551985  0.545395  0.749654  0.696181  0.836806
2             6  0.599041  0.586446  0.838962  0.869792  0.897569
3             7  0.657035  0.641906  0.893007  0.826389  0.916233
4             8  0.646457  0.612199  0.891274  0.869792  0.898872
5             9  0.646123  0.602462  0.901819  0.869792  0.919705
6            10  0.644735  0.597726  0.919312  0.869792  0.922743
