In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

Even though refit=True fits the best model, GridSearchCV itself is not a model—it’s just a wrapper around multiple models tested with different hyperparameters.

GridSearchCV does not have coef_ or feature_importances_ (which RFE needs).
The actual trained model with the best parameters is stored inside best_estimator_.
If you try to use GridSearchCV directly in RFE, it won’t work because RFE expects a model, not a hyperparameter search object.

def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    
    # Define parameter grids for different models
    param_grid_log = {
        'C': [0.01, 0.1, 1, 10, 100], 
        'penalty': ['l1', 'l2'], 
        'solver': ['liblinear', 'saga', 'lbfgs'], 
        'max_iter': [50, 100]
    }
    
    param_grid_RF = {
        'criterion': ["gini", "entropy", "log_loss"], 
        'max_features': [None, "sqrt", "log2"], 
        'n_estimators': [10, 50]
    }
    
    param_grid_DT = {
        'criterion': ["gini", "entropy", "log_loss"], 
        'splitter': ["best", "random"], 
        'max_features': [None, "sqrt", "log2"]
    }
    
    param_grid_XGB = {
        'n_estimators': [100, 300], 
        'max_depth': [3, 6], 
        'learning_rate': [0.01, 0.1, 0.3], 
        'subsample': [0.8, 1.0], 
        'colsample_bytree': [0.7, 1.0], 
        'gamma': [1, 5]
    }
    
    # Create GridSearchCV objects for each model
    grid_log = GridSearchCV(LogisticRegression(random_state=42), param_grid_log, cv=5, scoring='f1', n_jobs=-1, refit=True)
    grid_RF = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_RF, cv=5, scoring='f1', n_jobs=-1, refit=True, verbose=3)
    grid_DT = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_DT, cv=5, scoring='f1', n_jobs=-1, refit=True, verbose=3)
    grid_XGB = GridSearchCV(XGBClassifier(random_state=42), param_grid_XGB, cv=5, scoring='f1', n_jobs=-1, refit=True, verbose=3)
    
    # List of models
    rfemodellist = [grid_log, grid_RF, grid_DT, grid_XGB]

    for grid in rfemodellist:
        print(f"Fitting GridSearchCV for {grid.estimator.__class__.__name__}...")
        
        # Fit GridSearchCV
        grid.fit(indep_X, dep_Y)

        # Get the best model
        best_model = grid.best_estimator_

        # Apply RFE using the best model
        log_rfe = RFE(best_model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        
        rfelist.append(log_rfe_feature)

    return rfelist


def rfeFeature(indep_X,dep_Y,n):
        rfelist=[]
        
        param_grid_log = {'C': [0.01, 0.1, 1, 10,100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga','lbfgs'],'max_iter': [50,100]}
        grid_log= GridSearchCV(LogisticRegression(), param_grid_log, cv=5, scoring='f1', n_jobs=-1,refit=True)
        param_grid_RF={'criterion':["gini", "entropy", "log_loss"],'max_features': [None,"sqrt", "log2"], 'n_estimators':[10,50]}
        grid_RF= GridSearchCV(RandomForestClassifier(), param_grid_RF, refit= True, verbose=3, n_jobs=-1, scoring= 'f1' )
        # NB = GaussianNB()
        param_grid_DT={'criterion':["gini", "entropy", "log_loss"], 'splitter': ["best", "random"],'max_features': [None,"sqrt", "log2"]}
        grid_DT= GridSearchCV(DecisionTreeClassifier(), param_grid_DT, refit= True, verbose=3, n_jobs=-1, scoring= 'f1' )
        from collections import Counter
        #counter = Counter(y_train)  # Count class occurrences
        #ratio = counter[0] / counter[1]
        param_grid_XGB = {'n_estimators': [100, 300],'max_depth': [3, 6],'learning_rate': [0.01, 0.1, 0.3],'subsample': [ 0.8, 1.0],'colsample_bytree': [0.7, 1.0],'gamma': [1, 5]}
        grid_XGB = GridSearchCV(XGBClassifier(), param_grid_XGB, refit= True, verbose=3, n_jobs=-1, scoring= 'accuracy' )
        #svc_model = SVC(kernel = 'linear', random_state = 0)
        #knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
        rfemodellist=[grid_log,grid_RF,grid_DT,grid_XGB] 
        for i in   rfemodellist:
            print(i)
            log_rfe = RFE(i.best_estimator_, n_features_to_select=n)
            log_fit = log_rfe.fit(indep_X, dep_Y)
            log_rfe_feature=log_fit.transform(indep_X)
            rfelist.append(log_rfe_feature)
        return rfelist


In [26]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    
    # Define parameter grids for different models
    param_grid_log = {
        'C': [0.01, 0.1, 1, 10, 100], 
        'penalty': ['l1', 'l2'], 
        'solver': ['liblinear', 'saga', 'lbfgs'], 
        'max_iter': [50, 100]
    }
    
    param_grid_RF = {
        'criterion': ["gini", "entropy", "log_loss"], 
        'max_features': [None, "sqrt", "log2"], 
        'n_estimators': [10, 50]
    }
    
    param_grid_DT = {
        'criterion': ["gini", "entropy", "log_loss"], 
        'splitter': ["best", "random"], 
        'max_features': [None, "sqrt", "log2"]
    }
    
    param_grid_XGB = {
        'n_estimators': [100, 300], 
        'max_depth': [3, 6], 
        'learning_rate': [0.01, 0.1, 0.3], 
        'subsample': [0.8, 1.0], 
        'colsample_bytree': [0.7, 1.0], 
        'gamma': [1, 5]
    }
    
    # Create GridSearchCV objects for each model
    grid_log = GridSearchCV(LogisticRegression(random_state=42), param_grid_log, cv=5, scoring='f1', n_jobs=-1, refit=True)
    grid_RF = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_RF, cv=5, scoring='f1', n_jobs=-1, refit=True, verbose=3)
    grid_DT = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_DT, cv=5, scoring='f1', n_jobs=-1, refit=True, verbose=3)
    grid_XGB = GridSearchCV(XGBClassifier(random_state=42), param_grid_XGB, cv=5, scoring='f1', n_jobs=-1, refit=True, verbose=3)
    
    # List of models
    rfemodellist = [grid_log, grid_RF, grid_DT, grid_XGB]

    for grid in rfemodellist:
        print(f"Fitting GridSearchCV for {grid.estimator.__class__.__name__}...")
        
        # Fit GridSearchCV
        grid.fit(indep_X, dep_Y)

        # Get the best model
        best_model = grid.best_estimator_

        # Apply RFE using the best model
        log_rfe = RFE(best_model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        
        rfelist.append(log_rfe_feature)

    return rfelist
    

def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.2, random_state = 0)
        #X_train, X_test, y_train, y_test = train_test_split(indep_X,dep_Y, test_size = 0.2, random_state = 0)
        
        #Feature Scaling
        #from sklearn.preprocessing import StandardScaler
        #sc = StandardScaler()
        #X_train = sc.fit_transform(X_train)
        #X_test = sc.transform(X_test)
        #from imblearn.over_sampling import RandomOverSampler
        #ros = RandomOverSampler(random_state=42)
        #X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
                
        return X_train, X_test, y_train, y_test
    
def cm_prediction(classifier,X_test):
     y_pred = classifier.predict(X_test)
        
        # Making the Confusion Matrix
     from sklearn.metrics import confusion_matrix
     cm = confusion_matrix(y_test, y_pred)
        
     from sklearn.metrics import accuracy_score 
     from sklearn.metrics import classification_report 
        #from sklearn.metrics import confusion_matrix
        #cm = confusion_matrix(y_test, y_pred)
        
     Accuracy=accuracy_score(y_test, y_pred )
        
     report=classification_report(y_test, y_pred)
     return  classifier,Accuracy,report,X_test,y_test,cm

def logistic(X_train_resampled,y_train_resampled,X_test):       
        from sklearn.linear_model import LogisticRegression
        param_grid = {'C': [0.01, 0.1, 1, 10,100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga','lbfgs'],'max_iter': [50,100],'random_state':[42]}
        grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1', n_jobs=-1,refit=True)
        grid_search.fit(X_train_resampled, y_train_resampled)
        grid_search,Accuracy,report,X_test,y_test,cm=cm_prediction(grid_search,X_test)
        return  grid_search,Accuracy,report,X_test,y_test,cm 

def Decision(X_train_resampled,y_train_resampled,X_test):
        from sklearn.tree import DecisionTreeClassifier
        param_grid={'criterion':["gini", "entropy", "log_loss"], 'splitter': ["best", "random"],'max_features': [None,"sqrt", "log2"], 'random_state':[42]}
        grid_search= GridSearchCV(DecisionTreeClassifier(), param_grid, refit= True, verbose=3, n_jobs=-1, scoring= 'f1' )
        grid_search.fit(X_train_resampled,y_train_resampled)
        grid_search,Accuracy,report,X_test,y_test,cm=cm_prediction(grid_search,X_test)
        return  grid_search,Accuracy,report,X_test,y_test,cm

def random(X_train_resampled,y_train_resampled,X_test):
        
        from sklearn.ensemble import RandomForestClassifier
        param_grid={'criterion':["gini", "entropy", "log_loss"],'max_features': [None,"sqrt", "log2"], 'n_estimators':[10,50,100],'random_state':[42]}
        grid_search= GridSearchCV(RandomForestClassifier(), param_grid, refit= True, verbose=3, n_jobs=-1, scoring= 'f1' )
        grid_search.fit(X_train_resampled, y_train_resampled)
        grid_search,Accuracy,report,X_test,y_test,cm=cm_prediction(grid_search,X_test)
        return  grid_search,Accuracy,report,X_test,y_test,cm

def Xgboost(X_train_resampled,y_train_resampled,X_test):
        from xgboost import XGBClassifier
        from collections import Counter
        counter = Counter(y_train)  # Count class occurrences
        ratio = counter[0] / counter[1]
        param_grid = {'n_estimators': [100, 300],'max_depth': [3, 6],'learning_rate': [0.01, 0.1, 0.3],'subsample': [ 0.8, 1.0],'colsample_bytree': [0.7, 1.0],'gamma': [1, 5]}
        grid_search = GridSearchCV(XGBClassifier(), param_grid, refit= True, verbose=3, n_jobs=-1, scoring= 'f1' )
        grid_search.fit(X_train_resampled, y_train_resampled)
        grid_search,Accuracy,report,X_test,y_test,cm=cm_prediction(grid_search,X_test)
        return  grid_search,Accuracy,report,X_test,y_test,cm

In [27]:
import warnings
warnings.filterwarnings('ignore')

In [28]:
dataset= pd.read_csv("Preprocessed_dataset.csv")
dataset1=pd.get_dummies(dataset,drop_first=True,dtype=int)

In [29]:
indep_x= dataset1.drop('Conversion',axis=1)
dep_y=dataset1['Conversion']

In [30]:
from sklearn.model_selection import GridSearchCV

rfelist=rfeFeature(indep_x,dep_y,12)
rfelist

Fitting GridSearchCV for LogisticRegression...
Fitting GridSearchCV for RandomForestClassifier...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting GridSearchCV for DecisionTreeClassifier...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting GridSearchCV for XGBClassifier...
Fitting 5 folds for each of 96 candidates, totalling 480 fits


[array([[0.04391851, 0.08803141, 2.39901653, ..., 0.        , 0.        ,
         0.        ],
        [0.15572507, 0.18272468, 2.91713775, ..., 0.        , 0.        ,
         0.        ],
        [0.27749037, 0.07642272, 8.2236191 , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.05652592, 0.13382612, 2.85324058, ..., 0.        , 1.        ,
         0.        ],
        [0.02396129, 0.13838618, 1.00296447, ..., 0.        , 1.        ,
         0.        ],
        [0.1856701 , 0.05722808, 6.96473936, ..., 0.        , 0.        ,
         0.        ]]),
 array([[1.36912000e+05, 6.49787007e+03, 4.39185107e-02, ...,
         9.00000000e+00, 4.00000000e+00, 6.88000000e+02],
        [4.17600000e+04, 3.89866861e+03, 1.55725071e-01, ...,
         7.00000000e+00, 2.00000000e+00, 3.45900000e+03],
        [8.84560000e+04, 1.54642960e+03, 2.77490369e-01, ...,
         2.00000000e+00, 8.00000000e+00, 2.33700000e+03],
        ...,
        [1.25471000e+05, 4.60953464e

In [25]:
#12
for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_y)   
    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
        
    classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train_resampled,y_train_resampled,X_test)
    print("Logistic Regression Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train_resampled,y_train_resampled,X_test)
    print("\n Decision Tree Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=random(X_train_resampled,y_train_resampled,X_test)
    print("\n Random Forest Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Xgboost(X_train_resampled,y_train_resampled,X_test)
    print("\n XgboostClassification Report:")
    print(cm)
    print(report)
    

Logistic Regression Classification Report:
[[137  57]
 [428 978]]
              precision    recall  f1-score   support

           0       0.24      0.71      0.36       194
           1       0.94      0.70      0.80      1406

    accuracy                           0.70      1600
   macro avg       0.59      0.70      0.58      1600
weighted avg       0.86      0.70      0.75      1600

Fitting 5 folds for each of 18 candidates, totalling 90 fits

 Decision Tree Classification Report:
[[  63  131]
 [ 140 1266]]
              precision    recall  f1-score   support

           0       0.31      0.32      0.32       194
           1       0.91      0.90      0.90      1406

    accuracy                           0.83      1600
   macro avg       0.61      0.61      0.61      1600
weighted avg       0.83      0.83      0.83      1600

Fitting 5 folds for each of 27 candidates, totalling 135 fits

 Random Forest Classification Report:
[[  56  138]
 [  40 1366]]
              precision  

In [31]:
#12
for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_y)
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(sampling_strategy=1,random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
        
    classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train_resampled,y_train_resampled,X_test)
    print("Logistic Regression Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train_resampled,y_train_resampled,X_test)
    print("\n Decision Tree Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=random(X_train_resampled,y_train_resampled,X_test)
    print("\n Random Forest Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Xgboost(X_train_resampled,y_train_resampled,X_test)
    print("\n XgboostClassification Report:")
    print(cm)
    print(report)

Logistic Regression Classification Report:
[[137  57]
 [425 981]]
              precision    recall  f1-score   support

           0       0.24      0.71      0.36       194
           1       0.95      0.70      0.80      1406

    accuracy                           0.70      1600
   macro avg       0.59      0.70      0.58      1600
weighted avg       0.86      0.70      0.75      1600

Fitting 5 folds for each of 18 candidates, totalling 90 fits

 Decision Tree Classification Report:
[[  63  131]
 [ 217 1189]]
              precision    recall  f1-score   support

           0       0.23      0.32      0.27       194
           1       0.90      0.85      0.87      1406

    accuracy                           0.78      1600
   macro avg       0.56      0.59      0.57      1600
weighted avg       0.82      0.78      0.80      1600

Fitting 5 folds for each of 27 candidates, totalling 135 fits

 Random Forest Classification Report:
[[  45  149]
 [  50 1356]]
              precision  

In [32]:
#12
for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_y)
    from imblearn.over_sampling import ADASYN
    adasyn = ADASYN(random_state=42)
    X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)
    
        
    classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train_resampled,y_train_resampled,X_test)
    print("Logistic Regression Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train_resampled,y_train_resampled,X_test)
    print("\n Decision Tree Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=random(X_train_resampled,y_train_resampled,X_test)
    print("\n Random Forest Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Xgboost(X_train_resampled,y_train_resampled,X_test)
    print("\n XgboostClassification Report:")
    print(cm)
    print(report)

Logistic Regression Classification Report:
[[141  53]
 [454 952]]
              precision    recall  f1-score   support

           0       0.24      0.73      0.36       194
           1       0.95      0.68      0.79      1406

    accuracy                           0.68      1600
   macro avg       0.59      0.70      0.57      1600
weighted avg       0.86      0.68      0.74      1600

Fitting 5 folds for each of 18 candidates, totalling 90 fits

 Decision Tree Classification Report:
[[  65  129]
 [ 176 1230]]
              precision    recall  f1-score   support

           0       0.27      0.34      0.30       194
           1       0.91      0.87      0.89      1406

    accuracy                           0.81      1600
   macro avg       0.59      0.60      0.59      1600
weighted avg       0.83      0.81      0.82      1600

Fitting 5 folds for each of 27 candidates, totalling 135 fits

 Random Forest Classification Report:
[[  43  151]
 [  40 1366]]
              precision  

In [33]:
#12
for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_y)
    from imblearn.over_sampling import BorderlineSMOTE
    smote_border = BorderlineSMOTE(sampling_strategy=0.75, random_state=42, kind='borderline-1')
    X_train_resampled, y_train_resampled = smote_border.fit_resample(X_train, y_train)

    
        
    classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train_resampled,y_train_resampled,X_test)
    print("Logistic Regression Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train_resampled,y_train_resampled,X_test)
    print("\n Decision Tree Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=random(X_train_resampled,y_train_resampled,X_test)
    print("\n Random Forest Classification Report:")
    print(cm)
    print(report)
    
    classifier,Accuracy,report,X_test,y_test,cm=Xgboost(X_train_resampled,y_train_resampled,X_test)
    print("\n XgboostClassification Report:")
    print(cm)
    print(report)

Logistic Regression Classification Report:
[[ 123   71]
 [ 276 1130]]
              precision    recall  f1-score   support

           0       0.31      0.63      0.41       194
           1       0.94      0.80      0.87      1406

    accuracy                           0.78      1600
   macro avg       0.62      0.72      0.64      1600
weighted avg       0.86      0.78      0.81      1600

Fitting 5 folds for each of 18 candidates, totalling 90 fits

 Decision Tree Classification Report:
[[  71  123]
 [ 177 1229]]
              precision    recall  f1-score   support

           0       0.29      0.37      0.32       194
           1       0.91      0.87      0.89      1406

    accuracy                           0.81      1600
   macro avg       0.60      0.62      0.61      1600
weighted avg       0.83      0.81      0.82      1600

Fitting 5 folds for each of 27 candidates, totalling 135 fits

 Random Forest Classification Report:
[[  42  152]
 [  42 1364]]
              precisi