In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.base import is_classifier
class FeatureSelector:
    """Thin wrapper around scikit-learn's RFECV for feature selection.

    The selector is fitted through :meth:`fit_transform`; every other method
    reads attributes of the fitted ``RFECV`` object stored in ``self.rfecv``
    and raises ``ValueError`` if no fit has happened yet.

    Args:
        model: Estimator handed to ``RFECV`` as ``estimator``.
        cv: Cross-validation setting forwarded to ``RFECV``.
        scoring: Scoring setting forwarded to ``RFECV``.
    """

    _NOT_FITTED_MSG = "The model has not been fitted. Call fit_transform method first."

    def __init__(self, model, cv=5, scoring='accuracy'):
        self.model = model
        self.cv = cv
        self.scoring = scoring
        self.rfecv = None  # populated by fit_transform

    def _require_fitted(self):
        """Return the fitted RFECV object or raise ValueError if absent."""
        if self.rfecv is None:
            raise ValueError(self._NOT_FITTED_MSG)
        return self.rfecv

    def fit_transform(self, X, y):
        """Fit RFECV (one feature eliminated per step) and return reduced X."""
        self.rfecv = RFECV(estimator=self.model, step=1, cv=self.cv, scoring=self.scoring)
        self.rfecv.fit(X, y)
        return self.transform(X)

    def transform(self, X):
        """Keep only the columns RFECV selected.

        Args:
            X: 2-D array supporting ``X[:, mask]`` indexing, or a pandas
                object exposing ``.iloc`` (columns are then selected
                positionally and the pandas type is preserved).

        Raises:
            ValueError: If the selector has not been fitted.
        """
        support = self._require_fitted().support_
        if hasattr(X, "iloc"):
            # DataFrames do not accept X[:, mask]; select by position instead.
            return X.iloc[:, support]
        return X[:, support]

    def get_selected_features(self, feature_names):
        """Return the entries of ``feature_names`` whose support flag is set.

        ``feature_names`` is indexed positionally, so it must be ordered like
        the columns the selector was fitted on.
        """
        support = self._require_fitted().support_
        return [feature_names[i] for i, is_selected in enumerate(support) if is_selected]

    def get_feature_ranking(self, feature_names):
        """Return RFECV's ``ranking_`` as a Series indexed by ``feature_names``."""
        ranking = self._require_fitted().ranking_
        return pd.Series(ranking, index=feature_names)

    def get_selected_feature_indices(self):
        """Return the positional indices of the selected features."""
        return np.where(self._require_fitted().support_)[0]

    def get_num_selected_features(self):
        """Return RFECV's ``n_features_`` (count of selected features)."""
        return self._require_fitted().n_features_

    def remove_features(self, X, model_name, importance_threshold=0.01):
        """Report and drop selected features with low estimator importance.

        Importances are read from ``rfecv.estimator_.feature_importances_``
        and therefore line up with the RFECV-selected features only.

        Args:
            X: DataFrame whose ``columns`` are ordered like the data the
                selector was fitted on.
            model_name: Label printed in the report header.
            importance_threshold: Features with importance strictly below
                this value are removed. Must lie in ``[0, 1]``.

        Returns:
            ``X`` itself when nothing falls below the threshold; otherwise a
            copy of ``X`` without the low-importance columns. (Previously
            this branch only printed the names and returned ``None``.)

        Raises:
            ValueError: If the selector is not fitted or the threshold is
                outside ``[0, 1]``.
        """
        rfecv = self._require_fitted()

        if not 0 <= importance_threshold <= 1:
            raise ValueError("importance_threshold should be a value between 0 and 1.")

        print("--------------------")
        print(f"Model : {model_name}")

        # Names of the RFECV-selected features, in the same order as the
        # fitted estimator's importances.
        feature_names = self.get_selected_features(X.columns)
        feature_importances = np.asarray(rfecv.estimator_.feature_importances_)

        num_features_to_remove = int(np.sum(feature_importances < importance_threshold))
        if num_features_to_remove == 0:
            print("No features need to be removed.")
            return X

        print("Feature Importances:")
        for feature_name, importance in zip(feature_names, feature_importances):
            print(f"{feature_name}: {importance}")

        # The k smallest importances are exactly those below the threshold;
        # argsort lists them from least to most important.
        feature_indices_to_remove = np.argsort(feature_importances)[:num_features_to_remove]
        names_to_remove = [feature_names[idx] for idx in feature_indices_to_remove]

        print("                 ----------------------------                       ")
        print("Features to be removed:")
        for name in names_to_remove:
            print(f"  - {name}")

        return X.drop(columns=names_to_remove)


# Container of candidate estimators; its `model_dict` maps a name to an estimator.
models = ModelClassifier()

for model_name, model in models.model_dict.items():
    # Only the estimator whose name equals `best_model` goes through selection.
    if model_name != best_model:
        continue

    feature_selector = FeatureSelector(model=model, cv=5, scoring='accuracy')

    # RFECV is fitted on the raw training arrays.
    X = X_train.values
    y = y_train.values
    X_train_selected = feature_selector.fit_transform(X, y)

    # Apply the same column selection to the held-out data.
    x_val = X_test.values
    X_val_selected = feature_selector.transform(x_val)

    # Show which column positions survived before the importance report.
    selected_feature_indices = feature_selector.get_selected_feature_indices()
    print("Selected feature indices before removal:", selected_feature_indices)

    # Pass the DataFrame (not the array) because column names are needed.
    feature_selector.remove_features(X=X_train, model_name=model_name)


## Optuna tuning for CatBoost (without a split)


In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

class OptunaTuner_Catboost:
    """Optuna hyperparameter search that builds a fresh CatBoostClassifier per trial.

    Args:
        model: Stored on the instance; the search itself always constructs
            new ``CatBoostClassifier`` objects and does not use it.
        params: Mapping of parameter name to a ``(low, high)`` pair. Pairs of
            two ints are sampled as integers, anything else as floats.
        X: Features used both to fit and to score every trial.
        y: Targets used both to fit and to score every trial.
    """

    def __init__(self, model, params, X, y):
        self.model = model
        self.params = params
        self.X = X
        self.y = y

    def Objective(self, trial):
        """Sample parameters from ``trial``, fit CatBoost and return AUC-ROC.

        The score is computed on the same ``X``/``y`` the model was fitted on.

        Raises:
            ValueError: If a parameter's low bound exceeds its high bound.
        """
        sampled = {}
        for name, bounds in self.params.items():
            low, high = bounds[0], bounds[1]
            if not low <= high:
                raise ValueError(f"Invalid range for {name}: low={low}, high={high}")

            # Integer sampling only when both bounds are ints.
            both_int = isinstance(low, int) and isinstance(high, int)
            suggest = trial.suggest_int if both_int else trial.suggest_float
            sampled[name] = suggest(name, low, high)

        candidate = CatBoostClassifier(**sampled)
        candidate.fit(self.X, self.y)

        # Column 1 of predict_proba is taken as the positive-class probability.
        positive_probs = candidate.predict_proba(self.X)[:, 1]
        return roc_auc_score(self.y, positive_probs)

    def tune(self, n_trials=100):
        """Run the study and return ``(best AUC-ROC, refitted best model)``.

        Args:
            n_trials: Number of Optuna trials to run.
        """
        study = optuna.create_study(direction="maximize")  # higher AUC-ROC is better
        study.optimize(self.Objective, n_trials=n_trials)

        winning_params = study.best_params
        print(f"Best parameters: {winning_params}")

        # Trial models are discarded, so train a new one with the winning parameters.
        refit_model = CatBoostClassifier(**winning_params)
        refit_model.fit(self.X, self.y)

        top_score = study.best_value
        print(f"Best AUC Score: {top_score}")

        return top_score, refit_model

### Optuna with Splits


In [None]:
import optuna
from sklearn.metrics import roc_auc_score

class OptunaTuner:
    def __init__(self, model, params, X, y):
        self.model = model
        self.params = params
        self.X = X
        self.y = y

    def Objective(self, trial):
        param_values = {}
        for key, value_range in self.params.items():
            if value_range[0] <= value_range[1]:
                if isinstance(value_range[0], int) and isinstance(value_range[1], int):
                    param_values[key] = trial.suggest_int(key, value_range[0], value_range[1])
                else:
                    param_values[key] = trial.suggest_float(key, value_range[0], value_range[1])
            else:
                raise ValueError(f"Invalid range for {key}: low={value_range[0]}, high={value_range[1]}")

        self.model.set_params(**param_values)

        # Evaluate the model using AUC-ROC
        self.model.fit(self.X, self.y)  # Fit the model on the data
        y_probs = self.model.predict_proba(self.X)[:, 1]  # Get predicted probabilities for the positive class
        auc_roc = roc_auc_score(self.y, y_probs)
        return auc_roc

    def tune(self, n_trials=100):
        study = optuna.create_study(direction="maximize")  # maximize AUC-ROC
        study.optimize(self.Objective, n_trials=n_trials)

        best_params = study.best_params
        print(f"Best parameters: {best_params}")
        self.model.set_params(**best_params)

        print(f"Best AUC Score: {study.best_value}")

        # Here, we return both the tuned model and the best AUC-ROC score
        return study.best_value, self.model
