In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, roc_auc_score
import matplotlib.pyplot as plt
import lightgbm as lgb

class PermutationFeatureSelector:
    """Rank and select features by permutation importance on a held-out set.

    Every score used by this class follows a "higher is better" convention:
    error metrics (``rmse``, ``mae``, ``mape``) are negated, while ``r2`` and
    ``auc`` are used as returned by scikit-learn. The importance of a feature
    is the base score minus the mean score obtained after shuffling that
    feature's column, so a positive value means shuffling hurt the model.

    Args:
        model: Fitted estimator exposing ``predict(X)``. A ``lightgbm.Booster``
            is detected and predicted with its ``best_iteration``.
        X_test: pandas DataFrame of evaluation features (``.iloc`` and
            ``.columns`` are used).
        y_test: Ground-truth targets aligned with ``X_test``.
        metric: One of ``'rmse'``, ``'mae'``, ``'r2'``, ``'mape'``, ``'auc'``.
        n_repeats: Number of shuffles per feature; must be at least 1.
        random_state: Integer seed for the shuffles. ``None`` draws from
            NumPy's global random state.
        threshold: ``choose_feat`` keeps features whose importance is strictly
            greater than this value.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1 or ``metric`` is not
            supported (the base score is computed at construction time).
    """

    def __init__(self, model, X_test, y_test, metric='rmse', n_repeats=30, random_state=None, threshold=0):
        if n_repeats < 1:
            # With zero repeats the mean of the permuted scores is NaN, which
            # would silently poison every importance value.
            raise ValueError(f"n_repeats must be at least 1, got {n_repeats}")
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.metric = metric
        self.n_repeats = n_repeats
        self.random_state = random_state
        self.threshold = threshold
        self.use_wrapper = self._should_use_wrapper()
        self.base_score = self._calculate_base_score()
        self.perm_importance = None

    def _should_use_wrapper(self):
        """Return True when the model is a raw ``lightgbm.Booster``."""
        return isinstance(self.model, lgb.Booster)

    class LGBMWrapper:
        """Thin adapter giving a ``lightgbm.Booster`` a ``score`` method."""

        def __init__(self, model):
            self.model = model

        def predict(self, X):
            """Predict with the booster, limited to its ``best_iteration``."""
            return self.model.predict(X, num_iteration=self.model.best_iteration)

        def score(self, X, y, metric):
            """Return the higher-is-better score of the booster on ``(X, y)``."""
            return PermutationFeatureSelector._score_predictions(y, self.predict(X), metric)

    @staticmethod
    def _score_predictions(y_true, preds, metric):
        """Convert predictions into a higher-is-better score for ``metric``.

        Raises:
            ValueError: If ``metric`` is not one of the supported names.
        """
        if metric == 'rmse':
            return -np.sqrt(mean_squared_error(y_true, preds))
        if metric == 'mae':
            return -mean_absolute_error(y_true, preds)
        if metric == 'r2':
            return r2_score(y_true, preds)
        if metric == 'mape':
            return -mean_absolute_percentage_error(y_true, preds)
        if metric == 'auc':
            return roc_auc_score(y_true, preds)
        raise ValueError(f"Unsupported metric: {metric}")

    def _score_model(self, X):
        """Score the model on ``X`` against ``y_test`` with the chosen metric."""
        if self.use_wrapper:
            return self.LGBMWrapper(self.model).score(X, self.y_test, self.metric)
        return self._score_predictions(self.y_test, self.model.predict(X), self.metric)

    def _calculate_base_score(self):
        """Return the score of the model on the unshuffled test set."""
        return self._score_model(self.X_test)

    def _get_perm_importance(self):
        """Return the cached importances, computing them on first use."""
        if self.perm_importance is None:
            return self.calculate_permutation_importance()
        return self.perm_importance

    def calculate_permutation_importance(self):
        """Compute, cache and return the permutation importance of each column.

        Returns:
            numpy.ndarray: One value per column of ``X_test``, equal to the
            base score minus the mean score over ``n_repeats`` shuffles of
            that column.
        """
        # A private generator keeps results reproducible without reseeding
        # NumPy's global state; with no seed the global stream is used.
        if self.random_state is None:
            permute = np.random.permutation
        else:
            permute = np.random.RandomState(self.random_state).permutation

        n_features = self.X_test.shape[1]
        feature_importances = np.zeros(n_features)

        for col in range(n_features):
            # One working copy per feature is enough: every repeat overwrites
            # the same column with a fresh shuffle of the original values.
            X_permuted = self.X_test.copy()
            original_values = self.X_test.iloc[:, col].to_numpy()
            scores = np.zeros(self.n_repeats)
            for n in range(self.n_repeats):
                X_permuted.iloc[:, col] = permute(original_values)
                # The DataFrame itself is scored (for Booster models too) so
                # the permuted scores are computed on the same kind of input
                # as the base score.
                scores[n] = self._score_model(X_permuted)
            feature_importances[col] = self.base_score - np.mean(scores)

        self.perm_importance = feature_importances
        return feature_importances

    def plot_permutation_importance(self):
        """Show a horizontal bar chart of importances, largest at the top.

        Bars are blue for positive, red for negative and gray for zero
        importance. Importances are computed first if not already cached.
        """
        perm_importance = self._get_perm_importance()
        perm_importance_df = pd.DataFrame({
            'Feature': self.X_test.columns,
            'Importance': perm_importance
        }).sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 8))
        colors = perm_importance_df['Importance'].apply(lambda x: 'red' if x < 0 else ('blue' if x > 0 else 'gray'))
        plt.barh(perm_importance_df['Feature'], perm_importance_df['Importance'], color=colors)
        plt.xlabel('Mean Accuracy Decrease')
        plt.ylabel('Feature')
        plt.title('Permutation Importance')
        # barh draws the first row at the bottom; flip so the top-ranked
        # feature appears first.
        plt.gca().invert_yaxis()
        plt.show()

    def choose_feat(self):
        """Select the features whose importance exceeds ``threshold``.

        Returns:
            tuple: ``(names, frame)`` where ``names`` lists the selected
            column names in their original column order and ``frame`` is a
            DataFrame with ``Feature`` and ``Importance`` columns sorted by
            descending importance.
        """
        perm_importance = self._get_perm_importance()
        keep = perm_importance > self.threshold
        kept_columns = self.X_test.columns[keep]

        chosen_features = kept_columns.tolist()
        chosen_features_df = pd.DataFrame({
            'Feature': kept_columns,
            'Importance': perm_importance[keep]
        }).sort_values(by='Importance', ascending=False)

        return chosen_features, chosen_features_df
    

# Build the selector on the held-out data; the base score is computed here.
# `model`, `X_test` and `y_test` are expected to exist from earlier cells.
permutation_importance = PermutationFeatureSelector(
    model=model,
    X_test=X_test,
    y_test=y_test,
    metric='rmse',
    random_state=42,
    threshold=1,
)

# Plotting triggers the importance computation; the result is cached on the
# selector and reused by choose_feat() below.
permutation_importance.plot_permutation_importance()

# Keep only the features whose importance is strictly above the threshold.
selected_features, selected_features_df = permutation_importance.choose_feat()
display(len(selected_features))
display(selected_features_df)
     