# Compute Feature (Permutation) Importance 

In [None]:
from next_assessment_dynamic_preprocessing import targets  
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
import pickle


# Load the cached feature matrix and the target table from their pickle files.
X = pd.read_pickle('data/cached_X_imputed.pckl')
y = pd.read_pickle('data/cached_y.pckl')

# For each target: load its pickled model, compute permutation importances
# scored by ROC AUC, and write the per-feature means to a JSON file.
for target in targets:
    if target not in y.columns:
        print(f"Target '{target}' not found in y. Skipping...")
        continue

    # NOTE: unpickling runs arbitrary code; only load model files you trust.
    with open(f'calibrated_model_{target}.pkl', 'rb') as file:
        model = pickle.load(file)

    # Fixed random_state keeps the shuffles reproducible between runs.
    perm_importances = permutation_importance(
        model,
        X,
        y[target],
        scoring='roc_auc',
        n_repeats=10,
        random_state=42,
        n_jobs=4,
    )

    # One row per feature column, most important first.
    feature_importances_df = pd.DataFrame({
        'Feature': X.columns.tolist(),
        'Importance': perm_importances.importances_mean,
    })
    feature_importances_df = feature_importances_df.sort_values(
        by='Importance', ascending=False
    )
    feature_importances_df.to_json(f'data/{target}-importances.json')

# Plot calculated importances

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob 
import os

json_folder = "data"
output_folder = None  # Set to None to display plots instead of saving

# Marker that ends the stem of every importance file; whatever precedes it in
# the file name is the target name. Used for both the glob pattern and the
# target extraction so the two cannot drift apart.
importance_suffix = "-importances"
# Default right-hand x-axis limit (presumably fixed so plots for different
# targets share a scale). It is only widened when a bar would not fit.
default_xmax = 0.15

# glob() yields files in arbitrary, filesystem-dependent order; sorting makes
# the sequence of plots reproducible.
for json_file in sorted(glob(f'{json_folder}/*{importance_suffix}.json')):
    # Extract the target from the file name by dropping the extension and the
    # trailing suffix (the glob pattern guarantees the suffix is present).
    file_stem = os.path.splitext(os.path.basename(json_file))[0]
    target = file_stem[:-len(importance_suffix)]
    feature_importances = pd.read_json(json_file)
    # Keep the (up to) 20 rows with the largest importance, largest first.
    top_features = feature_importances.nlargest(20, 'Importance')

    # Horizontal bar chart of the selected features
    plt.figure(figsize=(10, 6))
    plt.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
    plt.gca().invert_yaxis()  # Invert y-axis so the largest bar is on top
    plt.title(f"Top 20 Features - {target}", fontsize=14)
    plt.xlabel("Feature Importance", fontsize=12)
    plt.ylabel("Features", fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    # Use the default limit unless the largest bar exceeds it, in which case
    # widen the axis (with 5% headroom) instead of silently truncating the bar.
    # A NaN maximum (e.g. an empty file) falls back to the default limit.
    largest_importance = top_features['Importance'].max()
    plt.xlim(0, max(default_xmax, largest_importance * 1.05))

    # Save or show the plot
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
        plot_path = os.path.join(output_folder, f"{target}_top_features.png")
        plt.savefig(plot_path, bbox_inches='tight')
    else:
        plt.show()

    # Release the figure so memory does not grow with the number of targets.
    plt.close()