# In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

def analyze_param_importance(csv_path, target_col="val_mae", output_dir=None):
    """Rank hyperparameters by their influence on a target metric.

    Reads a CSV with one row per run, fits a random forest that predicts
    ``target_col`` from every other column (except the bookkeeping columns
    ``tag`` and ``error``), and saves a horizontal bar chart of the feature
    importances as ``hparam_importance.png``.

    Args:
        csv_path: Path to the results CSV.
        target_col: Name of the metric column to explain.
        output_dir: Directory the plot is written to. Defaults to the
            directory containing ``csv_path`` (the current directory when
            ``csv_path`` has no directory part). Created if missing.

    Returns:
        A DataFrame indexed by hyperparameter name and sorted by descending
        importance, with columns ``hyperparameter``, ``importance`` (random
        forest feature importance) and ``correlation`` (correlation of the
        column with the target; for label-encoded columns this is computed
        on the integer codes).

    Raises:
        KeyError: If ``target_col`` is not a column of the CSV.
        ValueError: If no complete rows or no hyperparameter columns remain
            after cleaning.
    """
    df = pd.read_csv(csv_path)
    if target_col not in df.columns:
        raise KeyError(
            f"target column {target_col!r} not found in {csv_path!r}"
        )

    # Remove the bookkeeping columns *before* dropping incomplete rows, so a
    # blank "tag"/"error" cell does not disqualify an otherwise complete run.
    # NOTE(review): presumably "error" is blank for successful runs -- with
    # the previous order every such run was discarded; confirm against the
    # CSV producer.
    metadata_cols = [c for c in ("tag", "error") if c != target_col]
    df = df.drop(columns=metadata_cols, errors="ignore").dropna()

    X = df.drop(columns=[target_col])
    y = df[target_col]
    if X.empty:
        raise ValueError(
            f"no usable data in {csv_path!r}: need at least one complete row "
            "and one hyperparameter column"
        )

    # The forest needs numeric input: map every non-numeric column (object or
    # pandas string dtype) to integer codes.
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            X[col] = LabelEncoder().fit_transform(X[col])

    # Fixed seed keeps the importances reproducible between calls.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    importances = model.feature_importances_
    correlations = X.corrwith(y)

    importance_df = pd.DataFrame({
        "hyperparameter": X.columns,
        "importance": importances,
        "correlation": correlations,
    }).sort_values("importance", ascending=False)

    if output_dir is None:
        # dirname() is "" for a bare filename, and os.makedirs("") raises
        # FileNotFoundError, so fall back to the current directory.
        output_dir = os.path.dirname(csv_path) or "."
    os.makedirs(output_dir, exist_ok=True)

    # The figure is left open after saving (unchanged behaviour) so notebook
    # sessions still display it; callers invoking this in a loop should
    # plt.close() it themselves.
    plt.figure(figsize=(8, 5))
    ax1 = plt.gca()
    importance_df.plot.barh(
        x="hyperparameter", y="importance", ax=ax1,
        color="steelblue", legend=False
    )
    ax1.set_xlabel("Random Forest Importance")
    ax1.set_title("Hyperparameter Importance")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "hparam_importance.png"))

    # The table is returned rather than written to disk; callers that want a
    # CSV can use importance_df.to_csv(...).
    return importance_df