# Feature Importance Analysis
Este notebook carga la configuración de un experimento ya ejecutado, entrena un modelo por cada seed y grafica la importancia de los features promediada entre todas las seeds.


## Imports


In [None]:
from dmeyf2025.modelling.train_model import train_models
from dmeyf2025.experiments.experiments import load_config
from dmeyf2025.etl import ETL
from dmeyf2025.processors.target_processor import BinaryTargetProcessor, CreateTargetProcessor
from dmeyf2025.processors.feature_processors import DeltaLagTransformer
import json
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults applied to the figures created below
# (individual cells may still override the figure size).
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_style('whitegrid')


## Configuración del Experimento


In [None]:
# Path of an experiment that has already been run. The code below reads
# "best_params.json" and "config.yaml" from this directory.
experiment_path = "experiments/DeltaLags2_0.1_50_01-04_G_0.0.3"

# Load the stored model parameters. A context manager is used so the file
# handle is closed deterministically instead of being left open until
# garbage collection.
params_path = os.path.join(experiment_path, "best_params.json")
with open(params_path, encoding="utf-8") as params_file:
    params = json.load(params_file)

# Load the experiment configuration through the project helper.
config = load_config(os.path.join(experiment_path, "config.yaml"))

# Echo the experiment metadata so the notebook output is self-describing.
print(f"Experimento: {config['experiment']['name']}")
print(f"Descripción: {config['experiment']['description']}")
print(f"Seeds: {config['experiment']['seeds']}")
print("\nParámetros del modelo:")
for key, value in params.items():
    # The seed entry is skipped here; seeds are listed from the config above.
    if key != 'seed':
        print(f"  {key}: {value}")


## Cargar Parámetros de la Configuración


In [None]:
# Pull the settings used by the rest of the notebook out of the two config
# sections that hold them.
data_cfg = config["data"]
experiment_cfg = config["experiment"]

train_months = data_cfg["train_months"]
eval_month = data_cfg["eval_month"]
positive_class = experiment_cfg["positive_classes"]
seeds = experiment_cfg["seeds"]
data_path = experiment_cfg["raw_data_path"]

# Drop the stored seed (when present) from the model parameters.
if "seed" in params:
    del params["seed"]

print(f"Meses de entrenamiento: {train_months}")
print(f"Mes de evaluación: {eval_month}")
print(f"Clases positivas: {positive_class}")
print(f"Número de seeds: {len(seeds)}")


## Procesamiento de Datos


In [None]:
print("Cargando datos...")

# Run the project ETL pipeline. It returns six values; only the first two
# (features and target) are used in this notebook.
target_builder = CreateTargetProcessor()
etl = ETL(data_path, target_builder, train_months=train_months)
X, y, _, _, _, _ = etl.execute_complete_pipeline()

print(f"Shape inicial: {X.shape}")

# Convert the target using the positive classes taken from the config.
# NOTE(review): presumably this yields a 0/1 integer target, since
# np.bincount is applied to it below — confirm against the processor.
target_processor = BinaryTargetProcessor(positive_class)
X, y = target_processor.fit_transform(X, y)

class_counts = np.bincount(y)
print(f"Distribución de clases: {class_counts}")


## Aplicar Transformaciones de Features


In [None]:
# Apply the Delta-Lag feature transformation.
# NOTE(review): n_deltas / n_lags are hard-coded here; presumably they should
# match the values used when the experiment was originally run — confirm.
print("Aplicando transformación Delta-Lag...")
delta_lag_transformer = DeltaLagTransformer(n_deltas=2, n_lags=2)
X_transformed = delta_lag_transformer.fit_transform(X)

print(f"Shape después de transformación: {X_transformed.shape}")

# Index rows by client id.
X_transformed.set_index("numero_de_cliente", inplace=True)

# Attach the target POSITIONALLY. If `y` is a pandas Series, a plain column
# assignment aligns on index labels; after set_index the frame is indexed by
# client id rather than by y's original index, so labels could be silently
# misassigned or turned into NaN. np.asarray is a no-op when y is already an
# array.
# NOTE(review): assumes the transformer preserves row order — confirm.
label_values = np.asarray(y)
if len(label_values) != len(X_transformed):
    raise ValueError(
        f"El target tiene {len(label_values)} filas pero X_transformed tiene "
        f"{len(X_transformed)}; no se pueden asignar las etiquetas."
    )
X_transformed["label"] = label_values

# Split into training rows (months in train_months) and evaluation rows
# (the single eval_month); the label column is removed from both matrices.
train_mask = X_transformed["foto_mes"].isin(train_months)
X_train = X_transformed[train_mask]
y_train = X_train["label"]
X_train = X_train.drop(columns=["label"])

eval_mask = X_transformed["foto_mes"].isin([eval_month])
X_eval = X_transformed[eval_mask].drop(columns=["label"])

print(f"X_train shape: {X_train.shape}")
print(f"X_eval shape: {X_eval.shape}")


## Entrenamiento de Modelos


In [None]:
print("Entrenando modelos con diferentes seeds...")

# Train through the project helper, which returns the predictions and the
# collection of fitted models.
# NOTE(review): presumably one model is fitted per seed — confirm in
# train_models.
training_output = train_models(
    X_train, y_train, X_eval, params, seeds, experiment_path
)
preds, models = training_output

n_models = len(models)
print(f"\nModelos entrenados: {n_models}")
print(f"Predicciones shape: {preds.shape}")


## Análisis de Feature Importance


### Extraer Feature Importance de Todos los Modelos


In [None]:
# Collect the gain-based importance of every model, keyed by the column name
# derived from its seed. Building the mapping first gives an explicit list of
# seed columns instead of rediscovering them later by name prefix.
seed_importances = {
    f"importance_seed_{seed}": model.feature_importance(importance_type="gain")
    for model, seed in zip(models, seeds)
}
importance_cols = list(seed_importances)

# One row per feature, one column per seed.
importances = pd.DataFrame({"feature": X_train.columns, **seed_importances})

# Row-wise statistics across seeds. The std uses pandas' default (sample)
# estimator, so it is NaN when only one seed column exists.
per_seed = importances[importance_cols]
importances["importance_mean"] = per_seed.mean(axis=1)
importances["importance_std"] = per_seed.std(axis=1)
# Coefficient of variation; the small epsilon avoids division by zero for
# features whose mean importance is exactly 0.
importances["importance_cv"] = importances["importance_std"] / (importances["importance_mean"] + 1e-10)

# Most important features first.
importances = importances.sort_values(by="importance_mean", ascending=False)

print("Top 20 features más importantes (promediados por seed):")
print(importances[["feature", "importance_mean", "importance_std", "importance_cv"]].head(20))


### Visualización: Top Features por Importancia Media


In [None]:
# Horizontal bar chart of the top-N features by mean importance, with the
# across-seed standard deviation drawn as error bars.
top_n = 30
top_features = importances.head(top_n)

# head() returns fewer than top_n rows when the model has fewer features, so
# bar positions and the title are derived from the rows actually selected.
# Using range(top_n) would raise a shape mismatch in that case.
n_shown = len(top_features)
bar_positions = range(n_shown)

plt.figure(figsize=(12, 10))
plt.barh(bar_positions, top_features["importance_mean"], xerr=top_features["importance_std"], alpha=0.7)
plt.yticks(bar_positions, top_features["feature"])
plt.xlabel("Importancia Media (Gain)", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.title(f"Top {n_shown} Features por Importancia Media\n(Promediado sobre {len(seeds)} seeds)", fontsize=14, fontweight='bold')
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


### Visualización: Heatmap de Importancia por Seed


In [None]:
# Heatmap of per-seed importance for the leading features: one row per
# feature, one column per seed.
top_n_heatmap = 20
top_features_heatmap = importances.head(top_n_heatmap)

# Matrix of per-seed values plus the labels for both axes.
heatmap_data = top_features_heatmap[importance_cols].to_numpy()
feature_names = top_features_heatmap["feature"].to_numpy()
seed_labels = [f"Seed {seed}" for seed in seeds]

plt.figure(figsize=(14, 10))
sns.heatmap(
    heatmap_data,
    yticklabels=feature_names,
    xticklabels=seed_labels,
    cmap="YlOrRd",
    annot=False,
    fmt=".0f",
    cbar_kws={'label': 'Importancia (Gain)'},
)
plt.title(
    f"Heatmap de Importancia por Seed\nTop {top_n_heatmap} Features",
    fontsize=14,
    fontweight='bold',
)
plt.xlabel("Seed", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


### Visualización: Estabilidad de Features (Coeficiente de Variación)


In [None]:
# Scatter plot of mean importance against coefficient of variation for the
# top features: points low on the y-axis vary little between seeds.
top_n_scatter = 50
top_features_scatter = importances.head(top_n_scatter)

plt.figure(figsize=(12, 8))
plt.scatter(top_features_scatter["importance_mean"],
            top_features_scatter["importance_cv"],
            alpha=0.6, s=100)

# Label only the 10 most important features to keep the plot readable;
# head() copes with frames that have fewer than 10 rows.
for _, row in top_features_scatter.head(10).iterrows():
    plt.annotate(row["feature"],
                 xy=(row["importance_mean"], row["importance_cv"]),
                 xytext=(5, 5), textcoords='offset points',
                 fontsize=9, alpha=0.7)

plt.xlabel("Importancia Media (Gain)", fontsize=12)
plt.ylabel("Coeficiente de Variación (CV)", fontsize=12)
plt.title("Estabilidad de Features\nImportancia Media vs Variabilidad entre Seeds", fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Rank the same subset shown in the plot by stability. Reusing
# top_features_scatter (instead of a second hard-coded head(50)) keeps the
# table and the figure in sync when top_n_scatter is changed.
print("\nFeatures más estables (menor CV):")
stable_features = top_features_scatter.sort_values(by="importance_cv")
print(stable_features[["feature", "importance_mean", "importance_cv"]].head(10))


### Visualización: Distribución de Importancia por Seed


In [None]:
# Box plot showing, for each top feature, how its importance is distributed
# across seeds.
top_n_box = 15
top_features_box = importances.head(top_n_box)

# One float array per feature (row). Selecting the seed columns before
# converting avoids the object-dtype rows that iterrows() produces on a frame
# that also holds the string "feature" column.
labels = top_features_box["feature"].tolist()
box_data = list(top_features_box[importance_cols].to_numpy(dtype=float))

plt.figure(figsize=(12, 10))
bp = plt.boxplot(box_data, vert=False, patch_artist=True)
# Tick labels are set explicitly instead of through boxplot's `labels=`
# keyword, which is deprecated in Matplotlib 3.9+ (renamed `tick_labels`);
# this form works on both old and new versions. Boxes sit at positions 1..N.
plt.yticks(range(1, len(labels) + 1), labels)

# Fill the boxes (requires patch_artist=True above).
for patch in bp['boxes']:
    patch.set_facecolor('lightblue')
    patch.set_alpha(0.7)

plt.xlabel("Importancia (Gain)", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.title(f"Distribución de Importancia entre Seeds\nTop {top_n_box} Features", fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


### Resumen de Importancia por Tipo de Feature


In [None]:
# Analizar tipos de features
def classify_feature(feature_name):
    """Return the feature-type label for a feature name.

    The name is converted to a lower-case string and tested against an
    ordered list of substring rules; the first rule with a matching keyword
    determines the label. Names that match no rule are labelled 'Otros'.
    """
    lowered = str(feature_name).lower()

    # Order matters: earlier rules take precedence (e.g. a name containing
    # both 'delta' and 'visa' is labelled 'Delta').
    rules = (
        (('delta',), 'Delta'),
        (('lag',), 'Lag'),
        (('master', 'visa'), 'Tarjeta'),
        (('ctrx', 'mtransferencias', 'transfer'), 'Transacciones'),
        (('caja', 'cuenta', 'saldo'), 'Cuentas'),
        (('foto_mes',), 'Temporal'),
    )

    for keywords, category in rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Otros'

# Tag every feature with its type label.
importances["feature_type"] = importances["feature"].map(classify_feature)

# Aggregate importance statistics per feature type.
by_type = importances.groupby("feature_type")
type_summary = by_type.agg({
    "importance_mean": ["sum", "mean", "count"],
    "importance_std": "mean"
}).round(2)

print("\nResumen de importancia por tipo de feature:")
print(type_summary)

# Total mean importance per type, largest first, for the bar chart.
type_importance = by_type["importance_mean"].sum().sort_values(ascending=False)
x_positions = range(len(type_importance))

plt.figure(figsize=(10, 6))
plt.bar(x_positions, type_importance.values, alpha=0.7)
plt.xticks(x_positions, type_importance.index, rotation=45, ha='right')
plt.xlabel("Tipo de Feature", fontsize=12)
plt.ylabel("Importancia Total (Gain)", fontsize=12)
plt.title("Importancia Total por Tipo de Feature", fontsize=14, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


### Guardar Resultados


In [None]:
# Persist the full importance table next to the experiment artefacts.
output_path = os.path.join(experiment_path, "feature_importance_analysis.csv")
importances.to_csv(output_path, index=False)
print(f"\nTabla de importancias guardada en: {output_path}")

# Build the plain-text summary before opening the file, so a failure while
# formatting does not leave a truncated file behind.
summary_path = os.path.join(experiment_path, "feature_importance_summary.txt")
top_20_table = importances[["feature", "importance_mean", "importance_std", "importance_cv"]].head(20).to_string()
summary_text = "".join([
    "Feature Importance Analysis\n",
    f"Experimento: {config['experiment']['name']}\n",
    f"Seeds utilizadas: {len(seeds)}\n\n",
    "Top 20 Features:\n",
    top_20_table,
    "\n\nResumen por tipo de feature:\n",
    type_summary.to_string(),
])

# An explicit encoding keeps the output identical across platforms; the
# default depends on the locale and may fail on non-ASCII experiment or
# feature names.
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(summary_text)

print(f"Resumen guardado en: {summary_path}")
