In [None]:
# Imports
import sys
from pathlib import Path

import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from lime.lime_tabular import LimeTabularExplainer

# Plot styling: seaborn white-grid theme, then a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation message once every import above has succeeded
print("‚úÖ Imports r√©ussis")

## 1. Chargement du modèle et du dataset

In [None]:
# Artifact locations, resolved against the current working directory
BASE_DIR = Path().resolve()
MODEL_PATH = BASE_DIR.joinpath("future_skills_model.pkl")
DATASET_PATH = BASE_DIR.joinpath("future_skills_dataset.csv")

# Fail fast if an artifact is missing (model is checked first, then dataset)
_artifacts = (("Mod√®le", MODEL_PATH), ("Dataset", DATASET_PATH))
for _label, _artifact in _artifacts:
    if not _artifact.exists():
        raise FileNotFoundError(f"{_label} introuvable: {_artifact}")

# Echo the resolved locations
for _label, _artifact in _artifacts:
    print(f"üìÇ {_label}: {_artifact}")

In [None]:
# Load the serialized pipeline.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
pipeline = joblib.load(MODEL_PATH)
pipeline_type = type(pipeline).__name__
step_names = list(pipeline.named_steps)
print(f"‚úÖ Mod√®le charg√©: {pipeline_type}")
print(f"   √âtapes: {step_names}")

# Read the dataset and report its dimensions and column names
df = pd.read_csv(DATASET_PATH)
n_rows, n_cols = df.shape
print(f"\n‚úÖ Dataset charg√©: {n_rows} lignes, {n_cols} colonnes")
print(f"   Colonnes: {list(df.columns)}")

In [None]:
# Keep only the rows whose target is one of the recognised levels
ALLOWED_LEVELS = {"LOW", "MEDIUM", "HIGH"}
valid_level_mask = df["future_need_level"].isin(ALLOWED_LEVELS)
df = df.loc[valid_level_mask].copy()

# Class distribution: absolute counts, then percentages rounded to 2 decimals
level_counts = df["future_need_level"].value_counts()
level_shares = df["future_need_level"].value_counts(normalize=True)

print("üìä R√©partition des classes:")
print(level_counts)
print("\nüìä Pourcentages:")
print((level_shares * 100).round(2))

## 2. Préparation des données pour l'explainability

In [None]:
# Candidate model inputs; only the ones actually present in the dataset are kept,
# in the order listed here.
target_col = "future_need_level"
potential_features = [
    "job_role_name",
    "skill_name",
    "skill_category",
    "job_department",
    "trend_score",
    "internal_usage",
    "training_requests",
    "scarcity_index",
    "hiring_difficulty",
    "avg_salary_k",
    "economic_indicator",
]

feature_cols = []
for candidate in potential_features:
    if candidate in df.columns:
        feature_cols.append(candidate)

print(f"‚úÖ Features utilis√©es ({len(feature_cols)}):")
for kept_feature in feature_cols:
    print(f"   - {kept_feature}")

# Independent copies of the feature matrix and the target vector
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target_col].copy()

In [None]:
# Split the selected features into categorical (text-like) and numeric groups.
# A column is treated as categorical when its dtype is object, category, or any
# pandas string dtype. The string-dtype check matters because text columns are not
# always stored as `object` (e.g. StringDtype / pyarrow-backed strings); without it
# such columns would be misclassified as numeric. Everything else is numeric.
categorical_features = []
numeric_features = []

for col in feature_cols:
    col_dtype = df[col].dtype
    is_text_like = (
        col_dtype == 'object'
        or col_dtype.name == 'category'
        or pd.api.types.is_string_dtype(col_dtype)
    )
    if is_text_like:
        categorical_features.append(col)
    else:
        numeric_features.append(col)

print(f"üìã Features cat√©gorielles: {categorical_features}")
print(f"üìã Features num√©riques: {numeric_features}")

## 3. Sélection d'exemples représentatifs

In [None]:
# Draw up to two HIGH rows and up to two MEDIUM rows (fixed seed for reproducibility)
_sampled_by_level = {}
for _level in ("HIGH", "MEDIUM"):
    _level_rows = df[df["future_need_level"] == _level]
    _sampled_by_level[_level] = _level_rows.sample(n=min(2, len(_level_rows)), random_state=42)

high_examples = _sampled_by_level["HIGH"]
medium_examples = _sampled_by_level["MEDIUM"]
examples = pd.concat([high_examples, medium_examples])

# Print a short card for every selected row
_rule = "=" * 80
print("üéØ Exemples s√©lectionn√©s pour l'analyse:")
print("\n" + _rule)
for idx, row in examples.iterrows():
    print(f"\nüìå Exemple {idx}:")
    _card = (
        ("Job Role", row.get('job_role_name', 'N/A')),
        ("Skill", row.get('skill_name', 'N/A')),
        ("Level", row['future_need_level']),
        ("Trend Score", row.get('trend_score', 'N/A')),
        ("Internal Usage", row.get('internal_usage', 'N/A')),
        ("Scarcity Index", row.get('scarcity_index', 'N/A')),
    )
    for _caption, _value in _card:
        print(f"   {_caption}: {_value}")
print("\n" + _rule)

## 4. SHAP Analysis

SHAP (SHapley Additive exPlanations) est basé sur la théorie des jeux et fournit des explications cohérentes et interprétables.

In [None]:
# Prepare the transformed inputs used by SHAP and LIME.
# The pipeline is expected to expose a 'preprocess' step and a 'clf' step.
preprocessor = pipeline.named_steps['preprocess']
clf = pipeline.named_steps['clf']


def _to_dense(matrix):
    """Return `matrix` as a dense numpy array.

    The preprocessor may return a scipy sparse matrix (typical with one-hot
    encoding). The following cells call len(), index single rows and pass the
    data to LIME, all of which need a dense 2-D array. Dense input is returned
    unchanged (np.asarray does not copy an ndarray).
    NOTE(review): densifying assumes the dataset fits comfortably in memory.
    """
    if hasattr(matrix, "toarray"):
        return matrix.toarray()
    return np.asarray(matrix)


# Transform the full feature matrix
X_transformed = _to_dense(preprocessor.transform(X))

# Transform the selected examples
X_examples = examples[feature_cols]
X_examples_transformed = _to_dense(preprocessor.transform(X_examples))

print(f"‚úÖ Donn√©es transform√©es: {X_transformed.shape}")
print(f"‚úÖ Exemples transform√©s: {X_examples_transformed.shape}")

In [None]:
# Build a SHAP TreeExplainer for the classifier.
# NOTE(review): assumes `clf` is a tree-based model (the notebook mentions RandomForest) — confirm.
explainer = shap.TreeExplainer(clf)

# SHAP values for the selected examples
shap_values = explainer.shap_values(X_examples_transformed)

# Normalise the return format. Depending on the SHAP release, a multi-class model
# yields either a list with one (n_samples, n_features) array per class, or a single
# (n_samples, n_features, n_classes) array. The rest of the notebook branches on
# `isinstance(shap_values, list)`, so the 3-D form is converted to the per-class list.
if not isinstance(shap_values, list) and np.ndim(shap_values) == 3:
    shap_values = [shap_values[:, :, k] for k in range(np.shape(shap_values)[2])]

print(f"‚úÖ SHAP values calcul√©es")
print(f"   Shape: {shap_values[0].shape if isinstance(shap_values, list) else shap_values.shape}")
print(f"   Classes: {clf.classes_}")

In [None]:
# R√©cup√©rer les noms des features apr√®s transformation
feature_names = []

# Features cat√©gorielles (OneHot encoded)
if categorical_features:
    cat_transformer = preprocessor.named_transformers_['cat']
    if hasattr(cat_transformer, 'get_feature_names_out'):
        cat_names = cat_transformer.get_feature_names_out(categorical_features)
        feature_names.extend(cat_names)

# Features num√©riques
feature_names.extend(numeric_features)

print(f"‚úÖ {len(feature_names)} feature names apr√®s transformation")
print(f"   Premi√®res 10: {feature_names[:10]}")

In [None]:
# SHAP force plots: one per selected example, for the class the model predicts
shap.initjs()

_rule = '=' * 80
_values_per_class = isinstance(shap_values, list)

for i, (idx, row) in enumerate(examples.iterrows()):
    _row_batch = X_examples_transformed[i:i+1]

    print(f"\n{_rule}")
    print(f"üìä SHAP Force Plot - Exemple {i+1}")
    print(f"   Job Role: {row.get('job_role_name', 'N/A')}")
    print(f"   Skill: {row.get('skill_name', 'N/A')}")
    print(f"   True Level: {row['future_need_level']}")
    print(f"{_rule}\n")

    # Model prediction and per-class probabilities
    pred = clf.predict(_row_batch)[0]
    pred_proba = clf.predict_proba(_row_batch)[0]

    print(f"   Predicted: {pred}")
    for cls, _proba in zip(clf.classes_, pred_proba):
        print(f"   P({cls}): {_proba:.3f}")

    # Pick the base value / contributions of the predicted class
    predicted_class_idx = np.argmax(pred_proba)
    if _values_per_class:
        # One SHAP matrix (and one expected value) per class
        _base_value = explainer.expected_value[predicted_class_idx]
        _contributions = shap_values[predicted_class_idx][i]
    else:
        # Single SHAP matrix
        _base_value = explainer.expected_value
        _contributions = shap_values[i]

    shap_display = shap.force_plot(
        _base_value,
        _contributions,
        X_examples_transformed[i],
        feature_names=feature_names
    )

    display(shap_display)

In [None]:
# Waterfall plots: one per selected example, explaining the predicted class
for i, (idx, row) in enumerate(examples.iterrows()):
    print(f"\n{'='*80}")
    print(f"üìä SHAP Waterfall Plot - Exemple {i+1}: {row.get('job_role_name', 'N/A')} √ó {row.get('skill_name', 'N/A')}")
    print(f"{'='*80}\n")

    pred_proba = clf.predict_proba(X_examples_transformed[i:i+1])[0]
    predicted_class_idx = np.argmax(pred_proba)

    # Wrap the raw values in an Explanation object, which is what waterfall_plot expects
    if isinstance(shap_values, list):
        # One SHAP matrix (and one expected value) per class
        shap_obj = shap.Explanation(
            values=shap_values[predicted_class_idx][i],
            base_values=explainer.expected_value[predicted_class_idx],
            data=X_examples_transformed[i],
            feature_names=feature_names
        )
    else:
        shap_obj = shap.Explanation(
            values=shap_values[i],
            base_values=explainer.expected_value,
            data=X_examples_transformed[i],
            feature_names=feature_names
        )

    plt.figure(figsize=(10, 6))
    # show=False: by default the plot is rendered immediately, so the tight_layout()
    # and show() calls below would act on a fresh, empty figure instead of the waterfall.
    shap.waterfall_plot(shap_obj, max_display=10, show=False)
    plt.tight_layout()
    plt.show()

In [None]:
# Summary plots over the selected examples: one per class when SHAP values are
# stored per class, otherwise a single untitled plot.
_rule = "=" * 80
print("\n" + _rule)
print("üìä SHAP Summary Plot - Vue d'ensemble")
print(_rule + "\n")


def _draw_example_summary(values):
    """Open a 10x6 figure and draw the SHAP summary plot for `values` without showing it."""
    plt.figure(figsize=(10, 6))
    shap.summary_plot(
        values,
        X_examples_transformed,
        feature_names=feature_names,
        show=False,
        max_display=10
    )


if not isinstance(shap_values, list):
    _draw_example_summary(shap_values)
    plt.tight_layout()
    plt.show()
else:
    for class_idx, class_name in enumerate(clf.classes_):
        print(f"\nüéØ Classe: {class_name}")
        _draw_example_summary(shap_values[class_idx])
        plt.title(f"SHAP Summary - {class_name}")
        plt.tight_layout()
        plt.show()

## 5. LIME Analysis (Alternative)

LIME fournit des explications locales en approximant le modèle par un modèle linéaire simple autour de l'instance à expliquer.

In [None]:
# Build the LIME explainer.
# The first rows of the transformed dataset (at most 1000) serve as the background
# sample from which LIME derives its feature statistics.
sample_size = min(1000, len(X_transformed))
X_sample = X_transformed[:sample_size]

_lime_settings = {
    "training_data": X_sample,
    "feature_names": feature_names,
    "class_names": clf.classes_,
    "mode": 'classification',
    "random_state": 42,
}
lime_explainer = LimeTabularExplainer(**_lime_settings)

for _summary_line in (
    f"‚úÖ LIME Explainer cr√©√©",
    f"   Training samples: {sample_size}",
    f"   Features: {len(feature_names)}",
    f"   Classes: {clf.classes_}",
):
    print(_summary_line)

In [None]:
# Produce a LIME explanation (text + bar chart) for every selected example
_rule = '=' * 80

for i, (idx, row) in enumerate(examples.iterrows()):
    _instance = X_examples_transformed[i]
    _instance_batch = X_examples_transformed[i:i+1]
    _job_role = row.get('job_role_name', 'N/A')
    _skill = row.get('skill_name', 'N/A')

    print(f"\n{_rule}")
    print(f"üìä LIME Explanation - Exemple {i+1}")
    print(f"   Job Role: {_job_role}")
    print(f"   Skill: {_skill}")
    print(f"   True Level: {row['future_need_level']}")
    print(f"{_rule}\n")

    # Fit the local surrogate model around this instance
    exp = lime_explainer.explain_instance(
        data_row=_instance,
        predict_fn=clf.predict_proba,
        num_features=10,
        top_labels=3
    )

    # Model prediction for the instance
    pred = clf.predict(_instance_batch)[0]
    print(f"   Predicted: {pred}")

    # Per-class probabilities
    pred_proba = clf.predict_proba(_instance_batch)[0]
    for cls, _proba in zip(clf.classes_, pred_proba):
        print(f"   P({cls}): {_proba:.3f}")

    # Textual view: strongest contributions towards the predicted class
    print("\n   Top contributing features:")
    predicted_class_idx = np.argmax(pred_proba)
    _contributions = exp.as_list(label=predicted_class_idx)
    for feat, weight in _contributions[:10]:
        print(f"     {feat}: {weight:+.4f}")

    # Graphical view of the same explanation
    fig = exp.as_pyplot_figure(label=predicted_class_idx)
    plt.title(f"LIME - {_job_role} √ó {_skill}")
    plt.tight_layout()
    plt.show()

## 6. Extraction des signaux clés pour explications simplifiées

Mapper les signaux (trend_score, scarcity_index, internal_usage) vers des phrases simples.

In [None]:
def extract_top_features_from_shap(shap_values_array, feature_names, top_n=3):
    """
    Return the `top_n` features with the largest absolute SHAP contribution.

    Args:
        shap_values_array: 1-D sequence/array of SHAP values for a single prediction
            (one value per feature).
        feature_names: sequence of feature names, indexed like `shap_values_array`.
        top_n: maximum number of features to return. Zero or a negative number
            yields an empty list.

    Returns:
        A list of (feature_name, shap_value) tuples ordered by decreasing absolute
        SHAP value. The signed value is returned, not its magnitude.

    Raises:
        ValueError: if `shap_values_array` is not one-dimensional.
    """
    values = np.asarray(shap_values_array)
    if values.ndim != 1:
        raise ValueError(
            f"shap_values_array must be 1-D (one value per feature), got shape {values.shape}"
        )

    # Guard needed because the slice [-0:] below would select every element, not none
    if top_n <= 0:
        return []

    # Rank by magnitude: argsort is ascending, so keep the tail and reverse it
    abs_values = np.abs(values)
    top_indices = np.argsort(abs_values)[-top_n:][::-1]

    return [
        (feature_names[idx], values[idx])
        for idx in top_indices
    ]

def generate_simple_explanation(top_features, prediction_level):
    """
    Build a one-line textual explanation from the top SHAP features.

    Args:
        top_features: iterable of (feature_name, shap_value) tuples.
        prediction_level: predicted level; "HIGH" and "MEDIUM" have dedicated
            prefixes, any other value uses the "low score" prefix.

    Returns:
        The prefix followed by the recognised signals joined with " + ", or a
        generic sentence when no feature name contains a known signal.
    """
    # Known signals and their human-readable labels (matched by substring, first hit wins)
    readable_labels = {
        'trend_score': 'tendance march√©',
        'scarcity_index': 'raret√© interne',
        'internal_usage': 'usage interne actuel',
        'training_requests': 'demandes de formation',
        'hiring_difficulty': 'difficult√© de recrutement',
        'avg_salary_k': 'niveau de salaire',
        'economic_indicator': 'indicateur √©conomique',
    }

    # Sentence opener, chosen from the predicted level
    prefix = (
        "Score √©lev√© car :" if prediction_level == "HIGH"
        else "Score mod√©r√© car :" if prediction_level == "MEDIUM"
        else "Score faible car :"
    )

    fragments = []
    for feat_name, shap_val in top_features:
        lowered_name = feat_name.lower()
        signal = next((key for key in readable_labels if key in lowered_name), None)
        if signal is None:
            # Feature is not one of the mapped signals: leave it out of the sentence
            continue

        # The qualifier depends on the contribution's sign and on whether |value| exceeds 0.2
        is_large = abs(shap_val) > 0.2
        if shap_val > 0:
            qualifier = "forte" if is_large else "mod√©r√©e"
        else:
            qualifier = "faible" if is_large else "limit√©e"
        fragments.append(f"{readable_labels[signal]} {qualifier}")

    if not fragments:
        return f"{prefix} multiple facteurs combin√©s"
    return f"{prefix} {' + '.join(fragments)}"


print("‚úÖ Fonctions d'extraction cr√©√©es")

In [None]:
# Print a simplified, human-readable explanation for every selected example
_rule = "=" * 80
print("\n" + _rule)
print("üìù EXPLICATIONS SIMPLIFI√âES")
print(_rule + "\n")

_values_per_class = isinstance(shap_values, list)

for i, (idx, row) in enumerate(examples.iterrows()):
    pred_proba = clf.predict_proba(X_examples_transformed[i:i+1])[0]
    predicted_class_idx = np.argmax(pred_proba)
    predicted_level = clf.classes_[predicted_class_idx]

    # SHAP contributions of this example towards its predicted class
    shap_vals = shap_values[predicted_class_idx][i] if _values_per_class else shap_values[i]

    # Three strongest features, then the sentence derived from them
    top_features = extract_top_features_from_shap(shap_vals, feature_names, top_n=3)
    explanation = generate_simple_explanation(top_features, predicted_level)

    _confidence = pred_proba[predicted_class_idx]
    print(f"\nüìå Exemple {i+1}:")
    print(f"   Job Role: {row.get('job_role_name', 'N/A')}")
    print(f"   Skill: {row.get('skill_name', 'N/A')}")
    print(f"   Niveau pr√©dit: {predicted_level} ({_confidence:.1%})")
    print(f"\n   üí° EXPLICATION:")
    print(f"   {explanation}")
    print(f"\n   üîç D√©tails:")
    for feat, val in top_features:
        print(f"     ‚Ä¢ {feat}: {val:+.4f}")
    print()

## 7. Analyse globale: Features les plus importantes

Identifier quelles features sont généralement les plus influentes pour toutes les prédictions.

In [None]:
# Compute SHAP values on a larger sample: the first rows of the transformed data, at most 500
sample_for_global = min(500, len(X_transformed))
X_global_sample = X_transformed[:sample_for_global]

print(f"Calcul des SHAP values pour {sample_for_global} exemples...")
shap_values_global = explainer.shap_values(X_global_sample)

# Normalise the return format. Depending on the SHAP release, a multi-class model
# yields either a list of per-class (n_samples, n_features) arrays or a single
# (n_samples, n_features, n_classes) array. The plotting cell branches on
# `isinstance(shap_values_global, list)`, so the 3-D form is converted to the list form.
if not isinstance(shap_values_global, list) and np.ndim(shap_values_global) == 3:
    shap_values_global = [
        shap_values_global[:, :, k] for k in range(np.shape(shap_values_global)[2])
    ]
print("‚úÖ Calcul termin√©")

In [None]:
# Bar charts of mean feature importance: one per class when SHAP values are stored
# per class, otherwise a single global chart.
_rule = "=" * 80
print("\n" + _rule)
print("üìä Feature Importance Globale (SHAP)")
print(_rule + "\n")


def _draw_global_bar(values, title):
    """Draw, title and show the SHAP bar summary of `values` over the global sample."""
    plt.figure(figsize=(10, 6))
    shap.summary_plot(
        values,
        X_global_sample,
        feature_names=feature_names,
        plot_type="bar",
        show=False,
        max_display=10
    )
    plt.title(title)
    plt.tight_layout()
    plt.show()


if not isinstance(shap_values_global, list):
    _draw_global_bar(shap_values_global, "Top 10 Features Globales")
else:
    for class_idx, class_name in enumerate(clf.classes_):
        print(f"\nüéØ Classe: {class_name}")
        _draw_global_bar(shap_values_global[class_idx], f"Top 10 Features - {class_name}")

## 8. Conclusions et recommandations

### Insights clés:

1. **Features les plus influentes**: Les analyses SHAP/LIME révèlent quelles caractéristiques (trend_score, scarcity_index, etc.) contribuent le plus aux prédictions HIGH vs LOW.

2. **Patterns par classe**: 
   - **HIGH**: Généralement associé à une forte tendance marché + rareté interne élevée
   - **MEDIUM**: Mix équilibré de signaux
   - **LOW**: Tendances faibles ou usage interne déjà satisfaisant

3. **Explications simplifiées**: Les explications textuelles peuvent être générées automatiquement et stockées avec les prédictions.

### Prochaines étapes:

- ✅ Créer un module `explanation_engine.py` pour automatiser la génération d'explications
- ✅ Ajouter un champ `explanation` au modèle `FutureSkillPrediction`
- ✅ Intégrer l'explication dans l'API (optionnel via paramètre `?include_explanation=true`)
- ✅ Documenter l'UI future pour afficher "Pourquoi cette compétence ?"

In [None]:
# Closing banner summarising what the notebook demonstrated
_rule = "=" * 80
_closing_lines = (
    "\n" + _rule,
    "‚úÖ ANALYSE D'EXPLICABILIT√â TERMIN√âE",
    _rule,
    "\nLes visualisations et explications ci-dessus d√©montrent:",
    "  1. Comment SHAP identifie les features cl√©s pour chaque pr√©diction",
    "  2. Comment LIME fournit des explications locales alternatives",
    "  3. Comment traduire les SHAP values en explications textuelles simples",
    "\nCes analyses peuvent √™tre int√©gr√©es dans l'application pour fournir",
    "de la transparence aux utilisateurs RH sur les recommandations ML.",
    _rule,
)
for _line in _closing_lines:
    print(_line)