## 1. Configuration

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import requests

# Modules du projet
from analyses.machine_learning import MachineLearningAnalysis
from analyses.data_cleaning import DataCleaning

# Apply the seaborn dark-grid matplotlib style to the figures drawn below.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Base URL of the accidents API queried by load_and_prepare_data().
API_BASE_URL = "http://localhost:8000"

## 2. Chargement et Préparation des Données

In [None]:
def load_and_prepare_data():
    """Load the accident records, preferring the API and falling back to CSV.

    The API is queried at ``{API_BASE_URL}/accidents?limit=10000``. If the
    request fails, times out, returns an HTTP error status, or yields a payload
    that cannot be converted into a DataFrame, the cleaned CSV export is read
    instead and the reason for the fallback is printed.

    Returns:
        pandas.DataFrame: the accident records from the API or the CSV.

    Raises:
        Any exception raised by ``pd.read_csv`` when the fallback file cannot
        be read either (e.g. ``FileNotFoundError``).
    """
    try:
        # A timeout keeps the cell from hanging forever when the API is down.
        response = requests.get(
            f"{API_BASE_URL}/accidents?limit=10000",
            timeout=30,
        )
        # Treat 4xx/5xx answers as failures instead of parsing an error body.
        response.raise_for_status()
        df = pd.DataFrame(response.json())
        print(f"Données chargées via API: {len(df)} accidents")
    except (requests.RequestException, ValueError, TypeError) as exc:
        # Network/HTTP problems, invalid JSON, or a payload with an unexpected
        # shape: report why and fall back to the local CSV.
        print(f"[ATTENTION] API indisponible ({exc}); repli sur le CSV")
        df = pd.read_csv('../data/clean/accidents_clean.csv')
        print(f"Données chargées depuis CSV: {len(df)} accidents")

    return df

# Load the dataset once; the following cells read from `df`.
df = load_and_prepare_data()

# Report the dimensions after a blank line, then let the notebook render
# the first rows as the cell output.
print()
print(f"Shape: {df.shape}")
df.head()

## 3. Définition de la Variable Cible

In [None]:
# Target variable: binary severity (serious vs. not serious).
if 'grav' in df.columns:
    # The threshold assumes the coding 3 = hospitalised, 4 = killed.
    # NOTE(review): the raw French BAAC files code `grav` as 1 = unharmed,
    # 2 = killed, 3 = hospitalised, 4 = lightly injured. Confirm that the
    # cleaning step recodes severity, otherwise this threshold is wrong.
    df['is_grave'] = (df['grav'] >= 3).astype(int)
    target = 'is_grave'

    class_counts = df[target].value_counts()
    print("Distribution de la cible:")
    print(class_counts)
    print(f"\nProportion d'accidents graves: {df[target].mean()*100:.1f}%")

    # value_counts() orders classes by frequency, so the bar order (and with
    # it the colour assigned to each class) would depend on which class is
    # the majority. Pin the order to [0, 1] so steelblue is always
    # "not serious" and darkred is always "serious"; a missing class is
    # drawn as an empty bar.
    ordered_counts = class_counts.reindex([0, 1], fill_value=0)

    # Visualisation
    plt.figure(figsize=(8, 5))
    ordered_counts.plot(kind='bar', color=['steelblue', 'darkred'])
    plt.title('Distribution de la Variable Cible')
    plt.xlabel('Accident Grave (0=Non, 1=Oui)')
    plt.ylabel('Nombre d\'accidents')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
else:
    # Without the severity column no target can be built; later cells need it.
    print("[ATTENTION] Colonne 'grav' non trouvée")
    target = None

## 4. Sélection des Features

In [None]:
# Stop with an explicit message when the target could not be defined,
# instead of failing further down with an opaque `KeyError: None`.
if target is None:
    raise ValueError(
        "Variable cible non définie: la colonne 'grav' est absente des données"
    )

# Candidate features: every numeric column except the raw severity, the
# derived target (both would leak the label) and the accident identifier.
excluded_cols = {'grav', 'is_grave', 'Num_Acc'}
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
features = [col for col in numeric_cols if col not in excluded_cols]

print(f"Features sélectionnées ({len(features)}):")
for i, feat in enumerate(features, 1):
    print(f"  {i}. {feat}")

# Build the design matrix and the label vector; missing values become 0.
X = df[features].fillna(0)
y = df[target]

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

## 5. Split Train/Test

In [None]:
# Stratified 80/20 split with a fixed seed, so both subsets keep the class
# proportions of `y` and the split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

banner = "=" * 60
print(banner)
print("SPLIT TRAIN/TEST")
print(banner)

# Size of each subset, with its share of the full dataset.
n_train = len(X_train)
n_test = len(X_test)
print(f"Train set: {n_train} samples ({n_train/len(X)*100:.1f}%)")
print(f"Test set: {n_test} samples ({n_test/len(X)*100:.1f}%)")

# Class counts per subset.
train_distribution = y_train.value_counts().to_dict()
test_distribution = y_test.value_counts().to_dict()
print(f"\nDistribution dans train: {train_distribution}")
print(f"Distribution dans test: {test_distribution}")

## 6. Entraînement du Modèle Random Forest

In [None]:
# Wrap the full DataFrame in the project's ML helper.
ml = MachineLearningAnalysis(df)

# Train the random forest through the helper.
# NOTE(review): the helper receives `df` and its own `test_size`, not the
# X_train/X_test built in the previous cell, so it presumably performs its
# own internal split. Confirm against MachineLearningAnalysis.
print("Entraînement du modèle Random Forest...\n")
rf_options = {
    'target_column': target,
    'feature_columns': features,
    'n_estimators': 100,
    'test_size': 0.2,
}
results = ml.random_forest_classifier(**rf_options)

# Headline metrics returned by the helper.
rule = "=" * 60
print(rule)
print("RÉSULTATS DU MODÈLE")
print(rule)
print(f"Accuracy: {results['accuracy']:.4f}")
print("\nRapport de classification:")
print(results['classification_report'])

## 7. Feature Importance

In [None]:
# Keep the ten largest importances reported by the training step.
feature_importance = results['feature_importance']
top_features = feature_importance.nlargest(10)

rule = "=" * 60
print(rule)
print("TOP 10 FEATURES IMPORTANTES")
print(rule)
for name, score in top_features.items():
    print(f"{name:20s}: {score:.4f}")

# Horizontal bar chart; sorting ascending puts the most important
# feature at the top of the plot.
plt.figure(figsize=(12, 6))
ascending_top = top_features.sort_values()
ascending_top.plot.barh(color='steelblue')
plt.title('Top 10 Features les Plus Importantes', fontsize=14, fontweight='bold')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## 8. Matrice de Confusion

In [None]:
# Confusion matrix from the training step, coerced to an array so that
# `.ravel()` below also works if a nested list was returned.
cm = np.asarray(results['confusion_matrix'])

# Heatmap: rows are the true classes, columns the predicted classes.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non Grave', 'Grave'],
            yticklabels=['Non Grave', 'Grave'])
plt.title('Matrice de Confusion', fontsize=14, fontweight='bold')
plt.ylabel('Vraie Classe')
plt.xlabel('Classe Prédite')
plt.tight_layout()
plt.show()

# Derive the metrics from the matrix. Unpacking a binary (2x2) matrix in
# row-major order gives tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
# Guard each ratio against an empty denominator (no positive predictions
# or no positive samples).
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Blank line followed by a 60-character rule. The previous "\n=" * 60
# repeated the newline too and printed 60 one-character lines.
print("\n" + "=" * 60)
print("MÉTRIQUES DÉTAILLÉES")
print("=" * 60)
print(f"True Negatives:  {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives:  {tp}")
print(f"\nPrecision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

## 9. Prédictions sur Nouveaux Cas

In [None]:
# Example predictions on rows taken from the test split.
model = results['model']

# NOTE(review): `model` was trained through MachineLearningAnalysis, not on
# the X_train built in this notebook, so rows of X_test are not guaranteed
# to be unseen by the model. Confirm before reading these as held-out results.

# Draw up to 5 distinct rows; capping the size avoids a ValueError from
# sampling without replacement when the test set has fewer than 5 rows.
n_examples = min(5, len(X_test))
sample_indices = np.random.choice(len(X_test), n_examples, replace=False)
X_sample = X_test.iloc[sample_indices]
y_sample = y_test.iloc[sample_indices]

# Predicted labels and per-class probabilities.
predictions = model.predict(X_sample)
probas = model.predict_proba(X_sample)

# predict_proba columns follow the order of model.classes_, so look the
# column up by label instead of assuming the label equals the column index.
class_column = {label: idx for idx, label in enumerate(model.classes_)}

print("=" * 60)
print("EXEMPLES DE PRÉDICTIONS")
print("=" * 60)

for i, (pred, true, proba) in enumerate(zip(predictions, y_sample, probas), 1):
    status = "[CORRECT]" if pred == true else "[INCORRECT]"
    confidence = proba[class_column[pred]] * 100
    print(f"\nExemple {i}:")
    print(f"  Prédiction: {'Grave' if pred == 1 else 'Non Grave'} (confiance: {confidence:.1f}%)")
    print(f"  Réalité:    {'Grave' if true == 1 else 'Non Grave'}")
    print(f"  Résultat:   {status}")

## 10. Sauvegarde du Modèle

In [None]:
import joblib
from pathlib import Path

# Create the models directory (and any missing parents) if needed.
models_dir = Path('../data/models')
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the trained model.
model_path = models_dir / 'random_forest_gravity.pkl'
joblib.dump(model, model_path)
print(f"Modèle sauvegardé: {model_path}")

# Persist the feature names, one per line. The encoding is set explicitly
# so the file does not depend on the platform's default locale encoding.
features_path = models_dir / 'features.txt'
with open(features_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(features))
print(f"Features sauvegardées: {features_path}")

## 11. Résumé Final

In [None]:
# Recap of the modelling run: data sizes, accuracy and leading features.
rule = "=" * 60
print(rule)
print("RÉSUMÉ DU MODÈLE ML")
print(rule)
print("Modèle: Random Forest Classifier")
print(f"Features: {len(features)}")
print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

accuracy_pct = results['accuracy'] * 100
print(f"Accuracy: {accuracy_pct:.2f}%")

# First three entries of the top-10 importances computed earlier.
print("\nTop 3 features:")
top_three = top_features.head(3)
for rank, (name, score) in enumerate(top_three.items(), start=1):
    print(f"  {rank}. {name}: {score:.4f}")

print("\nModélisation ML terminée!")
print("\nProchain notebook: 04_visualizations.ipynb")