<a href="https://colab.research.google.com/github/Maria-lin/How-to-make-notebook-in-dataiku/blob/main/Detection_Anomalies_DAB_Professionnel_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏦 Détection d'Anomalies sur les Distributeurs Automatiques (DAB)
## Analyse Senior - Approche Multi-Algorithmes

---

### 📋 Objectifs
- Détecter les comportements anormaux des DAB (fraude, dysfonctionnement, usage atypique)
- Approche non-supervisée avec validation croisée multi-algorithmes
- Interprétabilité et recommandations actionnables

### 🎯 Méthodologie
1. Audit qualité et EDA approfondie
2. Feature engineering métier
3. Détection multi-algorithmes (IF + LOF + One-Class SVM)
4. Consensus et scoring
5. Interprétabilité et validation
6. Export et recommandations

In [None]:
# ============================================================================
# IMPORTS ET CONFIGURATION
# ============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime

# ML & Stats
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA
from scipy import stats

# Global configuration
# Silences every warning category for the whole session (this includes pandas
# and scikit-learn deprecation/future warnings).
warnings.filterwarnings('ignore')
# Show all DataFrame columns; render floats with thousands separators and 2 decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# Matplotlib release — confirm against the version installed on the platform.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Single seed passed to the estimators that accept random_state; it also seeds
# NumPy's legacy global random generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("="*80)
print(" "*25 + "‚úÖ ENVIRONNEMENT CONFIGUR√â")
print("="*80)
print(f"üìÖ Date : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üé≤ Random State : {RANDOM_STATE}")

---
# 📥 SECTION 1 : Chargement des Données

⚠️ **ACTION REQUISE** : Modifiez `DATASET_NAME` avec le nom de votre dataset Dataiku

In [None]:
import dataiku

# üîß PARAM√àTRE √Ä CONFIGURER
DATASET_NAME = "VOTRE_DATASET_DAB"  # ‚ö†Ô∏è MODIFIER ICI

try:
    dataset = dataiku.Dataset(DATASET_NAME)
    df = dataset.get_dataframe()

    print(f"‚úÖ Dataset charg√© : {DATASET_NAME}")
    print(f"üìä Dimensions : {df.shape[0]:,} lignes √ó {df.shape[1]} colonnes")
    print(f"üíæ M√©moire : {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n")

    display(df.head(10))

except Exception as e:
    print(f"‚ùå Erreur : {e}")
    print("\nüí° V√©rifiez :")
    print("   1. Le nom du dataset est correct")
    print("   2. Vous avez les permissions")
    raise

---
# 🔍 SECTION 2 : Audit Qualité Complet

In [None]:
print("="*80)
print(" "*25 + "üìä AUDIT QUALIT√â")
print("="*80)

# 2.1 Structure
print("\nüîé Informations g√©n√©rales :")
df.info()

# 2.2 Valeurs manquantes
print("\nüï≥Ô∏è Valeurs manquantes :")
missing = pd.DataFrame({
    'nb_missing': df.isnull().sum(),
    'pct_missing': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('pct_missing', ascending=False)
missing = missing[missing['nb_missing'] > 0]

if len(missing) > 0:
    display(missing)
else:
    print("‚úÖ Aucune valeur manquante")

# 2.3 Doublons
dup_rows = df.duplicated().sum()
print(f"\nüîÑ Doublons de lignes : {dup_rows:,}")
if 'num_automate' in df.columns:
    dup_id = df.duplicated(subset=['num_automate']).sum()
    print(f"üîÑ Doublons num_automate : {dup_id:,}")

# 2.4 Incoh√©rences
print("\n‚öôÔ∏è Contr√¥les de coh√©rence :")
checks = {}

if 'montant_total' in df.columns:
    checks['montant_negatif'] = (df['montant_total'] < 0).sum()
    checks['montant_zero'] = (df['montant_total'] == 0).sum()

if 'nb_total_de_retraits' in df.columns:
    checks['nb_negatif'] = (df['nb_total_de_retraits'] < 0).sum()
    checks['nb_zero'] = (df['nb_total_de_retraits'] == 0).sum()

if 'montant_total' in df.columns and 'nb_total_de_retraits' in df.columns:
    checks['montant_pos_nb_zero'] = ((df['montant_total'] > 0) & (df['nb_total_de_retraits'] == 0)).sum()
    checks['montant_zero_nb_pos'] = ((df['montant_total'] == 0) & (df['nb_total_de_retraits'] > 0)).sum()

checks_df = pd.DataFrame.from_dict(checks, orient='index', columns=['Nombre'])
checks_df['Pourcentage'] = (checks_df['Nombre'] / len(df) * 100).round(2)
display(checks_df.sort_values('Nombre', ascending=False))

---
# 📊 SECTION 3 : EDA - Analyse Exploratoire

In [None]:
# 3.1 Variables cat√©gorielles
cat_cols = [c for c in ['lib_site_implementation','type_carte','dab_hos_site','typ_gab'] if c in df.columns]

if cat_cols:
    print("üè∑Ô∏è Variables cat√©gorielles :\n")
    for col in cat_cols:
        print(f"\n{col} - {df[col].nunique()} modalit√©s")
        display(df[col].value_counts().head(10))

In [None]:
# 3.2 Variables num√©riques
num_cols = [c for c in ['montant_total','nb_total_de_retraits'] if c in df.columns]

if num_cols:
    print("üî¢ Statistiques num√©riques :\n")
    stats_df = df[num_cols].describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]).T
    stats_df['skewness'] = df[num_cols].skew()
    stats_df['kurtosis'] = df[num_cols].kurtosis()
    display(stats_df)

    # Visualisations
    fig, axes = plt.subplots(len(num_cols), 3, figsize=(15, 5*len(num_cols)))
    if len(num_cols) == 1:
        axes = axes.reshape(1, -1)

    for idx, col in enumerate(num_cols):
        data = df[col].dropna()

        # Histogramme
        axes[idx,0].hist(data, bins=50, edgecolor='black', alpha=0.7)
        axes[idx,0].axvline(data.median(), color='red', linestyle='--', label=f'M√©diane: {data.median():,.0f}')
        axes[idx,0].set_title(f'Distribution - {col}')
        axes[idx,0].legend()

        # Boxplot
        axes[idx,1].boxplot(data, vert=True)
        axes[idx,1].set_title(f'Boxplot - {col}')

        # Q-Q plot
        stats.probplot(data, dist="norm", plot=axes[idx,2])
        axes[idx,2].set_title(f'Q-Q Plot - {col}')

    plt.tight_layout()
    plt.show()

In [None]:
# 3.3 Relation montant vs nombre de retraits
if 'montant_total' in df.columns and 'nb_total_de_retraits' in df.columns:
    plt.figure(figsize=(10, 6))

    if 'dab_hos_site' in df.columns:
        for cat in df['dab_hos_site'].dropna().unique():
            mask = df['dab_hos_site'] == cat
            plt.scatter(df.loc[mask, 'nb_total_de_retraits'],
                       df.loc[mask, 'montant_total'],
                       alpha=0.5, label=cat)
        plt.legend(title='Type DAB')
    else:
        plt.scatter(df['nb_total_de_retraits'], df['montant_total'], alpha=0.5)

    plt.xlabel('Nombre de retraits')
    plt.ylabel('Montant total (‚Ç¨)')
    plt.title('Relation Montant - Volume')
    plt.grid(alpha=0.3)
    plt.show()

    corr = df[['montant_total','nb_total_de_retraits']].corr().iloc[0,1]
    print(f"\nüìä Corr√©lation : {corr:.3f}")

---
# 🔨 SECTION 4 : Feature Engineering Métier

In [None]:
print("="*80)
print(" "*20 + "üî® CR√âATION DE FEATURES M√âTIER")
print("="*80)

df_enriched = df.copy()
features_created = []

# Feature 1: Montant moyen par retrait
if 'montant_total' in df.columns and 'nb_total_de_retraits' in df.columns:
    df_enriched['montant_moyen_par_retrait'] = np.where(
        df_enriched['nb_total_de_retraits'] > 0,
        df_enriched['montant_total'] / df_enriched['nb_total_de_retraits'],
        np.nan
    )
    features_created.append('montant_moyen_par_retrait')
    print("‚úÖ montant_moyen_par_retrait = montant_total / nb_retraits")

# Feature 2: Ratio vs m√©diane (montant)
if 'montant_total' in df.columns:
    median_mt = df_enriched['montant_total'].median()
    df_enriched['ratio_montant_vs_median'] = df_enriched['montant_total'] / median_mt
    features_created.append('ratio_montant_vs_median')
    print(f"‚úÖ ratio_montant_vs_median = montant / m√©diane ({median_mt:,.2f})")

# Feature 3: Ratio vs m√©diane (nombre)
if 'nb_total_de_retraits' in df.columns:
    median_nb = df_enriched['nb_total_de_retraits'].median()
    df_enriched['ratio_nb_vs_median'] = df_enriched['nb_total_de_retraits'] / median_nb
    features_created.append('ratio_nb_vs_median')
    print(f"‚úÖ ratio_nb_vs_median = nb_retraits / m√©diane ({median_nb:,.0f})")

# Feature 4: Montant attendu vs observ√©
if 'montant_moyen_par_retrait' in df_enriched.columns:
    median_moy = df_enriched['montant_moyen_par_retrait'].median()
    df_enriched['montant_attendu'] = df_enriched['nb_total_de_retraits'] * median_moy
    df_enriched['ratio_observe_vs_attendu'] = np.where(
        df_enriched['montant_attendu'] > 0,
        df_enriched['montant_total'] / df_enriched['montant_attendu'],
        np.nan
    )
    features_created.extend(['montant_attendu', 'ratio_observe_vs_attendu'])
    print("‚úÖ ratio_observe_vs_attendu = montant_observ√© / montant_attendu")

# Feature 5-6: Transformations log
if 'montant_total' in df.columns:
    df_enriched['log_montant'] = np.log1p(df_enriched['montant_total'])
    features_created.append('log_montant')
    print("‚úÖ log_montant = log(1 + montant)")

if 'nb_total_de_retraits' in df.columns:
    df_enriched['log_nb_retraits'] = np.log1p(df_enriched['nb_total_de_retraits'])
    features_created.append('log_nb_retraits')
    print("‚úÖ log_nb_retraits = log(1 + nb_retraits)")

# Feature 7: Z-score
if 'montant_total' in df.columns:
    mean_mt = df_enriched['montant_total'].mean()
    std_mt = df_enriched['montant_total'].std()
    if std_mt > 0:
        df_enriched['zscore_montant'] = (df_enriched['montant_total'] - mean_mt) / std_mt
        features_created.append('zscore_montant')
        print("‚úÖ zscore_montant = (montant - moyenne) / std")

# Feature 8-10: Indicateurs binaires
if 'nb_total_de_retraits' in df.columns:
    p10 = df_enriched['nb_total_de_retraits'].quantile(0.1)
    p90 = df_enriched['nb_total_de_retraits'].quantile(0.9)
    df_enriched['is_low_activity'] = (df_enriched['nb_total_de_retraits'] <= p10).astype(int)
    df_enriched['is_high_activity'] = (df_enriched['nb_total_de_retraits'] >= p90).astype(int)
    features_created.extend(['is_low_activity', 'is_high_activity'])
    print(f"‚úÖ is_low_activity (P10={p10:.0f}), is_high_activity (P90={p90:.0f})")

if 'dab_hos_site' in df.columns:
    # Off-site indicator. The comparison is case-insensitive so that this feature
    # agrees with generate_explanation(), which tests
    # str(row['dab_hos_site']).upper() == 'H'. Missing values are stringified
    # to 'NAN' by astype(str).str.upper() and therefore still map to 0.
    site_code = df_enriched['dab_hos_site'].astype(str).str.upper()
    df_enriched['is_hors_site'] = (site_code == 'H').astype(int)
    features_created.append('is_hors_site')
    print("‚úÖ is_hors_site = 1 si hors site, 0 sinon")

print(f"\nüìä Total : {len(features_created)} features cr√©√©es")

if features_created:
    print("\nüìà Statistiques des nouvelles features :")
    display(df_enriched[features_created].describe(percentiles=[.05,.25,.5,.75,.95]).T)

---
# ⚖️ SECTION 5 : Préparation et Normalisation

In [None]:
# S√©lection des features pour la mod√©lisation
model_features = [
    'montant_total',
    'nb_total_de_retraits',
    'montant_moyen_par_retrait',
    'ratio_montant_vs_median',
    'ratio_nb_vs_median',
    'ratio_observe_vs_attendu',
    'log_montant',
    'log_nb_retraits',
    'zscore_montant',
    'is_low_activity',
    'is_high_activity',
    'is_hors_site'
]
model_features = [f for f in model_features if f in df_enriched.columns]

print(f"üìã Features s√©lectionn√©es ({len(model_features)}) :")
for i, f in enumerate(model_features, 1):
    print(f"  {i:2d}. {f}")

# Dataframe de mod√©lisation
df_model = df_enriched[model_features].copy()

# Cleaning: turn +/-inf (e.g. a ratio against a zero median) into NaN, then
# impute every numeric column with its own median.
df_model = df_model.replace([np.inf, -np.inf], np.nan)
numeric_cols = df_model.select_dtypes(include=[np.number]).columns

# The result is assigned back instead of calling
# df_model[col].fillna(..., inplace=True): that chained in-place form acts on a
# temporary object under pandas Copy-on-Write and leaves df_model unchanged,
# and the corresponding warning is hidden by the global warnings filter.
df_model[numeric_cols] = df_model[numeric_cols].fillna(df_model[numeric_cols].median())

# A column that is entirely NaN has a NaN median; fall back to 0 so that the
# scaler and the detectors never receive missing values.
df_model[numeric_cols] = df_model[numeric_cols].fillna(0)

print(f"\n‚úÖ Dataset pr√™t : {df_model.shape[0]:,} √ó {df_model.shape[1]}")
print(f"üíæ M√©moire : {df_model.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Normalisation avec RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(df_model)
df_scaled = pd.DataFrame(X_scaled, columns=df_model.columns, index=df_model.index)

print(f"\n‚öñÔ∏è Normalisation effectu√©e (RobustScaler - robuste aux outliers)")

---
# 🤖 SECTION 6 : Détection d'Anomalies Multi-Algorithmes

## 6.1 Isolation Forest

In [None]:
print("="*80)
print(" "*25 + "üå≤ ISOLATION FOREST")
print("="*80)

# Parameters
# Expected share of anomalies; reused by the LOF and One-Class SVM cells.
CONTAMINATION = 0.05  # 5% of rows expected to be anomalous

# Model
iso_forest = IsolationForest(
    n_estimators=500,
    contamination=CONTAMINATION,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# Fit and predict on the scaled feature matrix.
# fit_predict returns -1 for outliers and 1 for inliers; score_samples is lower
# for more abnormal rows, so it is negated to make "higher = more anomalous".
predictions_if = iso_forest.fit_predict(df_scaled)
scores_if = -iso_forest.score_samples(df_scaled)

# Results: binary flag (1 = anomaly) and continuous score stored on df_enriched
df_enriched['if_anomaly'] = (predictions_if == -1).astype(int)
df_enriched['if_score'] = scores_if

n_anomalies_if = df_enriched['if_anomaly'].sum()
pct_if = (n_anomalies_if / len(df_enriched) * 100).round(2)

print(f"\n‚úÖ D√©tection termin√©e")
print(f"üî¥ Anomalies : {n_anomalies_if:,} ({pct_if}%)")
print(f"üü¢ Normaux   : {len(df_enriched) - n_anomalies_if:,} ({100-pct_if}%)")

## 6.2 Local Outlier Factor (LOF)

In [None]:
print("="*80)
print(" "*25 + "üìç LOCAL OUTLIER FACTOR")
print("="*80)

# LOF model
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=CONTAMINATION,
    n_jobs=-1
)

# Prediction
# fit_predict returns -1 for outliers and 1 for inliers. negative_outlier_factor_
# is the opposite of the LOF of the training samples, so negating it gives a
# score where higher means more anomalous.
predictions_lof = lof.fit_predict(df_scaled)
scores_lof = -lof.negative_outlier_factor_

# Results: binary flag (1 = anomaly) and continuous score stored on df_enriched
df_enriched['lof_anomaly'] = (predictions_lof == -1).astype(int)
df_enriched['lof_score'] = scores_lof

n_anomalies_lof = df_enriched['lof_anomaly'].sum()
pct_lof = (n_anomalies_lof / len(df_enriched) * 100).round(2)

print(f"\n‚úÖ D√©tection termin√©e")
print(f"üî¥ Anomalies : {n_anomalies_lof:,} ({pct_lof}%)")
print(f"üü¢ Normaux   : {len(df_enriched) - n_anomalies_lof:,} ({100-pct_lof}%)")

## 6.3 One-Class SVM

In [None]:
print("="*80)
print(" "*25 + "üéØ ONE-CLASS SVM")
print("="*80)

# One-Class SVM model
oc_svm = OneClassSVM(
    nu=CONTAMINATION,  # nu is set to the same value as the expected contamination
    kernel='rbf',
    gamma='auto'
)

# Prediction
# fit_predict returns -1 for outliers and 1 for inliers; the raw score is negated
# so that, like the other detectors here, higher means more anomalous.
# NOTE(review): kernel SVM training cost grows quickly with the number of rows —
# confirm this cell stays tractable on the full dataset.
predictions_svm = oc_svm.fit_predict(df_scaled)
scores_svm = -oc_svm.score_samples(df_scaled)

# Results: binary flag (1 = anomaly) and continuous score stored on df_enriched
df_enriched['svm_anomaly'] = (predictions_svm == -1).astype(int)
df_enriched['svm_score'] = scores_svm

n_anomalies_svm = df_enriched['svm_anomaly'].sum()
pct_svm = (n_anomalies_svm / len(df_enriched) * 100).round(2)

print(f"\n‚úÖ D√©tection termin√©e")
print(f"üî¥ Anomalies : {n_anomalies_svm:,} ({pct_svm}%)")
print(f"üü¢ Normaux   : {len(df_enriched) - n_anomalies_svm:,} ({100-pct_svm}%)")

## 6.4 Consensus Multi-Algorithmes

In [None]:
print("="*80)
print(" "*22 + "üéØ CONSENSUS MULTI-ALGORITHMES")
print("="*80)

# Comptage du nombre d'algorithmes d√©tectant une anomalie
df_enriched['n_algos_detecting'] = (
    df_enriched['if_anomaly'] +
    df_enriched['lof_anomaly'] +
    df_enriched['svm_anomaly']
)

# Cat√©gorisation
df_enriched['anomaly_level'] = df_enriched['n_algos_detecting'].map({
    0: 'Normal',
    1: 'Anomalie_Faible',
    2: 'Anomalie_Mod√©r√©e',
    3: 'Anomalie_Forte'
})

# Composite score: mean of the three detector scores, each min-max scaled to [0, 1]
def _minmax_normalize(scores):
    """Scale a score Series linearly to the [0, 1] range.

    Parameters
    ----------
    scores : pandas.Series
        Raw anomaly scores (higher = more anomalous).

    Returns
    -------
    pandas.Series
        Scores rescaled so that the minimum maps to 0 and the maximum to 1,
        on the same index. When the scores carry no spread (constant or not
        finite), a Series of zeros is returned instead of dividing by zero,
        so one degenerate detector cannot turn the composite score into NaN.
    """
    lowest = scores.min()
    spread = scores.max() - lowest
    if not np.isfinite(spread) or spread == 0:
        return pd.Series(0.0, index=scores.index)
    return (scores - lowest) / spread


if_norm = _minmax_normalize(df_enriched['if_score'])
lof_norm = _minmax_normalize(df_enriched['lof_score'])
svm_norm = _minmax_normalize(df_enriched['svm_score'])

df_enriched['composite_score'] = (if_norm + lof_norm + svm_norm) / 3

# Statistiques
print("\nüìä R√©partition par niveau :")
level_counts = df_enriched['anomaly_level'].value_counts()
level_pcts = (level_counts / len(df_enriched) * 100).round(2)

summary = pd.DataFrame({
    'Nombre': level_counts,
    'Pourcentage': level_pcts
})
display(summary)

# Matrice de concordance
print("\nüìã Concordance entre algorithmes :")
print(f"IF vs LOF : {((df_enriched['if_anomaly'] == df_enriched['lof_anomaly']).sum() / len(df_enriched) * 100):.1f}%")
print(f"IF vs SVM : {((df_enriched['if_anomaly'] == df_enriched['svm_anomaly']).sum() / len(df_enriched) * 100):.1f}%")
print(f"LOF vs SVM: {((df_enriched['lof_anomaly'] == df_enriched['svm_anomaly']).sum() / len(df_enriched) * 100):.1f}%")

# Visualisation
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distribution per anomaly level.
# value_counts() orders levels by frequency, so a positional colour list can
# paint a level with the wrong colour (e.g. when a level is absent). Bars are
# therefore ordered by the number of detecting algorithms, and each colour is
# looked up from that vote count.
vote_palette = {0: 'lightgreen', 1: 'yellow', 2: 'orange', 3: 'red'}
level_by_votes = df_enriched.groupby('n_algos_detecting')['anomaly_level'].first()
ordered_counts = level_counts.reindex(level_by_votes.values)
bar_colors = [vote_palette[int(votes)] for votes in level_by_votes.index]

ordered_counts.plot(kind='bar', ax=axes[0], color=bar_colors)
axes[0].set_title('Distribution par niveau d\'anomalie')
axes[0].set_ylabel('Nombre de DAB')
axes[0].set_xlabel('')
axes[0].tick_params(axis='x', rotation=45)

# Distribution du score composite
for level in ['Normal', 'Anomalie_Faible', 'Anomalie_Mod√©r√©e', 'Anomalie_Forte']:
    if level in df_enriched['anomaly_level'].values:
        subset = df_enriched[df_enriched['anomaly_level'] == level]['composite_score']
        axes[1].hist(subset, bins=30, alpha=0.6, label=level)
axes[1].set_xlabel('Score composite')
axes[1].set_ylabel('Fr√©quence')
axes[1].set_title('Distribution du score composite')
axes[1].legend()

plt.tight_layout()
plt.show()

## 6.5 Visualisation PCA

In [None]:
# Dimensionality reduction with PCA: project the scaled features onto the first
# two principal components, for visualisation only.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(df_scaled)

# Store the 2-D coordinates on df_enriched so the plots can filter by anomaly level.
df_enriched['PC1'] = X_pca[:, 0]
df_enriched['PC2'] = X_pca[:, 1]

var_exp = pca.explained_variance_ratio_
print(f"\nüìà Variance expliqu√©e : PC1={var_exp[0]*100:.1f}% | PC2={var_exp[1]*100:.1f}% | Total={sum(var_exp)*100:.1f}%")

# Visualisation
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: Par niveau d'anomalie
colors_map = {'Normal': 'green', 'Anomalie_Faible': 'yellow',
              'Anomalie_Mod√©r√©e': 'orange', 'Anomalie_Forte': 'red'}
for level, color in colors_map.items():
    mask = df_enriched['anomaly_level'] == level
    if mask.any():
        axes[0].scatter(df_enriched.loc[mask, 'PC1'],
                       df_enriched.loc[mask, 'PC2'],
                       c=color, alpha=0.5, s=30, label=level)
axes[0].set_xlabel(f'PC1 ({var_exp[0]*100:.1f}%)')
axes[0].set_ylabel(f'PC2 ({var_exp[1]*100:.1f}%)')
axes[0].set_title('Anomalies dans l\'espace PCA')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Plot 2: Score composite
scatter = axes[1].scatter(df_enriched['PC1'], df_enriched['PC2'],
                         c=df_enriched['composite_score'],
                         cmap='RdYlGn_r', alpha=0.6, s=30)
axes[1].set_xlabel(f'PC1 ({var_exp[0]*100:.1f}%)')
axes[1].set_ylabel(f'PC2 ({var_exp[1]*100:.1f}%)')
axes[1].set_title('Score composite')
plt.colorbar(scatter, ax=axes[1], label='Score')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

---
# 💡 SECTION 7 : Interprétabilité et Explications

In [None]:
print("="*80)
print(" "*20 + "üí° G√âN√âRATION DES EXPLICATIONS")
print("="*80)

# Calcul des percentiles de r√©f√©rence
percentiles = {}
ref_cols = ['montant_moyen_par_retrait', 'montant_total', 'nb_total_de_retraits', 'ratio_observe_vs_attendu']
ref_cols = [c for c in ref_cols if c in df_enriched.columns]

for col in ref_cols:
    percentiles[(col, 'p05')] = df_enriched[col].quantile(0.05)
    percentiles[(col, 'p95')] = df_enriched[col].quantile(0.95)

# Fonction d'explication
def generate_explanation(row):
    """Build a human-readable explanation string for one row.

    Each monitored numeric column is compared with the P05/P95 thresholds
    stored in the module-level ``percentiles`` dict (keys are
    ``(column, 'p05')`` / ``(column, 'p95')``). The off-site code and the
    low/high activity flags are then checked. Columns absent from the row are
    skipped, as are missing numeric values.

    Parameters
    ----------
    row : pandas.Series
        One record, indexed by column name.

    Returns
    -------
    str
        The matching reasons joined by " | ", or a generic fallback label when
        no rule fires.
    """
    # (column, message when below P05, message when above P95), in output order
    threshold_rules = (
        ('montant_moyen_par_retrait',
         "Montant moyen tr√®s faible (fractionnement suspect)",
         "Montant moyen tr√®s √©lev√© (retraits unitaires atypiques)"),
        ('montant_total',
         "Volume total tr√®s faible",
         "Volume total tr√®s √©lev√©"),
        ('nb_total_de_retraits',
         "Tr√®s faible activit√© transactionnelle",
         "Activit√© transactionnelle tr√®s √©lev√©e"),
    )
    # (flag column, message when the flag equals 1), in output order
    flag_rules = (
        ('is_low_activity', "DAB √† tr√®s faible activit√© (P10)"),
        ('is_high_activity', "DAB √† tr√®s forte activit√© (P90)"),
    )

    found = []

    for column, low_msg, high_msg in threshold_rules:
        if column not in row.index:
            continue
        value = row[column]
        if pd.isna(value):
            continue
        if value < percentiles.get((column, 'p05'), 0):
            found.append(low_msg)
        elif value > percentiles.get((column, 'p95'), float('inf')):
            found.append(high_msg)

    # Site type: case-insensitive match on the 'H' code
    if 'dab_hos_site' in row.index:
        if str(row['dab_hos_site']).upper() == 'H':
            found.append("DAB hors site (profil de risque diff√©rent)")

    for flag, message in flag_rules:
        if flag in row.index and row[flag] == 1:
            found.append(message)

    if not found:
        return "Profil rare multi-dimensionnel"
    return " | ".join(found)

# G√©n√©rer explications pour les anomalies
df_enriched['explanation'] = df_enriched.apply(
    lambda row: generate_explanation(row) if row['anomaly_level'] != 'Normal' else '',
    axis=1
)

print("\n‚úÖ Explications g√©n√©r√©es pour toutes les anomalies")

In [None]:
# Top anomalies avec explications
print("="*80)
print(" "*22 + "üîù TOP 50 ANOMALIES D√âTECT√âES")
print("="*80)

anomalies = df_enriched[df_enriched['anomaly_level'] != 'Normal'].copy()
print(f"\nüìä Total anomalies : {len(anomalies):,}")
print(f"   - Fortes   : {(df_enriched['anomaly_level'] == 'Anomalie_Forte').sum():,}")
print(f"   - Mod√©r√©es : {(df_enriched['anomaly_level'] == 'Anomalie_Mod√©r√©e').sum():,}")
print(f"   - Faibles  : {(df_enriched['anomaly_level'] == 'Anomalie_Faible').sum():,}")

if len(anomalies) > 0:
    # Trier par score composite
    anomalies_sorted = anomalies.sort_values('composite_score', ascending=False)

    # Colonnes √† afficher
    display_cols = [
        'num_automate', 'lib_site_implementation', 'code_banque', 'type_carte',
        'montant_total', 'nb_total_de_retraits', 'montant_moyen_par_retrait',
        'dab_hos_site', 'anomaly_level', 'composite_score', 'n_algos_detecting', 'explanation'
    ]
    display_cols = [c for c in display_cols if c in anomalies_sorted.columns]

    print("\nüìã TOP 50 ANOMALIES (tri√©es par score d√©croissant) :")
    pd.set_option('display.max_colwidth', 100)
    display(anomalies_sorted[display_cols].head(50))
    pd.set_option('display.max_colwidth', 50)

    # Statistiques sur les raisons
    print("\nüìä Raisons d'anomalies les plus fr√©quentes :")
    all_reasons = []
    for expl in anomalies['explanation']:
        if pd.notna(expl) and expl:
            all_reasons.extend([r.strip() for r in expl.split('|')])

    if all_reasons:
        reason_counts = pd.Series(all_reasons).value_counts().head(10)
        reason_pcts = (reason_counts / len(anomalies) * 100).round(2)

        reason_summary = pd.DataFrame({
            'Nombre': reason_counts,
            'Pourcentage': reason_pcts
        })
        display(reason_summary)

---
# ✅ SECTION 8 : Validation Statistique

In [None]:
print("="*80)
print(" "*20 + "üìä VALIDATION STATISTIQUE")
print("="*80)

# Comparaison distributions
val_cols = [c for c in ['montant_total', 'nb_total_de_retraits',
                        'montant_moyen_par_retrait', 'ratio_observe_vs_attendu']
           if c in df_enriched.columns]

if val_cols and len(anomalies) > 0:
    normal_data = df_enriched[df_enriched['anomaly_level'] == 'Normal']

    comparison = pd.DataFrame({
        'mean_normal': normal_data[val_cols].mean(),
        'mean_anomaly': anomalies[val_cols].mean(),
        'median_normal': normal_data[val_cols].median(),
        'median_anomaly': anomalies[val_cols].median(),
        'p95_normal': normal_data[val_cols].quantile(0.95),
        'p95_anomaly': anomalies[val_cols].quantile(0.95)
    })

    print("\nüìà Comparaison Normal vs Anomalie :")
    display(comparison)

    # Tests statistiques (Mann-Whitney U)
    print("\nüî¨ Tests statistiques (Mann-Whitney U) :")
    print("H0: Distributions identiques | H1: Distributions diff√©rentes\n")

    # One two-sided Mann-Whitney U test per variable (normal vs anomalous rows).
    # A variable that cannot be tested is reported explicitly instead of being
    # dropped silently, so the "x/y significant" summary is not misleading.
    test_results = []
    for col in val_cols:
        normal_sample = normal_data[col].dropna()
        anomaly_sample = anomalies[col].dropna()

        if normal_sample.empty or anomaly_sample.empty:
            print(f"   [!] Test Mann-Whitney ignore pour '{col}' : echantillon vide")
            continue

        try:
            # Only the p-value is used; the U statistic is discarded.
            _, p_value = stats.mannwhitneyu(
                normal_sample,
                anomaly_sample,
                alternative='two-sided'
            )
        except ValueError as exc:
            # Raised by SciPy for degenerate inputs; other errors propagate.
            print(f"   [!] Test Mann-Whitney impossible pour '{col}' : {exc}")
            continue

        test_results.append({
            'Variable': col,
            'p-value': p_value,
            'Significatif (Œ±=0.05)': 'Oui ‚úì' if p_value < 0.05 else 'Non ‚úó'
        })

    if test_results:
        test_df = pd.DataFrame(test_results)
        display(test_df)

        sig_count = (test_df['p-value'] < 0.05).sum()
        print(f"\n‚úÖ {sig_count}/{len(test_results)} variables avec diff√©rence significative")

In [None]:
# Test de stabilit√©
print("\n" + "="*80)
print(" "*22 + "üîÑ TEST DE STABILIT√â")
print("="*80)

def get_top_anomalies_by_contamination(contamination, top_n=30):
    """Return the strongest anomalies actually flagged at a contamination level.

    An Isolation Forest is refitted on ``df_scaled`` with the requested
    ``contamination``. Only rows the model flags (prediction == -1) are kept,
    and among those the ``top_n`` highest anomaly scores are returned.

    Filtering on the prediction is what makes ``contamination`` matter:
    ``score_samples`` does not depend on it (contamination only moves the
    decision threshold), so with a fixed ``random_state`` a ranking of all rows
    by score alone is identical at every level and the comparison is trivial.

    Parameters
    ----------
    contamination : float
        Expected proportion of outliers passed to ``IsolationForest``.
    top_n : int, default 30
        Maximum number of flagged rows to keep, by decreasing score.

    Returns
    -------
    set
        ``num_automate`` values of the selected rows when that column exists in
        ``df_enriched``, otherwise their index labels. The set can hold fewer
        than ``top_n`` elements when fewer rows are flagged or when identifiers
        repeat.
    """
    iso_tmp = IsolationForest(
        n_estimators=500,
        contamination=contamination,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
    pred_tmp = iso_tmp.fit_predict(df_scaled)
    score_tmp = -iso_tmp.score_samples(df_scaled)  # higher = more anomalous

    # Row positions flagged at this level, ranked by decreasing score. The
    # stable sort keeps the first occurrence on ties.
    flagged_pos = np.flatnonzero(pred_tmp == -1)
    ranked_pos = flagged_pos[np.argsort(-score_tmp[flagged_pos], kind='stable')]
    top_pos = ranked_pos[:top_n]

    # Positional lookups avoid copying df_enriched and stay correct whatever
    # its index is.
    if 'num_automate' in df_enriched.columns:
        return set(df_enriched['num_automate'].iloc[top_pos].values)
    return set(df_enriched.index[top_pos])

# Test avec diff√©rents niveaux
contamination_levels = [0.03, 0.05, 0.08, 0.10]
anomaly_sets = {}

print(f"\nüéØ Test avec {len(contamination_levels)} niveaux de contamination\n")
for cont in contamination_levels:
    anomaly_sets[cont] = get_top_anomalies_by_contamination(cont, top_n=30)
    print(f"   {cont*100}% : {len(anomaly_sets[cont])} anomalies")

# Anomalies core (d√©tect√©es par tous)
core_anomalies = set.intersection(*anomaly_sets.values())
print(f"\nüéØ Anomalies CORE (d√©tect√©es √† tous les niveaux) : {len(core_anomalies)}")

# Intersection
print("\nüîó Intersections :")
print(f"   3% ‚à© 5%  : {len(anomaly_sets[0.03] & anomaly_sets[0.05])}")
print(f"   5% ‚à© 8%  : {len(anomaly_sets[0.05] & anomaly_sets[0.08])}")
print(f"   8% ‚à© 10% : {len(anomaly_sets[0.08] & anomaly_sets[0.10])}")

print("\nüí° Les anomalies CORE sont les plus robustes et critiques")

---
# 💾 SECTION 9 : Export des Résultats

⚠️ **ACTION REQUISE** : Créez d'abord un dataset vide dans Dataiku, puis modifiez `OUTPUT_DATASET_NAME`

In [None]:
# Build the export frame: pick the columns worth publishing (only those that
# exist in df_enriched), then rename the technical result columns.
identifier_cols = ['num_automate', 'lib_site_implementation', 'code_banque', 'code_postale_emplacement']
business_cols = ['type_carte', 'montant_total', 'nb_total_de_retraits', 'dab_hos_site']
engineered_cols = ['montant_moyen_par_retrait', 'ratio_montant_vs_median', 'ratio_observe_vs_attendu']
detection_cols = [
    'if_anomaly', 'lof_anomaly', 'svm_anomaly', 'anomaly_level', 'composite_score',
    'if_score', 'lof_score', 'svm_score', 'n_algos_detecting',
]
explanation_cols = ['explanation']

candidate_cols = (
    identifier_cols + business_cols + engineered_cols + detection_cols + explanation_cols
)

# Keep the declared order, dropping anything absent from df_enriched
export_cols = []
for name in candidate_cols:
    if name in df_enriched.columns:
        export_cols.append(name)

df_export = df_enriched.loc[:, export_cols].copy()

# Clearer names for the published dataset
rename_map = dict(
    if_anomaly='anomaly_isolation_forest',
    lof_anomaly='anomaly_lof',
    svm_anomaly='anomaly_svm',
    anomaly_level='niveau_anomalie',
    composite_score='score_composite',
    explanation='explication',
)
df_export = df_export.rename(columns=rename_map)

print("üì¶ Dataset d'export pr√©par√©")
print(f"üìè Dimensions : {df_export.shape[0]:,} √ó {df_export.shape[1]}")
print(f"\nüìã Colonnes export√©es ({len(df_export.columns)}) :")
for i, col in enumerate(df_export.columns, 1):
    print(f"  {i:2d}. {col}")

In [None]:
# Export vers Dataiku
OUTPUT_DATASET_NAME = "DAB_ANOMALIES_RESULTATS"  # ‚ö†Ô∏è MODIFIER ICI

try:
    output_dataset = dataiku.Dataset(OUTPUT_DATASET_NAME)
    output_dataset.write_with_schema(df_export)

    print(f"\n‚úÖ Export r√©ussi vers '{OUTPUT_DATASET_NAME}'")
    print(f"üìä {len(df_export):,} lignes export√©es")
    print(f"\nüìà Statistiques :")
    print(f"   - Anomalies fortes   : {(df_export['niveau_anomalie'] == 'Anomalie_Forte').sum():,}")
    print(f"   - Anomalies mod√©r√©es : {(df_export['niveau_anomalie'] == 'Anomalie_Mod√©r√©e').sum():,}")
    print(f"   - Anomalies faibles  : {(df_export['niveau_anomalie'] == 'Anomalie_Faible').sum():,}")
    print(f"   - Normaux            : {(df_export['niveau_anomalie'] == 'Normal').sum():,}")

except Exception as e:
    print(f"\n‚ùå Erreur d'export : {e}")
    print("\nüí° Solutions :")
    print("   1. Cr√©ez le dataset dans Dataiku")
    print("   2. V√©rifiez le nom du dataset")
    print("   3. V√©rifiez vos permissions")
    print("\nüì¶ Les r√©sultats restent disponibles dans df_export")

---
# 📊 SECTION 10 : Résumé Exécutif et Recommandations

In [None]:
print("="*80)
print(" "*15 + "üìä R√âSUM√â EX√âCUTIF - D√âTECTION D'ANOMALIES DAB")
print("="*80)

# 1. Volum√©trie
print("\n1Ô∏è‚É£ VOLUM√âTRIE")
print("="*60)
print(f"Total DAB analys√©s           : {len(df):,}")
print(f"Features utilis√©es           : {len(model_features)}")
print(f"Algorithmes de d√©tection     : 3 (IF + LOF + One-Class SVM)")

# 2. R√©sultats
print("\n2Ô∏è‚É£ R√âSULTATS DE D√âTECTION")
print("="*60)
n_forte = (df_enriched['anomaly_level'] == 'Anomalie_Forte').sum()
n_moderee = (df_enriched['anomaly_level'] == 'Anomalie_Mod√©r√©e').sum()
n_faible = (df_enriched['anomaly_level'] == 'Anomalie_Faible').sum()
n_normal = (df_enriched['anomaly_level'] == 'Normal').sum()

print(f"üî¥ Anomalies FORTES (3 algos)   : {n_forte:,} ({n_forte/len(df)*100:.2f}%)")
print(f"üü† Anomalies MOD√âR√âES (2 algos) : {n_moderee:,} ({n_moderee/len(df)*100:.2f}%)")
print(f"üü° Anomalies FAIBLES (1 algo)   : {n_faible:,} ({n_faible/len(df)*100:.2f}%)")
print(f"üü¢ DAB NORMAUX                  : {n_normal:,} ({n_normal/len(df)*100:.2f}%)")

# 3. Principaux patterns
print("\n3Ô∏è‚É£ PATTERNS D'ANOMALIES IDENTIFI√âS")
print("="*60)
# Top 5 explanation reasons; skipped when the reasons list was never built
# (the cell computing all_reasons did not run) or is empty.
if 'all_reasons' in locals() and all_reasons:
    top_reasons = pd.Series(all_reasons).value_counts().head(5)
    for i, (reason, count) in enumerate(top_reasons.items(), 1):
        # Series.items() yields plain Python ints, so the percentage is a
        # Python float: use the built-in round(); float has no .round() method.
        pct = round(count / len(anomalies) * 100, 1)
        print(f"{i}. {reason}")
        print(f"   ‚Üí {count:,} DAB ({pct}% des anomalies)\n")

# 4. Stabilit√©
print("4Ô∏è‚É£ ROBUSTESSE DES D√âTECTIONS")
print("="*60)
if 'core_anomalies' in locals():
    print(f"Anomalies CORE (stables) : {len(core_anomalies)} DAB")
    print(f"‚Üí D√©tect√©es peu importe le param√©trage (haute confiance)")

# 5. Recommandations
print("\n" + "="*80)
print("5Ô∏è‚É£ RECOMMANDATIONS OP√âRATIONNELLES")
print("="*80)
print("""
üéØ PRIORISATION DES ACTIONS :

PRIORIT√â 1 - Anomalies FORTES (consensus 3 algorithmes)
   ‚îú‚îÄ Investigation imm√©diate obligatoire
   ‚îú‚îÄ V√©rification logs syst√®me et historique
   ‚îú‚îÄ Audit de s√©curit√© si suspicion fraude
   ‚îî‚îÄ Blocage pr√©ventif si risque √©lev√©

PRIORIT√â 2 - Anomalies CORE (stables)
   ‚îú‚îÄ Anomalies persistantes = action requise
   ‚îú‚îÄ Analyse des causes racines
   ‚îî‚îÄ Plan d'action correctif

PRIORIT√â 3 - Anomalies MOD√âR√âES (2 algorithmes)
   ‚îú‚îÄ Surveillance renforc√©e
   ‚îú‚îÄ Investigation si r√©currence
   ‚îî‚îÄ Cat√©gorisation m√©tier

PRIORIT√â 4 - Anomalies FAIBLES (1 algorithme)
   ‚îú‚îÄ Monitoring passif
   ‚îî‚îÄ Escalade si √©volution vers niveau sup√©rieur

üîÑ ACTIONS DE SUIVI :

1. Monitoring continu
   ‚îî‚îÄ R√©-ex√©cution hebdomadaire/mensuelle de l'analyse

2. Enrichissement des donn√©es
   ‚îú‚îÄ Ajouter donn√©es temporelles (jour/heure/saison)
   ‚îú‚îÄ Int√©grer g√©olocalisation enrichie
   ‚îú‚îÄ Historiser pour d√©tection de tendances
   ‚îî‚îÄ Ajouter donn√©es externes (m√©t√©o, √©v√©nements)

3. Feedback m√©tier
   ‚îú‚îÄ Valider anomalies avec √©quipes terrain
   ‚îú‚îÄ Documenter faux positifs
   ‚îú‚îÄ Ajuster seuils si n√©cessaire
   ‚îî‚îÄ Cr√©er taxonomie des anomalies

4. Am√©lioration continue
   ‚îú‚îÄ Tester algorithmes suppl√©mentaires (DBSCAN, Autoencoder)
   ‚îú‚îÄ Feature engineering avanc√© (r√©seaux, saisonnalit√©)
   ‚îú‚îÄ Syst√®me d'alerting automatique
   ‚îî‚îÄ Dashboard op√©rationnel temps r√©el

üí° INDICATEURS CL√âS √Ä SURVEILLER :

- Taux de confirmation des anomalies par les √©quipes m√©tier
- Temps moyen de r√©solution par niveau d'anomalie
- Impact financier des anomalies d√©tect√©es
- √âvolution du nombre d'anomalies dans le temps
""")

print("="*80)
print(" "*25 + "‚úÖ ANALYSE TERMIN√âE")
print("="*80)
print(f"\nüìÖ Date : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üíæ R√©sultats : df_export ({len(df_export):,} lignes)")
print(f"üìÅ Dataset Dataiku : {OUTPUT_DATASET_NAME}")
print("\nüéâ Analyse compl√®te et valid√©e !")
print("\nüí° Prochaines √©tapes sugg√©r√©es :")
print("   1. Examiner les top anomalies avec les √©quipes m√©tier")
print("   2. Valider les explications g√©n√©r√©es")
print("   3. Cr√©er un plan d'action par niveau d'anomalie")
print("   4. Programmer l'ex√©cution automatique hebdomadaire")
print("   5. Cr√©er un dashboard de suivi des anomalies")

---

# 📝 Notes Techniques

## Méthodologie

Cette analyse implémente une approche **multi-algorithmes robuste** :

### ✅ Points Forts

1. **Validation croisée** : 3 algorithmes complémentaires (IF + LOF + One-Class SVM)
2. **Feature engineering** : 10+ features métier créées
3. **Normalisation robuste** : RobustScaler résistant aux outliers
4. **Tests de stabilité** : Vérification de la robustesse des détections
5. **Validation statistique** : Tests Mann-Whitney U
6. **Interprétabilité** : Explications automatiques par anomalie

### 🔧 Paramètres Ajustables

- `CONTAMINATION` : Taux d'anomalies attendu (défaut 5%)
- `n_neighbors` (LOF) : Nombre de voisins (défaut 20)
- `nu` (One-Class SVM) : Borne supérieure de la fraction d'outliers (fixée à `CONTAMINATION` dans ce notebook)
- Features sélectionnées : Adapter selon les données disponibles

### 📚 Pour Aller Plus Loin

**Améliorations possibles** :

1. **Analyse temporelle** : Intégrer séries temporelles et saisonnalité
2. **Clustering** : Segmenter les DAB avant détection (urbain/rural)
3. **Deep Learning** : Autoencoders pour patterns complexes
4. **Graphes** : Analyser les réseaux de DAB géographiques
5. **Ensemble stacking** : Combiner les prédictions de manière optimale

---

**Auteur** : Data Science Team  
**Version** : 2.0  
**Contact** : Pour questions ou am√©liorations