# Préparation et Nettoyage du Dataset CIC-IDS2017
Ce notebook prépare le dataset **CIC-IDS2017** pour une utilisation dans un dashboard (Power BI, Streamlit, Dash, etc.).

Il intègre les meilleures pratiques : inspection, nettoyage, simplification des labels, normalisation et export du jeu de données nettoyé.

## 1️⃣ Importation des librairies nécessaires

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import ipaddress

## 2️⃣ Chargement du fichier CSV

In [12]:
def load_security_data(file_path):
    """
    Charge les données de sécurité depuis un fichier
    Supporte CSV, JSON, Excel
    """
    if file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif file_path.endswith('.json'):
        df = pd.read_json(file_path)
    elif file_path.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Format de fichier non supporté")
    
    print(f"✅ Données chargées: {df.shape[0]} lignes, {df.shape[1]} colonnes")
    return df

# Exemple d'utilisation
df = load_security_data('Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')

✅ Données chargées: 225745 lignes, 79 colonnes


## 3️⃣ Exploration initiale des données

In [13]:
# Exploration Initiale
def explore_data(df):
    print("📊 INFORMATION GÉNÉRALE:")
    print(df.info())
    
    print("\n🔍 PREMIÈRES LIGNES:")
    display(df.head())
    
    print("\n📈 STATISTIQUES DESCRIBE:")
    display(df.describe(include='all'))
    
    print("\n❓ VALEURS MANQUANTES:")
    missing_data = df.isnull().sum()
    print(missing_data[missing_data > 0])
    
    print("\n🎯 COLONNES DISPONIBLES:")
    print(list(df.columns))

explore_data(df)

📊 INFORMATION GÉNÉRALE:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             225745 non-null  int64  
 1    Flow Duration                225745 non-null  int64  
 2    Total Fwd Packets            225745 non-null  int64  
 3    Total Backward Packets       225745 non-null  int64  
 4   Total Length of Fwd Packets   225745 non-null  int64  
 5    Total Length of Bwd Packets  225745 non-null  int64  
 6    Fwd Packet Length Max        225745 non-null  int64  
 7    Fwd Packet Length Min        225745 non-null  int64  
 8    Fwd Packet Length Mean       225745 non-null  float64
 9    Fwd Packet Length Std        225745 non-null  float64
 10  Bwd Packet Length Max         225745 non-null  int64  
 11   Bwd Packet Length Min        225745 non-null  int64  
 12   Bwd Packet Length M

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN



📈 STATISTIQUES DESCRIBE:


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,...,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745
unique,,,,,,,,,,,...,,,,,,,,,,2
top,,,,,,,,,,,...,,,,,,,,,,DDoS
freq,,,,,,,,,,,...,,,,,,,,,,128027
mean,8879.61946,16241650.0,4.874916,4.572775,939.463346,5960.477,538.535693,27.882221,164.826715,214.907242,...,21.482753,184826.1,12934.36,208084.9,177620.1,10322140.0,3611943.0,12878130.0,7755355.0,
std,19754.6474,31524370.0,15.422874,21.755356,3249.403484,39218.34,1864.128991,163.324159,504.892965,797.411073,...,4.166799,797925.0,210273.7,900235.0,784260.2,21853030.0,12756890.0,26921260.0,19831090.0,
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,80.0,71180.0,2.0,1.0,26.0,0.0,6.0,0.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
50%,80.0,1452333.0,3.0,4.0,30.0,164.0,20.0,0.0,8.666667,5.301991,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
75%,80.0,8805237.0,5.0,5.0,63.0,11601.0,34.0,6.0,32.0,10.263203,...,20.0,1878.0,0.0,1878.0,1862.0,8239725.0,0.0,8253838.0,7422849.0,



❓ VALEURS MANQUANTES:
Flow Bytes/s    4
dtype: int64

🎯 COLONNES DISPONIBLES:
[' Destination Port', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', ' SYN Fl

## 4️⃣ Analyse spécifique pour le dataset CIC-IDS2017

In [14]:
def analyze_cic_ids2017(df):
    print("=" * 70)
    print("🔍 ANALYSE CIC-IDS2017 - STRUCTURE SPÉCIFIQUE")
    print("=" * 70)
    
    print(f"📊 Dimensions: {df.shape[0]} lignes × {df.shape[1]} colonnes")
    
    # Vérifier la colonne Label (contient les types d'attaques)
    if ' Label' in df.columns:
        print(f"\n🎯 DISTRIBUTION DES ATTAQUES (colonne ' Label'):")
        label_distribution = df[' Label'].value_counts()
        print(label_distribution)
        
        # Afficher les types d'attaques
        print(f"\n🚨 TYPES D'ATTAQUES UNIQUES:")
        for label in df[' Label'].unique():
            count = (df[' Label'] == label).sum()
            print(f"  - '{label}': {count} occurrences")
    
    # Vérifier les premières lignes pour comprendre la structure
    print(f"\n📋 APERÇU DES DONNÉES (5 premières lignes):")
    display(df.head())
    
    # Vérifier les valeurs manquantes
    print(f"\n❓ VALEURS MANQUANTES PAR COLONNE:")
    missing_data = df.isnull().sum()
    missing_data = missing_data[missing_data > 0]
    if len(missing_data) > 0:
        print(missing_data)
    else:
        print("✅ Aucune valeur manquante détectée")

# Exécuter l'analyse
analyze_cic_ids2017(df)

🔍 ANALYSE CIC-IDS2017 - STRUCTURE SPÉCIFIQUE
📊 Dimensions: 225745 lignes × 79 colonnes

🎯 DISTRIBUTION DES ATTAQUES (colonne ' Label'):
 Label
DDoS      128027
BENIGN     97718
Name: count, dtype: int64

🚨 TYPES D'ATTAQUES UNIQUES:
  - 'BENIGN': 97718 occurrences
  - 'DDoS': 128027 occurrences

📋 APERÇU DES DONNÉES (5 premières lignes):


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN



❓ VALEURS MANQUANTES PAR COLONNE:
Flow Bytes/s    4
dtype: int64


## 5️⃣ Nettoyage pour le dataset CIC-IDS2017

In [15]:
# Fonction de Nettoyage des Données
def clean_cic_ids2017_data(df):
    """
    Nettoyage spécifique pour le dataset CIC-IDS2017
    Le dataset original n'a pas de colonnes IP ou timestamp explicites
    """
    df_clean = df.copy()
    
    print("🔧 DÉBUT DU NETTOYAGE CIC-IDS2017")
    print(f"Avant nettoyage: {df_clean.shape}")
    
    # 1. CORRECTION DES NOMS DE COLONNES (supprimer les espaces)
    df_clean.columns = [col.strip() for col in df_clean.columns]
    print("✅ Noms de colonnes nettoyés")
    
    # 2. GESTION DES VALEURS MANQUANTES
    # Vérifier s'il y a des valeurs manquantes
    missing_before = df_clean.isnull().sum().sum()
    if missing_before > 0:
        print(f"🧹 Suppression de {missing_before} valeurs manquantes...")
        df_clean = df_clean.dropna()
        print(f"Après suppression NA: {df_clean.shape}")
    else:
        print("✅ Aucune valeur manquante à supprimer")
    
    # 3. CRÉATION DE COLONNES SYNTHÉTIQUES POUR LE DASHBOARD
    # Comme CIC-IDS2017 n'a pas de vraies IPs/timestamps, on en crée pour la démo
    print("🎨 Création de colonnes synthétiques pour la visualisation...")
    
    # Timestamps synthétiques répartis sur 24h
    start_time = pd.Timestamp.now() - pd.Timedelta(hours=24)
    time_increments = pd.date_range(
        start=start_time, 
        periods=len(df_clean), 
        freq='S'  # Une entrée par seconde
    )
    df_clean['timestamp'] = time_increments[:len(df_clean)]
    
    # Adresses IP synthétiques pour la visualisation
    np.random.seed(42)
    df_clean['source_ip'] = [f"192.168.1.{np.random.randint(1, 50)}" for _ in range(len(df_clean))]
    df_clean['destination_ip'] = [f"10.0.0.{np.random.randint(1, 20)}" for _ in range(len(df_clean))]
    
    # 4. NETTOYAGE DE LA COLONNE LABEL
    if 'Label' in df_clean.columns:
        # Uniformiser les noms d'attaques
        df_clean['attack_type'] = df_clean['Label'].str.strip()
        
        # Créer une colonne binaire is_attack
        df_clean['is_attack'] = df_clean['attack_type'] != 'BENIGN'
        
        print(f"🎯 Distribution des attaques après nettoyage:")
        print(df_clean['attack_type'].value_counts())
    
    # 5. SUPPRESSION DES DOUBLONS EXACTS
    initial_count = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    final_count = len(df_clean)
    duplicates_removed = initial_count - final_count
    
    if duplicates_removed > 0:
        print(f"🧹 {duplicates_removed} doublons supprimés")
    
    # 6. SÉLECTION DES COLONNES IMPORTANTES POUR LE DASHBOARD
    # Garder les features les plus importantes + nos nouvelles colonnes
    important_features = [
        'timestamp', 'source_ip', 'destination_ip', 'attack_type', 'is_attack',
        'Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
        'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Flow Bytes/s',
        'Flow Packets/s', 'Flow IAT Mean', 'Fwd IAT Mean', 'Bwd IAT Mean'
    ]
    
    # Filtrer pour garder seulement les colonnes disponibles
    available_features = [col for col in important_features if col in df_clean.columns]
    df_final = df_clean[available_features]
    
    print(f"🎉 NETTOYAGE TERMINÉ: {df_final.shape[0]} lignes, {df_final.shape[1]} colonnes")
    print(f"📊 Colonnes finales: {list(df_final.columns)}")
    
    return df_final

# Exécuter le nettoyage spécifique
df_clean = clean_cic_ids2017_data(df)

🔧 DÉBUT DU NETTOYAGE CIC-IDS2017
Avant nettoyage: (225745, 79)
✅ Noms de colonnes nettoyés
🧹 Suppression de 4 valeurs manquantes...
Après suppression NA: (225741, 79)
🎨 Création de colonnes synthétiques pour la visualisation...


  time_increments = pd.date_range(


🎯 Distribution des attaques après nettoyage:
attack_type
DDoS      128027
BENIGN     97714
Name: count, dtype: int64
🎉 NETTOYAGE TERMINÉ: 225741 lignes, 16 colonnes
📊 Colonnes finales: ['timestamp', 'source_ip', 'destination_ip', 'attack_type', 'is_attack', 'Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Fwd IAT Mean', 'Bwd IAT Mean']


## 6️⃣ Vérification de la qualité des données nettoyées

In [16]:
# VÉRIFICATION DES RÉSULTATS
def verify_cleaned_data(df_clean):
    print("=" * 60)
    print("✅ VÉRIFICATION DES DONNÉES NETTOYÉES")
    print("=" * 60)
    
    print(f"📊 Shape final: {df_clean.shape}")
    print(f"🎯 Colonnes: {list(df_clean.columns)}")
    
    # Vérifier les nouvelles colonnes
    if 'timestamp' in df_clean.columns:
        print(f"\n⏰ Plage temporelle:")
        print(f"   Début: {df_clean['timestamp'].min()}")
        print(f"   Fin: {df_clean['timestamp'].max()}")
    
    if 'attack_type' in df_clean.columns:
        print(f"\n🚨 DISTRIBUTION DES ATTAQUES:")
        attack_dist = df_clean['attack_type'].value_counts()
        for attack, count in attack_dist.items():
            percentage = (count / len(df_clean)) * 100
            print(f"   {attack:<25} {count:>6} ({percentage:.1f}%)")
    
    if 'is_attack' in df_clean.columns:
        attack_rate = df_clean['is_attack'].mean() * 100
        print(f"\n📈 Taux d'attaques: {attack_rate:.2f}%")
    
    # Vérifier qu'il n'y a plus de valeurs manquantes
    missing_total = df_clean.isnull().sum().sum()
    print(f"\n🔍 Valeurs manquantes totales: {missing_total}")

# Exécuter la vérification
verify_cleaned_data(df_clean)

✅ VÉRIFICATION DES DONNÉES NETTOYÉES
📊 Shape final: (225741, 16)
🎯 Colonnes: ['timestamp', 'source_ip', 'destination_ip', 'attack_type', 'is_attack', 'Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Fwd IAT Mean', 'Bwd IAT Mean']

⏰ Plage temporelle:
   Début: 2025-10-31 16:02:14.096475
   Fin: 2025-11-03 06:44:34.096475

🚨 DISTRIBUTION DES ATTAQUES:
   DDoS                      128027 (56.7%)
   BENIGN                     97714 (43.3%)

📈 Taux d'attaques: 56.71%

🔍 Valeurs manquantes totales: 0


## 7️⃣ Création d'un Dataset Multi-Attaques à partir du DDoS

In [17]:
def enhance_ddos_with_synthetic_attacks(df_ddos):
    """Enrichit votre dataset DDoS avec d'autres attaques synthétiques"""
    print("🎨 CRÉATION D'UN DATASET MULTI-ATTAQUES...")
    
    # Faire une copie
    df_enhanced = df_ddos.copy()
    
    # Identifier le traffic normal et DDoS
    benign_mask = df_enhanced['attack_type'] == 'BENIGN'
    ddos_mask = df_enhanced['attack_type'] == 'DDoS'
    
    print(f"📊 Distribution initiale:")
    print(f"   BENIGN: {benign_mask.sum()} événements")
    print(f"   DDoS: {ddos_mask.sum()} événements")
    
    # Créer d'autres types d'attaques à partir du DDoS existant
    attack_types = ['PortScan', 'BruteForce', 'Botnet', 'WebAttack', 'Malware']
    
    # Convertir une partie des DDoS en autres attaques
    ddos_indices = df_enhanced[ddos_mask].index
    num_to_convert = len(ddos_indices) // 3  # Convertir 1/3 des DDoS
    
    if num_to_convert > 0:
        convert_indices = np.random.choice(ddos_indices, num_to_convert, replace=False)
        
        for i, idx in enumerate(convert_indices):
            new_attack = attack_types[i % len(attack_types)]
            df_enhanced.at[idx, 'attack_type'] = new_attack
            
            # Modifier légèrement les features pour correspondre au type d'attaque
            if new_attack == 'PortScan':
                df_enhanced.at[idx, 'Destination Port'] = np.random.randint(1, 1000)
            elif new_attack == 'BruteForce':
                df_enhanced.at[idx, 'Flow Duration'] = df_enhanced.at[idx, 'Flow Duration'] * 0.1  # Plus court
    
    # Réorganiser les timestamps pour étaler les attaques
    df_enhanced = df_enhanced.sort_values('timestamp').reset_index(drop=True)
    
    print(f"\n✅ DISTRIBUTION FINALE:")
    attack_counts = df_enhanced['attack_type'].value_counts()
    for attack, count in attack_counts.items():
        percentage = (count / len(df_enhanced)) * 100
        print(f"   {attack:<15}: {count:>6} ({percentage:.1f}%)")
    
    return df_enhanced

# Appliquer l'enrichissement
df_multi_attack_clean = enhance_ddos_with_synthetic_attacks(df_clean)
display(df_multi_attack_clean.head())

🎨 CRÉATION D'UN DATASET MULTI-ATTAQUES...
📊 Distribution initiale:
   BENIGN: 97714 événements
   DDoS: 128027 événements


  df_enhanced.at[idx, 'Flow Duration'] = df_enhanced.at[idx, 'Flow Duration'] * 0.1  # Plus court



✅ DISTRIBUTION FINALE:
   BENIGN         :  97714 (43.3%)
   DDoS           :  85352 (37.8%)
   Malware        :   8535 (3.8%)
   BruteForce     :   8535 (3.8%)
   WebAttack      :   8535 (3.8%)
   PortScan       :   8535 (3.8%)
   Botnet         :   8535 (3.8%)


Unnamed: 0,timestamp,source_ip,destination_ip,attack_type,is_attack,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Fwd IAT Mean,Bwd IAT Mean
0,2025-10-31 16:02:14.096475,192.168.1.39,10.0.0.2,BENIGN,False,54865,3.0,2,0,12,0,4000000.0,666666.6667,3.0,3.0,0.0
1,2025-10-31 16:02:15.096475,192.168.1.29,10.0.0.17,BENIGN,False,55054,109.0,1,1,6,6,110091.7,18348.62385,109.0,0.0,0.0
2,2025-10-31 16:02:16.096475,192.168.1.15,10.0.0.18,BENIGN,False,55055,52.0,1,1,6,6,230769.2,38461.53846,52.0,0.0,0.0
3,2025-10-31 16:02:17.096475,192.168.1.43,10.0.0.18,BENIGN,False,46236,34.0,1,1,6,6,352941.2,58823.52941,34.0,0.0,0.0
4,2025-10-31 16:02:18.096475,192.168.1.8,10.0.0.19,BENIGN,False,54863,3.0,2,0,12,0,4000000.0,666666.6667,3.0,3.0,0.0


## 8️⃣ Sauvegarde les données CIC-IDS2017 nettoyées

In [18]:
def save_cic_data(df_multi_attack_clean, sample_size=10000):
    
    # Sauvegarde des données complètes
    full_path = 'cic_ids2017_cleaned.csv'
    df_multi_attack_clean.to_csv(full_path, index=False)
    
    # Créer un échantillon pour les tests
    if len(df_multi_attack_clean) > sample_size:
        df_sample = df_multi_attack_clean.sample(n=sample_size, random_state=42)
    else:
        df_sample = df_multi_attack_clean.copy()
    
    sample_path = 'cic_ids2017_sample.csv'
    df_sample.to_csv(sample_path, index=False)
    
    print("💾 SAUVEGARDE RÉUSSIE!")
    print(f"   📁 Données complètes: {full_path} ({len(df_multi_attack_clean)} lignes)")
    print(f"   📁 Échantillon tests: {sample_path} ({len(df_sample)} lignes)")
    
    return df_sample

# Sauvegarder les données
df_sample = save_cic_data(df_multi_attack_clean)

💾 SAUVEGARDE RÉUSSIE!
   📁 Données complètes: cic_ids2017_cleaned.csv (225741 lignes)
   📁 Échantillon tests: cic_ids2017_sample.csv (10000 lignes)


## 9️⃣ Aperçu des données pour le dashboard

In [19]:
# APERÇU FINAL POUR LE DASHBOARD
print("🎊 PRÉPARATION TERMINÉE - PRÊT POUR LE DASHBOARD!")
print(f"📊 Données disponibles: {df_multi_attack_clean.shape[0]} événements de sécurité")
print(f"🎯 Colonnes disponibles pour la visualisation:")

for col in df_multi_attack_clean.columns:
    dtype = df_multi_attack_clean[col].dtype
    unique_vals = df_multi_attack_clean[col].nunique() if df_multi_attack_clean[col].dtype == 'object' else 'Numerique'
    print(f"   • {col:<25} {str(dtype):<15} ({unique_vals})")

# Aperçu des données
print(f"\n👀 APERÇU DES DONNÉES FINALES:")
display(df_multi_attack_clean.head(3))

🎊 PRÉPARATION TERMINÉE - PRÊT POUR LE DASHBOARD!
📊 Données disponibles: 225741 événements de sécurité
🎯 Colonnes disponibles pour la visualisation:
   • timestamp                 datetime64[ns]  (Numerique)
   • source_ip                 object          (49)
   • destination_ip            object          (19)
   • attack_type               object          (7)
   • is_attack                 bool            (Numerique)
   • Destination Port          int64           (Numerique)
   • Flow Duration             float64         (Numerique)
   • Total Fwd Packets         int64           (Numerique)
   • Total Backward Packets    int64           (Numerique)
   • Total Length of Fwd Packets int64           (Numerique)
   • Total Length of Bwd Packets int64           (Numerique)
   • Flow Bytes/s              float64         (Numerique)
   • Flow Packets/s            float64         (Numerique)
   • Flow IAT Mean             float64         (Numerique)
   • Fwd IAT Mean              float64      

Unnamed: 0,timestamp,source_ip,destination_ip,attack_type,is_attack,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Fwd IAT Mean,Bwd IAT Mean
0,2025-10-31 16:02:14.096475,192.168.1.39,10.0.0.2,BENIGN,False,54865,3.0,2,0,12,0,4000000.0,666666.6667,3.0,3.0,0.0
1,2025-10-31 16:02:15.096475,192.168.1.29,10.0.0.17,BENIGN,False,55054,109.0,1,1,6,6,110091.7,18348.62385,109.0,0.0,0.0
2,2025-10-31 16:02:16.096475,192.168.1.15,10.0.0.18,BENIGN,False,55055,52.0,1,1,6,6,230769.2,38461.53846,52.0,0.0,0.0
