# Exploration du dataset Google Cluster 2011

**Objectif** : Analyser les arrivées et événements des jobs

**Focus** : 
- `job_id` : Identifiant unique du job
- `event_type` : Type d'événement (SUBMIT, FINISH, FAIL, LOST, EVICT)
- `timestamp_us` : Horodatage en microsecondes

---

## 1. Configuration et imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
from pathlib import Path
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Plot styling: seaborn dark-grid theme, "husl" palette, wide default figures.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 11,
})

print("‚úÖ Imports r√©ussis")

## 2. Définition des types d'événements

In [None]:
# event_type code -> name, following the Google Cluster 2011 documentation.
# The position of a name in the tuple below is its numeric code.
EVENT_TYPES = dict(enumerate((
    'SUBMIT',          # 0: job submitted to the cluster
    'SCHEDULE',        # 1: job assigned to a machine
    'EVICT',           # 2: job evicted (preempted)
    'FAIL',            # 3: job failed
    'FINISH',          # 4: job finished successfully
    'KILL',            # 5: job killed manually
    'LOST',            # 6: job lost (machine crash)
    'UPDATE_PENDING',  # 7: update while pending
    'UPDATE_RUNNING',  # 8: update while running
)))

# Column names, in file order, applied to the header-less CSV files.
COLUMNS = [
    'timestamp',         # microseconds; NOTE(review): original note says "since epoch" - confirm the time origin
    'missing_info',      # bitmap of missing information
    'job_id',            # unique job identifier
    'event_type',        # event type code (0-8), see EVENT_TYPES
    'user',              # hash of the user
    'scheduling_class',  # scheduling class (0-3)
    'job_name',          # hash of the job name
    'logical_job_name',  # hash of the logical job name
]

# List the known codes, one per line.
print("Event types disponibles:")
print("\n".join(f"  {code}: {name}" for code, name in EVENT_TYPES.items()))

## 3. Chargement des données

In [None]:
# Locate the compressed trace files, sorted by path.
data_dir = Path('data/raw/2011')
files = sorted(data_dir.glob('part-*.csv.gz'))

print(f"üìÅ Dossier: {data_dir}")
print(f"üìä Fichiers trouv√©s: {len(files)}")

if files:
    # Preview the first five files with their on-disk size in MiB.
    print(f"\n‚úÖ Premiers fichiers:")
    for shard in files[:5]:
        megabytes = shard.stat().st_size / (1024 * 1024)
        print(f"  ‚Ä¢ {shard.name} ({megabytes:.1f} MB)")
else:
    print("\n‚ùå ERREUR: Aucun fichier trouv√©!")
    print("V√©rifiez que les fichiers part-*.csv.gz sont dans data/raw/2011/")

### 3.1 Inspection du format d'un fichier

In [None]:
# Print the first three raw lines of the first file, with the comma count and
# stripped length of each, to check the CSV layout.
if files:
    first_file = files[0]
    print(f"üîç Inspection de: {first_file.name}\n")

    with gzip.open(first_file, 'rt') as f:
        # zip with range(3) stops after three lines
        for i, line in zip(range(3), f):
            stripped = line.strip()
            print(f"Ligne {i}:")
            print(f"  {stripped[:200]}")
            print(f"  ‚Üí Virgules: {line.count(',')}, Longueur: {len(stripped)}\n")

### 3.2 Chargement d'un échantillon

In [None]:
def load_sample(filepath, n_rows=10000):
    """
    Load the first rows of one gzip-compressed, header-less CSV trace file.

    Only the ``timestamp``, ``job_id`` and ``event_type`` columns are kept;
    lines that cannot be parsed are skipped.

    Args:
        filepath: Path of the ``.csv.gz`` file to read.
        n_rows: Maximum number of rows read from the start of the file.

    Returns:
        A DataFrame with the three selected columns. If the file cannot be
        opened or parsed, the error is printed and an empty DataFrame that
        still carries those three columns is returned, so callers can select
        the columns without hitting a KeyError.
    """
    kept_columns = ['timestamp', 'job_id', 'event_type']
    try:
        with gzip.open(filepath, 'rt') as f:
            return pd.read_csv(
                f,
                header=None,
                names=COLUMNS,
                usecols=kept_columns,
                dtype={
                    'timestamp': 'int64',
                    'job_id': 'int64',
                    'event_type': 'int32'
                },
                nrows=n_rows,
                sep=',',
                on_bad_lines='skip'
            )
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with an empty result.
        print(f"‚ùå Erreur: {e}")
        return pd.DataFrame(columns=kept_columns)

# Load a 10,000-row sample from the first trace file.
# Fail with an explicit message when no input file was discovered, instead of
# the bare IndexError that files[0] would raise.
if not files:
    raise FileNotFoundError("Aucun fichier part-*.csv.gz dans data/raw/2011/")

print("Chargement d'un √©chantillon (10,000 lignes)...")
df_sample = load_sample(files[0])

print(f"\n‚úÖ Charg√©: {len(df_sample):,} lignes")
print(f"\nColonnes: {df_sample.columns.tolist()}")
print(f"\nTypes de donn√©es:")
print(df_sample.dtypes)

In [None]:
# Show the first ten rows (the trailing expression is what the notebook renders).
print("üìä Premi√®res lignes du dataset:\n")
df_sample.iloc[:10]

### 3.3 Conversion des timestamps

In [None]:
# Derive a datetime column from the microsecond timestamps; values that cannot
# be converted become NaT because of errors='coerce'.
# NOTE(review): unit='us' interprets the values as offsets from the Unix
# epoch - confirm this matches the trace's time origin.
df_sample['datetime'] = pd.to_datetime(df_sample['timestamp'], unit='us', errors='coerce')

# Readable label per event code (NaN for codes absent from EVENT_TYPES).
df_sample['event_name'] = df_sample['event_type'].map(EVENT_TYPES)

print("‚úÖ Conversion des timestamps termin√©e\n")
print("P√©riode couverte:")
first_seen = df_sample['datetime'].min()
last_seen = df_sample['datetime'].max()
print(f"  D√©but: {first_seen}")
print(f"  Fin:   {last_seen}")
print(f"  Dur√©e: {last_seen - first_seen}")

# Preview including the derived columns
preview_columns = ['job_id', 'event_type', 'event_name', 'timestamp', 'datetime']
df_sample[preview_columns].head(10)

## 4. Analyse des event_types

### 4.1 Distribution des types d'événements

In [None]:
# Number of events per event_type code, ordered by code.
event_counts = df_sample['event_type'].value_counts().sort_index()
total_events = len(df_sample)
rule = "=" * 60

print("Distribution des √©v√©nements:\n")
print(rule)
for code, n_events in event_counts.items():
    # Codes missing from the mapping are reported as UNKNOWN
    label = EVENT_TYPES.get(code, 'UNKNOWN')
    share = n_events / total_events * 100
    print(f"{code} - {label:20s}: {n_events:>8,} ({share:>5.2f}%)")
print(rule)
print(f"TOTAL: {total_events:,} √©v√©nements")

In [None]:
# Bar chart and pie chart of the event-type distribution.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Look names up with a fallback so a code missing from EVENT_TYPES does not
# raise KeyError; the code is kept in the label so unknown codes stay distinct.
event_names = [EVENT_TYPES.get(e, f'UNKNOWN_{e}') for e in event_counts.index]
colors = sns.color_palette("husl", len(event_counts))

# Bar chart
bars = ax1.bar(event_names, event_counts.values, color=colors, edgecolor='black', linewidth=1.5)
ax1.set_title('Distribution des types d\'√©v√©nements', fontsize=14, fontweight='bold')
ax1.set_xlabel('Type d\'√©v√©nement')
ax1.set_ylabel('Nombre d\'√©v√©nements')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(True, alpha=0.3, axis='y')

# Write the count above each bar
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}',
            ha='center', va='bottom', fontsize=9)

# Pie chart
wedges, texts, autotexts = ax2.pie(
    event_counts.values,
    labels=event_names,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
    textprops={'fontsize': 10}
)
ax2.set_title('Proportion des √©v√©nements', fontsize=14, fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('results/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('results/figures/event_type_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: results/figures/event_type_distribution.png")

### 4.2 Focus sur les événements clés

In [None]:
# One sub-frame per event type of interest.
event_codes = df_sample['event_type']
submit_events = df_sample[event_codes == 0]  # SUBMIT
finish_events = df_sample[event_codes == 4]  # FINISH
fail_events = df_sample[event_codes == 3]    # FAIL
evict_events = df_sample[event_codes == 2]   # EVICT
lost_events = df_sample[event_codes == 6]    # LOST

n_submit = len(submit_events)
n_finish = len(finish_events)
n_fail = len(fail_events)
n_evict = len(evict_events)
n_lost = len(lost_events)

print("√âv√©nements cl√©s:\n")
print(f"üì• SUBMIT (arriv√©es):  {n_submit:>8,} √©v√©nements")
print(f"‚úÖ FINISH (succ√®s):    {n_finish:>8,} √©v√©nements")
print(f"‚ùå FAIL (√©checs):      {n_fail:>8,} √©v√©nements")
print(f"‚è∏Ô∏è  EVICT (pr√©emption): {n_evict:>8,} √©v√©nements")
print(f"üíî LOST (perdus):      {n_lost:>8,} √©v√©nements")

# Rates are event counts expressed as a percentage of the SUBMIT count;
# skipped when the sample holds no SUBMIT event.
if n_submit:
    success_rate = n_finish / n_submit * 100
    failure_rate = n_fail / n_submit * 100
    eviction_rate = n_evict / n_submit * 100

    print(f"\nüìä Statistiques (par rapport aux SUBMIT):")
    print(f"  Taux de succ√®s:    {success_rate:>6.2f}%")
    print(f"  Taux d'√©chec:      {failure_rate:>6.2f}%")
    print(f"  Taux de pr√©emption: {eviction_rate:>6.2f}%")

## 5. Analyse des arrivées de jobs (SUBMIT)

### 5.1 Série temporelle des arrivées

In [None]:
# Job arrivals are the SUBMIT events (code 0), ordered by timestamp.
is_submit = df_sample['event_type'] == 0
arrivals = df_sample[is_submit].copy().sort_values('timestamp')

n_arrivals = len(arrivals)
n_jobs = arrivals['job_id'].nunique()
first_arrival = arrivals['datetime'].min()
last_arrival = arrivals['datetime'].max()

print(f"üì• Arriv√©es de jobs: {n_arrivals:,}")
print(f"   Jobs uniques: {n_jobs:,}")
print(f"   P√©riode: {first_arrival} ‚Üí {last_arrival}")

# Preview a few arrivals
arrivals[['job_id', 'timestamp', 'datetime']].head(10)

In [None]:
# Count arrivals in fixed-width time buckets of 1 minute, 10 minutes and 1 hour.
# The datetime index is built once and shared by the three resamplings.
arrivals_by_time = arrivals.set_index('datetime')
arrivals_1min = arrivals_by_time.resample('1min').size()
arrivals_10min = arrivals_by_time.resample('10min').size()
# Lowercase 'h': the uppercase 'H' alias is deprecated in recent pandas, and
# the notebook's global warnings filter would hide the deprecation notice.
arrivals_1h = arrivals_by_time.resample('1h').size()

print("Statistiques des arriv√©es:\n")

# Same three statistics for every bucket width
for heading, counts in (
    ("Par minute:", arrivals_1min),
    ("\nPar 10 minutes:", arrivals_10min),
    ("\nPar heure:", arrivals_1h),
):
    print(heading)
    print(f"  Moyenne: {counts.mean():.2f}")
    print(f"  M√©diane: {counts.median():.2f}")
    print(f"  Max:     {counts.max():.0f}")

In [None]:
# Arrival time series at three resolutions, one panel each, with the mean of
# each series drawn as a dashed red line.
fig, axes = plt.subplots(3, 1, figsize=(16, 12))

# (series, panel title, line style) for each panel, top to bottom
panels = [
    (arrivals_1min, 'Arriv√©es de jobs par minute',
     dict(linewidth=0.8, color='steelblue', alpha=0.7)),
    (arrivals_10min, 'Arriv√©es de jobs par 10 minutes',
     dict(linewidth=1.2, color='coral', alpha=0.8)),
    (arrivals_1h, 'Arriv√©es de jobs par heure',
     dict(linewidth=1.5, color='mediumseagreen', alpha=0.8, marker='o', markersize=4)),
]

for ax, (series, title, line_style) in zip(axes, panels):
    series.plot(ax=ax, **line_style)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_ylabel('Nombre de jobs')
    ax.grid(True, alpha=0.3)
    mean_rate = series.mean()
    ax.axhline(mean_rate, color='red', linestyle='--',
               label=f'Moyenne: {mean_rate:.1f}', alpha=0.7)
    ax.legend()

# Only the bottom panel gets an explicit x-axis label
axes[-1].set_xlabel('Temps')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('results/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('results/figures/arrivals_temporal_series.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: results/figures/arrivals_temporal_series.png")

### 5.2 Histogramme des arrivées

In [None]:
# Histograms of the per-minute and per-hour arrival counts.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (counts, number of bins, bar colour, panel title), left to right
histograms = [
    (arrivals_1min, 50, 'steelblue', 'Distribution des arriv√©es par minute'),
    (arrivals_1h, 30, 'coral', 'Distribution des arriv√©es par heure'),
]

for ax, (counts, n_bins, fill, title) in zip(axes, histograms):
    # Empty buckets are left out of the histogram itself...
    counts[counts > 0].hist(bins=n_bins, ax=ax, color=fill,
                            edgecolor='black', alpha=0.7)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Nombre d\'arriv√©es')
    ax.set_ylabel('Fr√©quence')
    ax.grid(True, alpha=0.3, axis='y')
    # ...but the mean line is computed over all buckets, empty ones included.
    mean_count = counts.mean()
    ax.axvline(mean_count, color='red', linestyle='--',
               label=f'Moyenne: {mean_count:.1f}', linewidth=2)
    ax.legend()

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('results/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('results/figures/arrivals_histograms.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: results/figures/arrivals_histograms.png")

### 5.3 Pattern journalier et hebdomadaire

In [None]:
# Calendar components of every arrival time.
arrival_times = arrivals['datetime'].dt
arrivals['hour'] = arrival_times.hour
arrivals['day_of_week'] = arrival_times.dayofweek
arrivals['date'] = arrival_times.date

# Arrivals per hour of day
hourly_pattern = arrivals.groupby('hour').size()

# Arrivals per weekday; `days` is indexed by the dayofweek number (0 = Lundi)
days = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
weekly_pattern = arrivals.groupby('day_of_week').size()

print("Pattern journalier (par heure):\n")
for hour_of_day, n_arrivals in hourly_pattern.items():
    print(f"  {hour_of_day:2d}h: {n_arrivals:>6,} arriv√©es")

print("\nPattern hebdomadaire:\n")
for weekday, n_arrivals in weekly_pattern.items():
    print(f"  {days[weekday]:10s}: {n_arrivals:>6,} arriv√©es")

In [None]:
# Bar charts of the hour-of-day and day-of-week arrival patterns.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# A short sample may not cover every hour or weekday. Plot on the complete
# 0-23 / 0-6 axes (missing slots drawn as 0) so the fixed tick labels below
# always match the number of bars; otherwise set_xticklabels raises or the
# bars get the wrong labels.
hourly_full = hourly_pattern.reindex(range(24), fill_value=0)
weekly_full = weekly_pattern.reindex(range(len(days)), fill_value=0)

# Hour-of-day pattern
hourly_full.plot(kind='bar', ax=ax1, color='steelblue',
                 edgecolor='black', width=0.8)
ax1.set_title('Pattern journalier - Arriv√©es par heure',
             fontsize=13, fontweight='bold')
ax1.set_xlabel('Heure de la journ√©e')
ax1.set_ylabel('Nombre total d\'arriv√©es')
ax1.set_xticklabels(hourly_full.index, rotation=0)
ax1.grid(True, alpha=0.3, axis='y')
# The mean stays computed over the observed hours only, as printed earlier
ax1.axhline(hourly_pattern.mean(), color='red', linestyle='--',
           label=f'Moyenne: {hourly_pattern.mean():.0f}', alpha=0.7)
ax1.legend()

# Day-of-week pattern
weekly_full.plot(kind='bar', ax=ax2, color='coral',
                 edgecolor='black', width=0.8)
ax2.set_title('Pattern hebdomadaire - Arriv√©es par jour',
             fontsize=13, fontweight='bold')
ax2.set_xlabel('Jour de la semaine')
ax2.set_ylabel('Nombre total d\'arriv√©es')
ax2.set_xticklabels(days, rotation=45, ha='right')
ax2.grid(True, alpha=0.3, axis='y')
ax2.axhline(weekly_pattern.mean(), color='red', linestyle='--',
           label=f'Moyenne: {weekly_pattern.mean():.0f}', alpha=0.7)
ax2.legend()

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('results/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('results/figures/arrivals_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: results/figures/arrivals_patterns.png")

### 5.4 Heatmap jour × heure

In [None]:
# Build a date x hour matrix of arrival counts (0 where nothing arrived).
daily_hourly = arrivals.groupby(['date', 'hour']).size().reset_index(name='count')
pivot_table = daily_hourly.pivot_table(values='count',
                                       index='date',
                                       columns='hour',
                                       fill_value=0)

# Keep at most the first 30 days so the rows stay readable
n_days = min(30, len(pivot_table))
pivot_subset = pivot_table.iloc[:n_days]

# Heatmap
fig, ax = plt.subplots(figsize=(16, 10))

sns.heatmap(pivot_subset,
           cmap='YlOrRd',
           ax=ax,
           cbar_kws={'label': 'Nombre d\'arriv√©es'},
           linewidths=0.5,
           linecolor='white')

ax.set_title(f'Heatmap des arriv√©es (jour √ó heure) - {n_days} premiers jours',
            fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Heure de la journ√©e', fontsize=12)
ax.set_ylabel('Date', fontsize=12)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('results/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('results/figures/arrivals_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: results/figures/arrivals_heatmap.png")

## 6. Analyse du cycle de vie des jobs

### 6.1 Tracer quelques jobs individuels

In [None]:
# Print the event history of the first ten jobs that have more than one event.
job_event_counts = df_sample.groupby('job_id').size()
jobs_with_multiple_events = job_event_counts[job_event_counts > 1].index[:10]

print(f"Jobs avec plusieurs √©v√©nements (10 premiers):\n")

for job_id in jobs_with_multiple_events:
    job_events = df_sample[df_sample['job_id'] == job_id].sort_values('timestamp')

    print(f"\nJob {job_id}:")
    # Iterate the two needed columns directly instead of iterrows(), and fall
    # back to 'UNKNOWN' for codes missing from EVENT_TYPES rather than raising
    # KeyError.
    for moment, code in zip(job_events['datetime'], job_events['event_type']):
        event_name = EVENT_TYPES.get(code, 'UNKNOWN')
        print(f"  {moment} - {event_name}")

    # Time between the job's first and last event in the sample
    duration = job_events['datetime'].max() - job_events['datetime'].min()
    print(f"  Dur√©e totale: {duration}")

### 6.2 Statistiques des transitions

In [None]:
# For every event, find the next event of the same job (in timestamp order).
df_sorted = df_sample.sort_values(['job_id', 'timestamp'])
df_sorted['next_event'] = df_sorted.groupby('job_id')['event_type'].shift(-1)

# The last event of each job has no successor (NaN) and is dropped.
# .copy() makes `transitions` an independent frame, so the column assignment
# below is not a chained assignment on a derived object.
transitions = df_sorted.dropna(subset=['next_event']).copy()
transitions['next_event'] = transitions['next_event'].astype(int)

# Row-normalised transition table: each row holds the percentage of each
# following event type, given the current event type.
transition_matrix = pd.crosstab(
    transitions['event_type'].map(EVENT_TYPES),
    transitions['next_event'].map(EVENT_TYPES),
    normalize='index'
) * 100

print("Matrice de transition (% de probabilit√©):\n")
print(transition_matrix.round(1))

In [None]:
# Annotated heatmap of the event transition percentages.
fig, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(transition_matrix,
           annot=True,
           fmt='.1f',
           cmap='YlGnBu',
           ax=ax,
           cbar_kws={'label': 'Probabilit√© (%)'},
           linewidths=1,
           linecolor='white')

ax.set_title('Matrice de transition des √©v√©nements',
            fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('√âv√©nement suivant', fontsize=12)
ax.set_ylabel('√âv√©nement actuel', fontsize=12)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('results/figures').mkdir(parents=True, exist_ok=True)
plt.savefig('results/figures/transition_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: results/figures/transition_matrix.png")

## 7. Chargement de plusieurs fichiers

In [None]:
def load_multiple_files(files, max_files=10, rows_per_file=50000):
    """
    Load the head of several trace files and concatenate them into one frame.

    Each file is read with ``load_sample``; files that yield an empty frame
    are left out. A ``datetime`` column (from ``timestamp``, microseconds) and
    an ``event_name`` column (from ``event_type`` via ``EVENT_TYPES``) are
    added to the result. Progress and a short summary are printed.

    Args:
        files: Sequence of file paths; only the first ``max_files`` are read.
        max_files: Upper bound on the number of files to read.
        rows_per_file: Maximum number of rows read from each file.

    Returns:
        A DataFrame with columns ``timestamp``, ``job_id``, ``event_type``,
        ``datetime`` and ``event_name``. When no file could be loaded, an
        empty DataFrame with those columns is returned instead of letting
        ``pd.concat`` fail on an empty list.
    """
    selected = files[:max_files]
    # Report the number of files actually read, which may be below max_files
    n_selected = len(selected)
    all_data = []

    print(f"Chargement de {n_selected} fichiers ({rows_per_file:,} lignes par fichier)...\n")

    for i, filepath in enumerate(selected, 1):
        print(f"[{i}/{n_selected}] {filepath.name}")

        df = load_sample(filepath, n_rows=rows_per_file)

        if not df.empty:
            all_data.append(df)
            print(f"  ‚úì {len(df):,} lignes charg√©es")

    # Consolidate
    print("\nConsolidation...")
    if all_data:
        result = pd.concat(all_data, ignore_index=True)

        # Derived columns: datetime from the timestamps, name from the code
        result['datetime'] = pd.to_datetime(result['timestamp'], unit='us', errors='coerce')
        result['event_name'] = result['event_type'].map(EVENT_TYPES)
    else:
        # Nothing was loaded: return a typed, empty frame with the same
        # columns so downstream column access keeps working.
        result = pd.DataFrame({
            'timestamp': pd.Series(dtype='int64'),
            'job_id': pd.Series(dtype='int64'),
            'event_type': pd.Series(dtype='int32'),
            'datetime': pd.Series(dtype='datetime64[ns]'),
            'event_name': pd.Series(dtype='object'),
        })

    print(f"\n‚úÖ Total: {len(result):,} √©v√©nements")
    print(f"   Jobs uniques: {result['job_id'].nunique():,}")
    print(f"   P√©riode: {result['datetime'].min()} ‚Üí {result['datetime'].max()}")

    return result

# Load up to 10 files, reading at most 50,000 rows from each
df_large = load_multiple_files(files, max_files=10, rows_per_file=50000)

In [None]:
# Summary of the multi-file dataset: size, unique jobs, covered period.
rule = "=" * 60
n_events = len(df_large)
earliest = df_large['datetime'].min()
latest = df_large['datetime'].max()

print("Statistiques du dataset √©largi:\n")
print(rule)
print(f"Total √©v√©nements:     {n_events:>12,}")
print(f"Jobs uniques:         {df_large['job_id'].nunique():>12,}")
print(f"P√©riode:              {earliest}")
print(f"                   ‚Üí {latest}")
print(f"Dur√©e:                {latest - earliest}")
print(rule)

# Event counts by name, most frequent first, with their share of all events
print("\nDistribution des √©v√©nements:\n")
event_dist = df_large['event_name'].value_counts()
for label, n_of_type in event_dist.items():
    share = n_of_type / n_events * 100
    print(f"  {label:20s}: {n_of_type:>12,} ({share:>5.2f}%)")

## 8. Sauvegarde des arrivées pour analyse ultérieure

In [None]:
# Extract the SUBMIT events (code 0) of the multi-file dataset, ordered by time.
arrivals_large = df_large[df_large['event_type'] == 0].copy()
arrivals_large = arrivals_large.sort_values('timestamp')

# Save to CSV. to_csv does not create missing directories, so make sure the
# target folder exists first.
output_path = 'data/processed/2011_arrivals_sample.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
arrivals_large[['job_id', 'timestamp', 'datetime']].to_csv(output_path, index=False)

print(f"‚úÖ Arriv√©es sauvegard√©es: {output_path}")
print(f"   {len(arrivals_large):,} arriv√©es")
print(f"   {arrivals_large['job_id'].nunique():,} jobs uniques")

## 9. Résumé et conclusions

In [None]:
# Final recap: dataset size, arrival rates, event mix, generated artefacts
# and next steps.
rule = "=" * 70
print(rule)
print("R√âSUM√â DE L'EXPLORATION")
print(rule)

print("\nüìä Dataset:")
print(f"  ‚Ä¢ Fichiers analys√©s: {len(files)} disponibles")
print(f"  ‚Ä¢ √âv√©nements charg√©s: {len(df_large):,}")
print(f"  ‚Ä¢ Jobs uniques: {df_large['job_id'].nunique():,}")

print("\nüì• Arriv√©es (SUBMIT):")
# Hourly arrival counts. Lowercase 'h': the uppercase 'H' alias is deprecated
# in recent pandas, and the global warnings filter would hide the notice.
arrivals_stats = arrivals_large.set_index('datetime').resample('1h').size()
print(f"  ‚Ä¢ Total arriv√©es: {len(arrivals_large):,}")
print(f"  ‚Ä¢ Moyenne/heure: {arrivals_stats.mean():.2f}")
print(f"  ‚Ä¢ Max/heure: {arrivals_stats.max():.0f}")

print("\nüéØ Types d'√©v√©nements:")
for event_name in ['SUBMIT', 'FINISH', 'FAIL', 'EVICT']:
    count = (df_large['event_name'] == event_name).sum()
    pct = count / len(df_large) * 100
    print(f"  ‚Ä¢ {event_name:10s}: {count:>10,} ({pct:>5.2f}%)")

print("\nüíæ Fichiers g√©n√©r√©s:")
for generated in (
    "results/figures/event_type_distribution.png",
    "results/figures/arrivals_temporal_series.png",
    "results/figures/arrivals_histograms.png",
    "results/figures/arrivals_patterns.png",
    "results/figures/arrivals_heatmap.png",
    "results/figures/transition_matrix.png",
    "data/processed/2011_arrivals_sample.csv",
):
    print(f"  ‚Ä¢ {generated}")

print("\nüöÄ Prochaines √©tapes:")
for step in (
    "  1. Charger tous les fichiers (500) pour dataset complet",
    "  2. Cr√©er les s√©quences temporelles pour le VAE",
    "  3. Analyser l'incertitude (bo√Ætes, polytopes, Wasserstein)",
    "  4. Entra√Æner le mod√®le VAE-LSTM",
):
    print(step)
print(rule)