# Fichier construisant les figures du rapport

> Rédigé par Yannis CHUPIN et Nathan Bourdere Andreou

## 1. Test avec un échantillon à 10% prélevé avec la méthode `"vertex-cut"`

In [None]:
#!/usr/bin/env python3
"""
Notebook d'analyse des résultats PageRank
Génère des visualisations et statistiques pour le rapport
"""

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
from collections import defaultdict
import glob

# Global plotting style: seaborn "whitegrid" theme, then the default
# figure size and font size applied to every figure created afterwards.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# ==========================================
# FONCTIONS UTILITAIRES
# ==========================================

def load_json_file(filepath):
    """Load a JSON file and return the decoded object.

    The file is read as UTF-8 (the encoding JSON interchange uses) so the
    result does not depend on the platform's default locale encoding.

    Args:
        filepath: Path (str or path-like) of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened,
        decoded or parsed (a warning is printed in that case).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    # OSError: missing/unreadable file.
    # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
    except (OSError, ValueError) as e:
        print(f"‚ö†Ô∏è  Erreur lors du chargement de {filepath}: {e}")
        return None

def group_files_by_prefix(directory, extension="json"):
    """Group the files found under *directory* by file-name prefix.

    The search is recursive. The grouping key is the file stem with its
    final character removed when that character is a digit; otherwise the
    whole stem is the key (a group of its own).
    Example: file1.json, file2.json, file3.json -> group "file".

    Args:
        directory: Root directory to search.
        extension: File extension to match, without the leading dot.

    Returns:
        A plain dict mapping each prefix to the sorted list of matching
        ``Path`` objects.
    """
    buckets = defaultdict(list)

    for path in Path(directory).rglob(f"*.{extension}"):
        name = path.stem  # file name without its extension
        if not name:
            continue
        # Only ONE trailing digit is stripped (e.g. "run12" -> "run1").
        key = name[:-1] if name[-1].isdigit() else name
        buckets[key].append(path)

    # Sort the members of every group while converting to a regular dict.
    return {key: sorted(paths) for key, paths in buckets.items()}

def load_metadata_group(filepaths):
    """Load a group of metadata files and return them as a list of dicts.

    Each file is read with ``load_json_file``; files that fail to load or
    that decode to a falsy value are skipped. Every kept record is tagged
    with a ``'source_file'`` entry holding the file's name.

    Args:
        filepaths: Iterable of path objects exposing a ``.name`` attribute.

    Returns:
        The list of loaded metadata records, in input order.
    """
    loaded = []
    for path in filepaths:
        record = load_json_file(path)
        if not record:
            continue
        # Remember which file the run came from.
        record['source_file'] = path.name
        loaded.append(record)
    return loaded

def load_csv_with_fallback(filepath):
    """Load a rank CSV, coping with files that have no header row.

    The file is first read with the first line taken as the header. It is
    re-read as header-less, with columns named ``node`` and ``rank``, when:

    * only one column was parsed (original fallback, kept as is), or
    * exactly two columns were parsed, they are not already named
      ``node``/``rank``, and the second column label parses as a number,
      which means the first data row was taken for the header.

    Any other layout is returned exactly as pandas parsed it.

    Args:
        filepath: Path of the CSV file.

    Returns:
        A ``DataFrame``, or ``None`` if the file could not be read (a
        warning is printed in that case).
    """
    expected = ['node', 'rank']

    def _looks_numeric(label):
        """Return True when a column label is really a numeric data value."""
        try:
            float(label)
        except (TypeError, ValueError):
            return False
        return True

    try:
        # First attempt: assume the file has a header row.
        df = pd.read_csv(filepath)

        if df.shape[1] == 1:
            headerless = True
        elif df.shape[1] == 2 and not set(expected).issubset(df.columns):
            headerless = _looks_numeric(df.columns[1])
        else:
            headerless = False

        if headerless:
            df = pd.read_csv(filepath, header=None, names=expected)
        return df
    except Exception as e:  # best-effort loader: report the error and carry on
        print(f"‚ö†Ô∏è  Erreur lors du chargement de {filepath}: {e}")
        return None

# ==========================================
# VISUALISATIONS
# ==========================================

def plot_execution_times(groups_data, title="Temps d'ex√©cution total"):
    """Draw one box plot of total execution time per group.

    Args:
        groups_data: Mapping ``group name -> list of metadata dicts``; each
            dict must contain ``'total_time_seconds'``.
        title: Title of the axes.

    Returns:
        The matplotlib ``Figure``.

    Raises:
        KeyError: If a metadata dict has no ``'total_time_seconds'`` entry.
    """
    fig, ax = plt.subplots(figsize=(14, 6))

    labels = list(groups_data)
    samples = [
        [run['total_time_seconds'] for run in runs]
        for runs in groups_data.values()
    ]

    if samples:
        # Tick labels are set on the axes below instead of through the
        # boxplot `labels=` keyword, which newer Matplotlib deprecates.
        bp = ax.boxplot(samples, patch_artist=True,
                        showmeans=True, meanline=True)

        # Give every box its own colour.
        colors = plt.cm.Set3(np.linspace(0, 1, len(samples)))
        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)

        # Boxes are drawn at positions 1..N by default.
        ax.set_xticks(range(1, len(labels) + 1))
        ax.set_xticklabels(labels, rotation=45, ha='right')

    ax.set_ylabel('Temps (secondes)')
    ax.set_xlabel('Configuration')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

    return fig

def plot_iteration_times_candlestick(groups_data, title="Temps par it√©ration (Candlestick)"):
    """Draw a candlestick-like chart of per-iteration times, one row per group.

    For every iteration the chart shows the min-max range (thin line), the
    Q1-Q3 range (bar) and the mean (red dot) across the runs of the group.
    When runs of one group have different numbers of iterations, only the
    iterations present in every run are plotted.

    Args:
        groups_data: Mapping ``group name -> list of metadata dicts``; each
            dict must contain an ``'iteration_times'`` sequence.
        title: Figure-level title.

    Returns:
        The matplotlib ``Figure``.

    Raises:
        KeyError: If a metadata dict has no ``'iteration_times'`` entry.
    """
    # plt.subplots needs at least one row, even when there is nothing to plot.
    n_rows = max(len(groups_data), 1)
    fig, axes = plt.subplots(n_rows, 1,
                             figsize=(14, 4 * n_rows),
                             squeeze=False)
    if not groups_data:
        axes[0, 0].set_visible(False)

    for idx, (group_name, metadata_list) in enumerate(groups_data.items()):
        ax = axes[idx, 0]

        series = [m['iteration_times'] for m in metadata_list]
        # Common number of iterations across the runs of this group.
        num_iterations = min((len(s) for s in series), default=0)
        iterations = np.arange(1, num_iterations + 1)

        if num_iterations:
            # Rows = runs, columns = iterations.
            times = np.asarray([s[:num_iterations] for s in series], dtype=float)
            mins = times.min(axis=0)
            maxs = times.max(axis=0)
            means = times.mean(axis=0)
            q1s, q3s = np.percentile(times, [25, 75], axis=0)

            # Min-max line
            ax.vlines(iterations, mins, maxs, colors='k', linewidth=1, alpha=0.5)
            # Q1-Q3 box
            ax.bar(iterations, q3s - q1s, bottom=q1s, width=0.6,
                   alpha=0.6, color='steelblue', edgecolor='black')
            # Mean
            ax.plot(iterations, means, 'ro', markersize=6)

        ax.set_xlabel('It√©ration')
        ax.set_ylabel('Temps (secondes)')
        ax.set_title(f'{group_name} - Temps par it√©ration')
        ax.grid(True, alpha=0.3)
        ax.set_xticks(iterations)

    # The `title` argument was previously accepted but never displayed.
    fig.suptitle(title, fontsize=14, y=1.00)
    fig.tight_layout()
    return fig

def plot_metrics_comparison(groups_data, title="Comparaison des m√©triques"):
    """Draw a 2x2 grid of bar charts comparing four metrics across groups.

    Each bar is the mean of the metric over the runs of a group, with the
    standard deviation as error bar and the mean printed above the bar. A
    metric missing from a run counts as 0.

    Args:
        groups_data: Mapping ``group name -> list of metadata dicts``.
        title: Figure-level title.

    Returns:
        The matplotlib ``Figure``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # (metadata key, axis label), laid out row by row on the grid.
    panels = [
        ('num_nodes', 'Nombre de n≈ìuds'),
        ('num_links', 'Nombre de liens'),
        ('total_time_seconds', 'Temps total (s)'),
        ('avg_iteration_time_seconds', 'Temps moyen/it√©ration (s)'),
    ]

    group_names = list(groups_data)
    positions = np.arange(len(group_names))

    for (key, label), ax in zip(panels, axes.flat):
        per_group = [
            [run.get(key, 0) for run in runs]
            for runs in groups_data.values()
        ]
        averages = [np.mean(values) for values in per_group]
        spreads = [np.std(values) for values in per_group]

        bars = ax.bar(positions, averages, yerr=spreads,
                      capsize=5, alpha=0.7, color='steelblue', edgecolor='black')

        # Print the mean value on top of each bar.
        for bar, avg in zip(bars, averages):
            ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                    f'{avg:.2f}',
                    ha='center', va='bottom', fontsize=9)

        ax.set_ylabel(label)
        ax.set_xlabel('Configuration')
        ax.set_title(label)
        ax.set_xticks(positions)
        ax.set_xticklabels(group_names, rotation=45, ha='right')
        ax.grid(True, alpha=0.3, axis='y')

    plt.suptitle(title, fontsize=14, y=1.00)
    plt.tight_layout()
    return fig

def plot_top_pages(csv_groups, top_n=20, title="Top Pages par PageRank"):
    """Draw a horizontal bar chart of the top-ranked pages for each group.

    Ranks are averaged per node over all CSV files of a group. CSV files
    that cannot be loaded, or that lack the ``node``/``rank`` columns, are
    ignored; a group with no usable file gets a hidden (blank) subplot.

    Args:
        csv_groups: Mapping ``group name -> list of CSV paths``.
        top_n: Number of highest-ranked nodes to display per group.
        title: Figure-level title.

    Returns:
        The matplotlib ``Figure``.
    """
    required = {'node', 'rank'}

    # plt.subplots needs at least one row, even when there is nothing to plot.
    n_rows = max(len(csv_groups), 1)
    fig, axes = plt.subplots(n_rows, 1,
                             figsize=(14, 6 * n_rows),
                             squeeze=False)
    if not csv_groups:
        axes[0, 0].set_visible(False)

    for idx, (group_name, csv_files) in enumerate(csv_groups.items()):
        ax = axes[idx, 0]

        # Keep only the frames that really are node/rank tables; other CSVs
        # in the results directory would otherwise raise KeyError below.
        frames = []
        for csv_file in csv_files:
            df = load_csv_with_fallback(csv_file)
            if df is not None and required.issubset(df.columns):
                frames.append(df)

        if not frames:
            ax.set_visible(False)
            continue

        # Average the rank of each node across runs, then keep the best ones.
        merged = pd.concat(frames).groupby('node')['rank'].mean().reset_index()
        merged = merged.sort_values('rank', ascending=False).head(top_n)

        # Plot
        colors = plt.cm.viridis(np.linspace(0, 1, len(merged)))
        bars = ax.barh(range(len(merged)), merged['rank'], color=colors)
        ax.set_yticks(range(len(merged)))
        ax.set_yticklabels(merged['node'], fontsize=9)
        ax.set_xlabel('PageRank Score')
        ax.set_title(f'{group_name} - Top {top_n} Pages')
        ax.invert_yaxis()  # best page at the top
        ax.grid(True, alpha=0.3, axis='x')

        # Print the score at the end of each bar.
        for bar, val in zip(bars, merged['rank']):
            ax.text(bar.get_width(), bar.get_y() + bar.get_height()/2.,
                    f' {val:.4f}',
                    ha='left', va='center', fontsize=8)

    # The `title` argument was previously accepted but never displayed.
    fig.suptitle(title, fontsize=14, y=1.00)
    fig.tight_layout()
    return fig

def plot_rank_distribution(csv_groups, title="Distribution des PageRank"):
    """Draw a histogram and a box plot of the rank distribution per group.

    All ranks of all CSV files of a group are pooled. Files that cannot be
    loaded or that have no ``rank`` column are ignored, as are missing (NaN)
    rank values; a group left without any value gets hidden subplots.

    Args:
        csv_groups: Mapping ``group name -> list of CSV paths``.
        title: Figure-level title.

    Returns:
        The matplotlib ``Figure``.
    """
    # plt.subplots needs at least one row, even when there is nothing to plot.
    n_rows = max(len(csv_groups), 1)
    fig, axes = plt.subplots(n_rows, 2,
                             figsize=(16, 5 * n_rows),
                             squeeze=False)
    if not csv_groups:
        for empty_ax in axes[0]:
            empty_ax.set_visible(False)

    for idx, (group_name, csv_files) in enumerate(csv_groups.items()):
        ax1, ax2 = axes[idx]

        # Pool the ranks of every usable file of the group.
        chunks = []
        for csv_file in csv_files:
            df = load_csv_with_fallback(csv_file)
            if df is not None and 'rank' in df.columns:
                # NaN values would make the histogram range undefined.
                chunks.append(df['rank'].dropna().to_numpy())
        all_ranks = np.concatenate(chunks) if chunks else np.array([])

        if all_ranks.size == 0:
            ax1.set_visible(False)
            ax2.set_visible(False)
            continue

        # Histogram (log-scaled counts)
        ax1.hist(all_ranks, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
        ax1.set_xlabel('PageRank Score')
        ax1.set_ylabel('Fr√©quence')
        ax1.set_title(f'{group_name} - Distribution (Histogramme)')
        ax1.set_yscale('log')
        ax1.grid(True, alpha=0.3)

        # Box plot (vertical is the default orientation). The tick label is
        # set on the axes rather than through the deprecated `labels=` keyword.
        bp = ax2.boxplot([all_ranks], patch_artist=True, showmeans=True)
        bp['boxes'][0].set_facecolor('lightblue')
        ax2.set_xticks([1])
        ax2.set_xticklabels([group_name])
        ax2.set_ylabel('PageRank Score')
        ax2.set_title(f'{group_name} - Distribution (BoxPlot)')
        ax2.grid(True, alpha=0.3)

    # The `title` argument was previously accepted but never displayed.
    fig.suptitle(title, fontsize=14, y=1.00)
    fig.tight_layout()
    return fig

def generate_summary_table(groups_data):
    """Build a summary DataFrame with one row of statistics per group.

    Node and link counts, total time and average iteration time are shown
    as "mean, separator, standard deviation" strings; the minimum and
    maximum total time are shown with two decimals.

    Args:
        groups_data: Mapping ``group name -> list of metadata dicts`` with
            the keys ``total_time_seconds``, ``num_nodes``, ``num_links``
            and ``avg_iteration_time_seconds``.

    Returns:
        A ``DataFrame`` with one row per group.
    """
    def spread(values, digits):
        # Mean and standard deviation rendered with `digits` decimals.
        return f"{np.mean(values):.{digits}f} ¬± {np.std(values):.{digits}f}"

    rows = []
    for config, runs in groups_data.items():
        total_times = [run['total_time_seconds'] for run in runs]
        node_counts = [run['num_nodes'] for run in runs]
        link_counts = [run['num_links'] for run in runs]
        iter_times = [run['avg_iteration_time_seconds'] for run in runs]

        row = {}
        row['Configuration'] = config
        row['Nombre de runs'] = len(runs)
        row['N≈ìuds'] = spread(node_counts, 0)
        row['Liens'] = spread(link_counts, 0)
        row['Temps total (s)'] = spread(total_times, 2)
        row['Temps moyen/iter (s)'] = spread(iter_times, 2)
        row['Min temps (s)'] = f"{np.min(total_times):.2f}"
        row['Max temps (s)'] = f"{np.max(total_times):.2f}"
        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# ==========================================
# CONFIGURATION
# ==========================================

# Directory containing the result files; main() searches it for the JSON
# metadata and CSV files and also writes the summary table and plots there.
RESULTS_DIR = "./sample/"  # Adjust to your own directory layout


# ==========================================
# MAIN EXECUTION
# ==========================================

def _save_figure(fig, output_dir, filename, label, figures):
    """Save *fig* as a 300-dpi PNG in *output_dir* and record it in *figures*."""
    fig.savefig(Path(output_dir) / filename, dpi=300, bbox_inches='tight')
    figures.append((label, filename))
    print(f"   ‚úì {filename}")


def main():
    """Run the whole analysis on the files found in ``RESULTS_DIR``.

    Steps: group and load the JSON metadata, group the CSV rank files,
    print and save the summary table, then generate and save every plot.
    All outputs (``summary_table.csv`` and the PNG files) are written into
    ``RESULTS_DIR``. Returns early, without writing anything, when no valid
    metadata is found.
    """
    output_dir = Path(RESULTS_DIR)
    summary_path = output_dir / "summary_table.csv"

    print("=" * 60)
    print("ANALYSE DES R√âSULTATS PAGERANK")
    print("=" * 60)

    # 1. Detect and group the JSON files
    print(f"\nüìÇ Recherche des fichiers dans : {RESULTS_DIR}")
    json_groups = group_files_by_prefix(RESULTS_DIR, "json")
    print(json_groups)

    print(f"\n‚úÖ {len(json_groups)} groupes de m√©tadonn√©es d√©tect√©s :")
    for group_name, files in json_groups.items():
        print(f"   - {group_name}: {len(files)} fichier(s)")

    # 2. Load the metadata
    print("\nüìä Chargement des m√©tadonn√©es...")
    groups_data = {}
    for group_name, files in json_groups.items():
        metadata_list = load_metadata_group(files)
        if metadata_list:
            groups_data[group_name] = metadata_list
            print(f"   ‚úì {group_name}: {len(metadata_list)} run(s) charg√©(s)")

    if not groups_data:
        print("‚ùå Aucune m√©tadonn√©e valide trouv√©e !")
        return

    # 3. Detect and group the CSV files
    print(f"\nüìÇ Recherche des CSV...")
    # The summary table written in step 5 lives in the same directory; on a
    # re-run it must not be mistaken for a rank file (it has no node/rank
    # columns and would break the rank plots).
    csv_groups = {}
    for group_name, files in group_files_by_prefix(RESULTS_DIR, "csv").items():
        rank_files = [f for f in files
                      if Path(f).resolve() != summary_path.resolve()]
        if rank_files:
            csv_groups[group_name] = rank_files
    print(f"‚úÖ {len(csv_groups)} groupes de CSV d√©tect√©s")

    # 4. Build the summary table
    print("\nüìã G√©n√©ration du tableau r√©capitulatif...")
    summary_df = generate_summary_table(groups_data)
    print("\n" + "=" * 60)
    print("TABLEAU R√âCAPITULATIF")
    print("=" * 60)
    print(summary_df.to_string(index=False))
    print("=" * 60)

    # 5. Save the table
    summary_df.to_csv(summary_path, index=False)
    print(f"\nüíæ Tableau sauvegard√© : {RESULTS_DIR}/summary_table.csv")

    # 6. Generate the visualisations
    print("\nüìä G√©n√©ration des visualisations...")

    figures = []

    # Execution times
    fig1 = plot_execution_times(groups_data,
                                title="Temps d'ex√©cution total par configuration")
    _save_figure(fig1, output_dir, "plot_execution_times.png",
                 "Temps d'ex√©cution", figures)

    # Per-iteration candlestick
    fig2 = plot_iteration_times_candlestick(groups_data,
                                            title="Temps par it√©ration (Candlestick)")
    _save_figure(fig2, output_dir, "plot_iteration_candlestick.png",
                 "Temps par it√©ration", figures)

    # Metrics comparison
    fig3 = plot_metrics_comparison(groups_data,
                                   title="Comparaison des m√©triques entre configurations")
    _save_figure(fig3, output_dir, "plot_metrics_comparison.png",
                 "M√©triques", figures)

    # Top pages and rank distribution (only when CSV files are available)
    if csv_groups:
        fig4 = plot_top_pages(csv_groups, top_n=20,
                              title="Top 20 Pages par PageRank")
        _save_figure(fig4, output_dir, "plot_top_pages.png",
                     "Top Pages", figures)

        fig5 = plot_rank_distribution(csv_groups,
                                      title="Distribution des scores PageRank")
        _save_figure(fig5, output_dir, "plot_rank_distribution.png",
                     "Distribution", figures)

    print(f"\n‚úÖ {len(figures)} visualisations g√©n√©r√©es dans : {RESULTS_DIR}/")
    print("\n" + "=" * 60)
    print("ANALYSE TERMIN√âE")
    print("=" * 60)

    # Show the figures (optional; keep commented out in batch mode)
    # plt.show()

# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()