In [None]:
# -*- coding: utf-8 -*-
# Plotting dependency: matplotlib (do not install seaborn)
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import rcParams
# Keep font settings for possible Chinese display (if needed), can be removed if not required
rcParams['font.sans-serif'] = ['SimHei']
rcParams['axes.unicode_minus'] = False

# ============= Configuration Area =============
# Directory holding the per-case input CSVs
# (<case>_node_attributes.csv, <case>_schedule_final.csv, <case>_buf_lifecycle.csv).
BASE_DIR = Path("./A1")  # Change to Path(".") or an absolute path if your data lives elsewhere
# Candidate case names. Only the ones whose three CSVs all exist under BASE_DIR
# are processed further down; edit this list to add or remove cases.
DEFAULT_CASES = [
    "Conv_Case0", "Conv_Case1",
    "Matmul_Case0", "Matmul_Case1",
    "FlashAttention_Case0", "FlashAttention_Case1"
]

# Directory for output images and summary CSV (created on first run if missing)
OUT_DIR = Path("./plots"); OUT_DIR.mkdir(exist_ok=True)

# ============= Utility Functions =============
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column names are canonicalised.

    Every column name has surrounding whitespace removed, is lower-cased,
    and has inner spaces replaced by underscores. The input frame is left
    untouched.
    """
    def _canonical(name):
        return name.strip().lower().replace(" ", "_")

    renamed = df.copy()
    renamed.columns = list(map(_canonical, renamed.columns))
    return renamed

def pick(df: pd.DataFrame, names):
    """Resolve one column of ``df`` from a list of candidate names.

    The candidates are lower-cased first. An exact match wins, tried in the
    order the candidates were given. If none matches exactly, the first
    column (in ``df`` column order) that contains any candidate as a
    substring is returned. Raises ``KeyError`` when nothing matches.
    """
    wanted = [candidate.lower() for candidate in names]

    # Pass 1: exact match, honouring candidate priority.
    exact = next((w for w in wanted if w in df.columns), None)
    if exact is not None:
        return exact

    # Pass 2: substring match, honouring column order.
    for column in df.columns:
        if any(w in column for w in wanted):
            return column

    raise KeyError(f"None of {wanted} found in columns: {df.columns.tolist()}")

def load_case(case_name: str, base: Path):
    """Read the three CSV tables of one case from ``base``.

    Returns ``(nodes, sched, life)``: the node-attribute, final-schedule and
    buffer life-cycle tables, each passed through ``normalize_cols``.
    """
    file_names = (
        f"{case_name}_node_attributes.csv",
        f"{case_name}_schedule_final.csv",
        f"{case_name}_buf_lifecycle.csv",
    )
    nodes, sched, life = (
        normalize_cols(pd.read_csv(base / file_name)) for file_name in file_names
    )
    return nodes, sched, life

def schedule_pos_map(sched: pd.DataFrame):
    """Map each node id in the schedule to its 0-based row position.

    The node-id column is located with ``pick``. If an id occurs more than
    once, the position of its last occurrence is kept.
    """
    node_col = pick(sched, ["nodeid","id","node_id"])
    positions = {}
    for step, node in enumerate(sched[node_col].tolist()):
        positions[int(node)] = step
    return positions

def compute_peaks_and_lifetimes(nodes, sched, life):
    """Compute per-cache-level peak residency and per-buffer lifetimes.

    Args:
        nodes: Normalised node-attribute table. Its ALLOC rows supply, per
            buffer id, the buffer size, the cache label and the id of the
            allocating node.
        sched: Normalised schedule table; the row order defines the step
            index of every node (see ``schedule_pos_map``).
        life: Normalised buffer life-cycle table holding, per buffer, the
            buffer id, the allocating node and the freeing node.

    Returns:
        ``(peaks, life2)`` where ``peaks`` maps each cache label to the
        maximum number of bytes simultaneously resident in it, and ``life2``
        is a copy of ``life`` extended with ``alloc_idx``, ``free_idx``,
        ``size``, ``cache``, ``alloc_node`` and ``lifetime_steps``.

    A buffer is counted as resident from its alloc step through its free
    step inclusive. Buffers whose alloc or free node does not appear in the
    schedule get NaN indices and a NaN ``lifetime_steps``; they are reported
    with a warning and left out of the peak scan instead of aborting it.
    """
    import warnings  # local import: only needed for the diagnostic below

    id_col    = pick(nodes, ["id","nodeid","node_id"])
    op_col    = pick(nodes, ["op","opcode","operation"])
    buf_col   = pick(nodes, ["bufid","buf_id","buffer","buffer_id"])
    size_col  = pick(nodes, ["size","bytes"])
    cache_col = pick(nodes, ["type","cache_type","level","cachelevel"])

    # Per-buffer attributes come from the ALLOC node that creates the buffer.
    alloc_nodes = nodes[nodes[op_col].str.upper()=="ALLOC"].copy()
    alloc_nodes[buf_col] = alloc_nodes[buf_col].astype(int)
    buf_map = (alloc_nodes
               .set_index(buf_col)[[size_col, cache_col, id_col]]
               .rename(columns={size_col:"size", cache_col:"cache", id_col:"alloc_node"}))

    pos = schedule_pos_map(sched)

    life_buf  = pick(life, ["bufid","buf_id"])
    life_alloc= pick(life, ["allocnode","alloc_node"])
    life_free = pick(life, ["freenode","free_node"])

    life2 = life.copy()
    life2[life_buf] = life2[life_buf].astype(int)
    # .map() leaves NaN for node ids that are not part of the schedule.
    life2["alloc_idx"] = life2[life_alloc].astype(int).map(pos)
    life2["free_idx"]  = life2[life_free].astype(int).map(pos)
    life2 = life2.merge(buf_map, left_on=life_buf, right_index=True, how="left")

    # Only buffers with both endpoints scheduled can take part in the scan;
    # int(NaN) would otherwise raise ValueError in the loop below.
    scheduled = life2["alloc_idx"].notna() & life2["free_idx"].notna()
    n_unscheduled = int((~scheduled).sum())
    if n_unscheduled:
        warnings.warn(
            f"{n_unscheduled} buffer(s) reference an alloc/free node that is absent "
            "from the schedule; they are excluded from the peak computation.",
            stacklevel=2,
        )

    # Event scanning to calculate layered peaks.
    # Each event is (step, rank, delta); rank 0 = alloc, rank 1 = free, so a
    # free at step t is applied after every alloc at step t and before any
    # alloc at step t + 1 (the buffer is still resident during its free step).
    events = {}
    for _, r in life2[scheduled].iterrows():
        size = int(r["size"]) if pd.notna(r["size"]) else 0
        cache = str(r["cache"]) if pd.notna(r["cache"]) else "NA"
        ai = int(r["alloc_idx"]); fi = int(r["free_idx"])
        bucket = events.setdefault(cache, [])
        bucket.append((ai, 0, +size))
        bucket.append((fi, 1, -size))

    peaks = {}
    for cache, evs in events.items():
        evs.sort(key=lambda e: (e[0], e[1]))
        cur = 0; peak = 0
        for _, _, delta in evs:
            cur += delta
            if cur > peak:
                peak = cur
        peaks[cache] = int(peak)

    # Buffer lifetime (number of steps), inclusive of both endpoints.
    life2["lifetime_steps"] = life2["free_idx"] - life2["alloc_idx"] + 1
    return peaks, life2

def compute_op_counts(nodes):
    """Count data-movement and compute operators in a node table.

    Operator names are compared upper-cased. Returns a dict with the number
    of ``COPY_IN``, ``COPY_OUT`` and ``MOVE`` nodes plus ``COMPUTE_ops``,
    the number of nodes whose operator is none of ALLOC, FREE, COPY_IN,
    COPY_OUT or MOVE.
    """
    op_col = pick(nodes, ["op","opcode","operation"])
    tally = nodes[op_col].str.upper().value_counts().to_dict()

    non_compute = {"ALLOC","FREE","COPY_IN","COPY_OUT","MOVE"}
    result = {kind: tally.get(kind, 0) for kind in ("COPY_IN", "COPY_OUT", "MOVE")}
    result["COMPUTE_ops"] = sum(
        count for op_name, count in tally.items() if op_name not in non_compute
    )
    return result

def residency_curve_by_level(nodes, sched, life):
    """Build residency-versus-step curves for one case.

    Returns ``(steps, total, curves)``: ``steps`` is ``0..T-1`` with ``T``
    the number of distinct scheduled nodes, ``total`` holds the bytes
    resident at each step over all buffers, and ``curves`` maps each of the
    levels L1 / L0A / L0B / L0C to its own per-step byte count. Buffers with
    any other cache label contribute to ``total`` only.
    """
    _, life2 = compute_peaks_and_lifetimes(nodes, sched, life)
    T = len(schedule_pos_map(sched))  # Use scheduling steps as the x-axis

    curves = {lv: np.zeros(T, dtype=np.int64) for lv in ("L1","L0A","L0B","L0C")}
    total = np.zeros(T, dtype=np.int64)

    for _, row in life2.iterrows():
        # Buffers without a known positive size add nothing to any curve.
        if pd.isna(row["size"]):
            continue
        nbytes = int(row["size"])
        if nbytes <= 0:
            continue

        first = int(row["alloc_idx"]); last = int(row["free_idx"])
        live = slice(first, last + 1)  # resident through the free step inclusive
        total[live] += nbytes

        level = str(row["cache"]) if pd.notna(row["cache"]) else "NA"
        level_curve = curves.get(level)
        if level_curve is not None:
            level_curve[live] += nbytes

    return np.arange(T), total, curves

# ============= Load Data & Calculate Metrics =============
# Keep only the candidate cases for which all three input CSVs exist under BASE_DIR.
cases = [c for c in DEFAULT_CASES
         if (BASE_DIR / f"{c}_node_attributes.csv").exists()
         and (BASE_DIR / f"{c}_schedule_final.csv").exists()
         and (BASE_DIR / f"{c}_buf_lifecycle.csv").exists()]

if not cases:
    raise FileNotFoundError(f"No target case files found in {BASE_DIR}, please check the path or file naming.")

summary_rows = []        # one metrics dict per case; becomes the rows of `summary`
lifetimes_by_case = {}   # case -> 1-D int array of buffer lifetimes (in steps), NaNs dropped
size_life_scatter = {}   # case -> (n, 2) float array of [size, lifetime_steps], NaNs dropped

for case in sorted(cases):
    nodes, sched, life = load_case(case, BASE_DIR)
    peaks, life2 = compute_peaks_and_lifetimes(nodes, sched, life)
    counts = compute_op_counts(nodes)

    # Only these four cache labels are read; a level missing from `peaks` counts as 0,
    # and any other label present in `peaks` is not reported here.
    peak_L1  = peaks.get("L1",0)
    peak_L0A = peaks.get("L0A",0)
    peak_L0B = peaks.get("L0B",0)
    peak_L0C = peaks.get("L0C",0)
    # NOTE(review): this is the sum of the per-level peaks. The levels may peak at
    # different steps, so it is an upper bound on the peak of the combined residency,
    # not necessarily that peak itself. Confirm this is the intended metric.
    peak_total = peak_L1 + peak_L0A + peak_L0B + peak_L0C

    lifetimes_by_case[case] = life2["lifetime_steps"].dropna().astype(int).values
    size_life_scatter[case] = life2[["size","lifetime_steps"]].dropna().astype(float).values

    summary_rows.append({
        "case": case,
        "peak_total": peak_total,
        "peak_L1": peak_L1,
        "peak_L0A": peak_L0A,
        "peak_L0B": peak_L0B,
        "peak_L0C": peak_L0C,
        **counts  # COPY_IN / COPY_OUT / MOVE / COMPUTE_ops
    })

summary = pd.DataFrame(summary_rows).sort_values("case").reset_index(drop=True)
summary.to_csv(OUT_DIR / "plots_summary_metrics.csv", index=False)
# `display` is not imported in this file; presumably the IPython/Jupyter built-in.
display(summary)

# ============= Plotting (Basic Version for Academic Papers) =============
# Note: According to platform requirements, use matplotlib for charts;
# each chart has a separate canvas; do not specify color styles.
# Every figure below is written to OUT_DIR as a PNG and then shown.

# Figure 1: Total Peak Residency for Each Case
plt.figure(figsize=(8,5))
plt.bar(summary["case"], summary["peak_total"])
plt.xticks(rotation=30, ha="right")
plt.ylabel("Cache Peak (bytes)")
plt.title("Cache Status for Each Task")
plt.tight_layout()
plt.savefig(OUT_DIR / "plot_peak_total_by_case.png", dpi=200)
plt.show()

# Figure 2: Stacked Bar Chart of Peaks by Cache Level
# Per-level peak arrays, all aligned to the same case order.
cases_order = summary["case"].tolist()
L1  = summary.set_index("case").loc[cases_order, "peak_L1"].values
L0A = summary.set_index("case").loc[cases_order, "peak_L0A"].values
L0B = summary.set_index("case").loc[cases_order, "peak_L0B"].values
L0C = summary.set_index("case").loc[cases_order, "peak_L0C"].values

plt.figure(figsize=(8,5))
# Each layer starts where the running sum of the layers beneath it ends.
# (The b1..b4 handles are not used afterwards.)
b1 = plt.bar(cases_order, L1, label="L1")
b2 = plt.bar(cases_order, L0A, bottom=L1, label="L0A")
b3 = plt.bar(cases_order, L0B, bottom=L1+L0A, label="L0B")
b4 = plt.bar(cases_order, L0C, bottom=L1+L0A+L0B, label="L0C")
plt.xticks(rotation=30, ha="right")
plt.ylabel("Cache Peak (bytes)")
plt.title("Peak Values at Different Cache Levels")
plt.legend()
plt.tight_layout()
plt.savefig(OUT_DIR / "plot_stacked_peaks_by_cache.png", dpi=200)
plt.show()

# Figure 3: Buffer Lifetime Distribution (Box Plot, Logarithmic Y-axis)
data = [lifetimes_by_case[c] for c in cases_order]
plt.figure(figsize=(8,5))
# Outliers are hidden (showfliers=False).
# NOTE(review): Matplotlib 3.9 renamed boxplot's `labels` parameter to `tick_labels`;
# the old name is deprecated there. Confirm against the Matplotlib version in use.
plt.boxplot(data, labels=cases_order, showfliers=False)
plt.yscale("log")
plt.ylabel("Lifetime (steps)")
plt.title("Buffer Lifetime Distribution")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.savefig(OUT_DIR / "plot_lifetime_boxplot.png", dpi=200)
plt.show()

# Figure 4: Operator Composition (Grouped Bar Chart)
cols = ["COPY_IN","COPY_OUT","MOVE","COMPUTE_ops"]
x = np.arange(len(cases_order)); width = 0.18
plt.figure(figsize=(9,5))
for i, col in enumerate(cols):
    # Offsets -1.5w, -0.5w, +0.5w, +1.5w centre the four bars on each tick.
    plt.bar(x + i*width - 1.5*width,
            summary.set_index("case").loc[cases_order, col].values,
            width, label=col)
plt.xticks(x, cases_order, rotation=30, ha="right")
plt.ylabel("Count")
plt.title("Grouped Statistics of Various Operator Counts")
plt.legend(ncol=2)
plt.tight_layout()
plt.savefig(OUT_DIR / "plot_op_counts_grouped.png", dpi=200)
plt.show()

# Figure 5: Size-Lifetime Scatter Plot (Double Logarithmic), Matmul_Case1 as representative by default
# Falls back to the last case in sorted order when Matmul_Case1 was not loaded.
rep = "Matmul_Case1" if "Matmul_Case1" in size_life_scatter else cases_order[-1]
xy = size_life_scatter[rep]  # column 0 = size, column 1 = lifetime in steps
if len(xy) > 0:
    plt.figure(figsize=(6.5,5.5))
    plt.scatter(xy[:,0], xy[:,1], s=8, alpha=0.5)
    plt.xscale("log"); plt.yscale("log")
    plt.xlabel("Size (bytes, log)")
    plt.ylabel("Lifetime (steps, log)")
    plt.title(f"Scatter Plot of Size vs Lifetime: {rep}")
    plt.tight_layout()
    plt.savefig(OUT_DIR / f"plot_size_vs_lifetime_{rep}.png", dpi=200)
    plt.show()
else:
    print(f"[Tip] No valid size-lifetime data for {rep}, skipping scatter plot.")

print("\nGenerated files are saved in: ", OUT_DIR.resolve())
print("Including: plots_summary_metrics.csv, 5 PNG images.")

# ============= Residency vs Step Curve for One Representative Case =============
# Plots the total and per-level cache residency over the scheduling steps and
# saves the figure to OUT_DIR.
case_to_draw = "Matmul_Case1"  # Can be changed to other case names
if case_to_draw not in cases:
    # The preferred case was not discovered under BASE_DIR; loading it would fail
    # with a missing-file error, so fall back to the last discovered case instead
    # (same fallback rule as the size-vs-lifetime scatter plot).
    case_to_draw = sorted(cases)[-1]
nodes, sched, life = load_case(case_to_draw, BASE_DIR)
steps, total, curves = residency_curve_by_level(nodes, sched, life)
plt.figure(figsize=(9,4.5))
plt.plot(steps, total, label="Total")
for lv, arr in curves.items():
    # Skip levels that never hold any bytes so the legend stays readable.
    if arr.sum() > 0:
        plt.plot(steps, arr, label=lv, linewidth=1)
plt.xlabel("Scheduling Steps"); plt.ylabel("Cache (bytes)")
plt.title(f"Cache vs Scheduling Steps: {case_to_draw}")
plt.legend(ncol=5)
plt.tight_layout()
plt.savefig(OUT_DIR / f"plot_residency_curve_{case_to_draw}.png", dpi=200)
plt.show()

In [None]:
# ================= Correlation Coefficient Heatmap =================
# Note: Only use matplotlib; no dependency on seaborn;
# automatically filter numerical columns and calculate Pearson correlation coefficients

def plot_corr_heatmap(df: pd.DataFrame, title: str, outfile: Path):
    """Draw and save a Pearson-correlation heatmap of the numeric columns of ``df``.

    Non-numeric columns are ignored. When fewer than two numeric columns
    remain, a tip is printed and nothing is drawn. Otherwise the annotated
    heatmap is saved to ``outfile`` and shown.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] < 2:
        print("[Tip] Fewer than 2 numerical columns available for correlation analysis, skipping plot generation.")
        return

    corr = numeric.corr(method="pearson")
    n_rows, n_cols = corr.shape

    fig, ax = plt.subplots(figsize=(6.5, 5.5))
    # Default colormap on purpose; no colors are specified manually.
    im = ax.imshow(corr, aspect="auto", interpolation="nearest")
    ax.set_title(title)

    # One tick per variable on both axes.
    ax.set_xticks(np.arange(n_cols))
    ax.set_xticklabels(corr.columns, rotation=30, ha="right")
    ax.set_yticks(np.arange(n_rows))
    ax.set_yticklabels(corr.index)

    # Write each coefficient into its cell (x = column, y = row).
    for (i, j), value in np.ndenumerate(corr.to_numpy()):
        ax.text(j, i, f"{value:.2f}", ha="center", va="center")

    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    plt.savefig(outfile, dpi=200)
    plt.show()

# 1) Correlation of "Summary Metrics" across tasks (Source: summary)
# `summary` has one row per case; plot_corr_heatmap keeps only its numeric columns,
# so the string "case" column does not enter the correlation matrix.
plot_corr_heatmap(
    df=summary,
    title="Correlation Coefficients of Summary Metrics Across Tasks",
    outfile=OUT_DIR / "heatmap_corr_summary.png"
)

# 2) Correlation of "Buffer Features" for a single task (size / lifetime / alloc_idx / free_idx)
def per_case_corr_heatmap(case_name: str):
    """Plot the correlation heatmap of buffer features for a single case.

    The features are size, lifetime_steps, alloc_idx and free_idx; rows with
    a missing value in any of them are dropped. With fewer than three
    complete rows a tip is printed and no figure is produced.
    """
    nodes, sched, life = load_case(case_name, BASE_DIR)
    life2 = compute_peaks_and_lifetimes(nodes, sched, life)[1]

    features = ["size", "lifetime_steps", "alloc_idx", "free_idx"]
    df_case = life2[features].dropna()

    if len(df_case) >= 3:
        plot_corr_heatmap(
            df=df_case,
            title=f"Correlation Coefficients of Buffer Features for {case_name}",
            outfile=OUT_DIR / f"heatmap_corr_{case_name}.png"
        )
        return

    print(f"[Tip] Insufficient valid samples for {case_name}, skipping.")

# Specify a representative task to output the heatmap (can be changed to the case name you want to view)
# NOTE(review): the case name is hard-coded and its three CSVs are read unconditionally,
# so this call fails if they are not present under BASE_DIR.
per_case_corr_heatmap("Matmul_Case1")

In [None]:
# -*- coding: utf-8 -*-
"""
Descriptive Statistics and Visualization for Problem 1 Data
- Automatically discover *_Nodes.csv and *_Edges.csv files
- Generate 6 plots: Node vs Edge Count Comparison, ALLOC vs FREE Count Comparison,
  ALLOC Size Box Plot, ALLOC Size Empirical Distribution, ALLOC Type Distribution,
  Global Op Distribution
- Export a summary table: problem1_data_summary.csv
"""

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ========= Configurable Items =========
base_dir = "./CSV"                 # Directory containing the *_Nodes.csv / *_Edges.csv inputs, e.g., "./CSV" or "/mnt/data"
out_dir  = "./plots_problem1_data"  # Directory for output images and tables
SAVE_PDF = False               # If True, also save PDF vector images simultaneously
EXPECTED_CASES = 6             # Number of cases expected in base_dir (one Nodes file and one Edges file each)
# =====================================

# Attempt to set Chinese fonts to avoid garbled Chinese titles.
# Deliberately best-effort: a failure here must not stop the analysis.
try:
    from matplotlib import rcParams
    rcParams["font.sans-serif"] = ["SimHei", "Arial Unicode MS", "Noto Sans CJK SC"]
    rcParams["axes.unicode_minus"] = False
except Exception:
    pass

os.makedirs(out_dir, exist_ok=True)

# Discover files
node_files = sorted(glob.glob(os.path.join(base_dir, "*_Nodes.csv")))
edge_files = sorted(glob.glob(os.path.join(base_dir, "*_Edges.csv")))
# Validate with an explicit raise rather than `assert`, which is stripped when
# Python runs with -O and would let a wrong directory pass silently.
if len(node_files) != EXPECTED_CASES or len(edge_files) != EXPECTED_CASES:
    raise FileNotFoundError(
        f"Expected {EXPECTED_CASES} *_Nodes.csv and {EXPECTED_CASES} *_Edges.csv in {base_dir!r}, "
        f"found {len(node_files)} and {len(edge_files)}."
    )

def case_key_from_path(p):
    """Derive the case key from a Nodes/Edges CSV path.

    Takes the file name of ``p`` and removes every occurrence of
    ``_Nodes.csv`` and ``_Edges.csv`` from it.
    """
    stem = os.path.basename(p)
    for marker in ("_Nodes.csv", "_Edges.csv"):
        stem = stem.replace(marker, "")
    return stem

# Load data
# `cases` maps case key -> {"nodes": DataFrame, "edges": DataFrame}.
# A key found in only one of the two file lists ends up with only that entry.
cases = {}
for nf in node_files:
    key = case_key_from_path(nf)
    nodes = pd.read_csv(nf)
    cases.setdefault(key, {})["nodes"] = nodes

for ef in edge_files:
    key = case_key_from_path(ef)
    edges = pd.read_csv(ef)
    cases.setdefault(key, {})["edges"] = edges

# Summary statistics
summary_rows = []          # one dict per case; becomes the rows of summary_df
alloc_size_by_case = {}    # case key -> array of ALLOC sizes (only cases that have any)
type_dist_all = []         # per-case value counts of the Type column over ALLOC rows
op_dist_all = []           # per-case value counts of the Op column over all rows

for key in sorted(cases.keys()):
    # A missing table is replaced by an empty frame so the counts below become 0.
    nodes = cases[key].get("nodes", pd.DataFrame())
    edges = cases[key].get("edges", pd.DataFrame())

    # Column name case insensitivity compatibility:
    # lower-cased name -> actual column name; a missing column resolves to None.
    cols = {c.lower(): c for c in nodes.columns}
    op_col   = cols.get("op")
    buf_col  = cols.get("bufid")
    size_col = cols.get("size")
    type_col = cols.get("type")

    nodes_local = nodes.copy()
    if op_col is not None:
        nodes_local[op_col] = nodes_local[op_col].astype(str).str.upper()
    else:
        # No Op column: add a placeholder so the counting code below still runs.
        nodes_local["OP"] = "UNKNOWN"
        op_col = "OP"

    # Collect global distribution
    op_dist_all.append(nodes_local[op_col].value_counts())

    # Event counting
    is_alloc = nodes_local[op_col] == "ALLOC"
    is_free  = nodes_local[op_col] == "FREE"

    # Number of distinct buffer ids seen on ALLOC or FREE rows.
    unique_bufids = 0
    if buf_col is not None:
        buf_series = pd.concat([
            nodes_local.loc[is_alloc, buf_col],
            nodes_local.loc[is_free, buf_col]
        ], ignore_index=True).dropna()
        unique_bufids = buf_series.nunique()

    # ALLOC Size statistics (stay None when there is no Size column or no usable value)
    alloc_size_sum = None
    alloc_size_min = None
    alloc_size_max = None
    alloc_size_med = None
    alloc_size_p90 = None
    alloc_size_mean = None

    if size_col is not None:
        # Non-numeric sizes are coerced to NaN and dropped.
        alloc_sizes = pd.to_numeric(nodes_local.loc[is_alloc, size_col], errors="coerce").dropna()
        if not alloc_sizes.empty:
            alloc_size_sum  = int(alloc_sizes.sum())
            alloc_size_min  = int(alloc_sizes.min())
            alloc_size_max  = int(alloc_sizes.max())
            alloc_size_med  = float(alloc_sizes.median())
            alloc_size_p90  = float(np.percentile(alloc_sizes, 90))
            alloc_size_mean = float(alloc_sizes.mean())
            alloc_size_by_case[key] = alloc_sizes.values

    # Type distribution
    if type_col is not None:
        type_dist_all.append(nodes_local.loc[is_alloc, type_col].astype(str).value_counts())

    # Edge counting
    edges_count = len(edges)

    summary_rows.append({
        "case": key,
        "nodes": len(nodes),
        "edges": edges_count,
        "alloc": int(is_alloc.sum()),
        "free": int(is_free.sum()),
        "unique_bufid": int(unique_bufids),
        "alloc_size_sum": alloc_size_sum,
        "alloc_size_min": alloc_size_min,
        "alloc_size_max": alloc_size_max,
        "alloc_size_median": None if alloc_size_med is None else round(alloc_size_med, 3),
        "alloc_size_p90": None if alloc_size_p90 is None else round(alloc_size_p90, 3),
        "alloc_size_mean": None if alloc_size_mean is None else round(alloc_size_mean, 3),
    })

summary_df = pd.DataFrame(summary_rows).sort_values("case").reset_index(drop=True)
summary_csv = os.path.join(out_dir, "problem1_data_summary.csv")
summary_df.to_csv(summary_csv, index=False)
print(f"Summary table saved to: {summary_csv}")
# `display` is not imported in this file; presumably the IPython/Jupyter built-in.
display(summary_df)

# ========== Plot 1: Node vs Edge Count Comparison ==========
plt.figure(figsize=(9, 5))
# `x` and `width` are reused by Plot 2 below.
x = np.arange(len(summary_df))
width = 0.35
plt.bar(x - width/2, summary_df["nodes"], width, label="nodes")
plt.bar(x + width/2, summary_df["edges"], width, label="edges")
plt.xticks(x, summary_df["case"], rotation=20, ha="right")
plt.ylabel("count")
plt.title("Node vs Edge Count Comparison")
plt.legend()
plt.tight_layout()
fig1_png = os.path.join(out_dir, "desc_plot_nodes_edges.png")
plt.savefig(fig1_png, dpi=150)
if SAVE_PDF:
    plt.savefig(os.path.join(out_dir, "desc_plot_nodes_edges.pdf"))
plt.show()
print(f"Saved: {fig1_png}")

# ========== Plot 2: ALLOC vs FREE Count Comparison ==========
plt.figure(figsize=(9, 5))
plt.bar(x - width/2, summary_df["alloc"], width, label="alloc")
plt.bar(x + width/2, summary_df["free"], width, label="free")
plt.xticks(x, summary_df["case"], rotation=20, ha="right")
plt.ylabel("count")
plt.title("ALLOC vs FREE Event Count Comparison")
plt.legend()
plt.tight_layout()
fig2_png = os.path.join(out_dir, "desc_plot_alloc_free.png")
plt.savefig(fig2_png, dpi=150)
if SAVE_PDF:
    plt.savefig(os.path.join(out_dir, "desc_plot_alloc_free.pdf"))
plt.show()
print(f"Saved: {fig2_png}")

# ========== Plot 3: ALLOC Size Box Plot by Case ==========
# Only cases that have at least one ALLOC size get a box.
# `box_data` is also used later to decide whether the ECDF plot is drawn.
box_data = []
labels = []
for key in summary_df["case"]:
    arr = alloc_size_by_case.get(key, np.array([]))
    if arr.size > 0:
        box_data.append(arr)
        labels.append(key)

if box_data:
    plt.figure(figsize=(9, 5))
    # NOTE(review): Matplotlib 3.9 renamed boxplot's `labels` parameter to `tick_labels`;
    # the old name is deprecated there. Confirm against the Matplotlib version in use.
    plt.boxplot(box_data, labels=labels, showmeans=True)
    plt.xticks(rotation=20, ha="right")
    plt.ylabel("Size")
    plt.title("ALLOC Size Distribution Box Plot by Case")
    plt.tight_layout()
    fig3_png = os.path.join(out_dir, "desc_plot_alloc_size_boxplot.png")
    plt.savefig(fig3_png, dpi=150)
    if SAVE_PDF:
        plt.savefig(os.path.join(out_dir, "desc_plot_alloc_size_boxplot.pdf"))
    plt.show()
    print(f"Saved: {fig3_png}")
else:
    print("No ALLOC Size data available for box plot")

# ========== Plot 4: ALLOC Size Empirical Cumulative Distribution Function (ECDF) by Case ==========
def ecdf(data):
    """Return the empirical CDF of ``data`` as ``(sorted_values, probabilities)``.

    The i-th smallest value (1-based) is paired with the probability i / n,
    where n is the number of samples.
    """
    ordered = np.sort(np.asarray(data))
    n = ordered.size
    cumulative = np.arange(1, n + 1) / n
    return ordered, cumulative

# Draw the ECDF only when at least one case has ALLOC sizes
# (`box_data` was filled while preparing the box plot).
if box_data:
    plt.figure(figsize=(9, 5))
    for key in summary_df["case"]:
        arr = alloc_size_by_case.get(key, np.array([]))
        if arr.size > 0:
            x_vals, y_vals = ecdf(arr)
            # One step curve per case.
            plt.step(x_vals, y_vals, where="post", label=key)
    plt.xlabel("Size")
    plt.ylabel("ECDF")
    plt.title("ALLOC Size Empirical Cumulative Distribution Function")
    plt.legend()
    plt.tight_layout()
    fig4_png = os.path.join(out_dir, "desc_plot_alloc_size_ecdf.png")
    plt.savefig(fig4_png, dpi=150)
    if SAVE_PDF:
        plt.savefig(os.path.join(out_dir, "desc_plot_alloc_size_ecdf.pdf"))
    plt.show()
    print(f"Saved: {fig4_png}")
else:
    print("No ALLOC Size data available for ECDF plot")

# ========== Plot 5: Global ALLOC Type Distribution ==========
if type_dist_all:
    # Put the per-case counts side by side, treat types missing from a case as 0,
    # then add across cases and sort from most to least frequent.
    type_sum = pd.concat(type_dist_all, axis=1).fillna(0).sum(axis=1).sort_values(ascending=False)
    plt.figure(figsize=(9, 5))
    plt.bar(type_sum.index.astype(str), type_sum.values)
    plt.xticks(rotation=20, ha="right")
    plt.ylabel("count")
    plt.title("Cache Level Type Distribution of ALLOC Events")
    plt.tight_layout()
    fig5_png = os.path.join(out_dir, "desc_plot_type_dist_alloc.png")
    plt.savefig(fig5_png, dpi=150)
    if SAVE_PDF:
        plt.savefig(os.path.join(out_dir, "desc_plot_type_dist_alloc.pdf"))
    plt.show()
    print(f"Saved: {fig5_png}")
else:
    print("Type column not found or no ALLOC events available")

# ========== Plot 6: Global Node Op Type Distribution ==========
if op_dist_all:
    # Same aggregation as for the type distribution, over the Op counts of all nodes.
    op_sum = pd.concat(op_dist_all, axis=1).fillna(0).sum(axis=1).sort_values(ascending=False)
    plt.figure(figsize=(9, 5))
    plt.bar(op_sum.index.astype(str), op_sum.values)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("count")
    plt.title("Global Distribution of Node Op Types")
    plt.tight_layout()
    fig6_png = os.path.join(out_dir, "desc_plot_op_dist.png")
    plt.savefig(fig6_png, dpi=150)
    if SAVE_PDF:
        plt.savefig(os.path.join(out_dir, "desc_plot_op_dist.pdf"))
    plt.show()
    print(f"Saved: {fig6_png}")
else:
    print("Op column not found")

print("Completed. Output directory:", os.path.abspath(out_dir))

In [None]:
    import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from matplotlib import rcParams
rcParams['font.sans-serif'] = ['SimHei']
rcParams['axes.unicode_minus'] = False

# Paths
old_path = Path("./Problem1/Problem1_eval_summary.csv")
new_path = Path("./A1/所有案例调度结果汇总.csv")  # File name is kept exactly as produced upstream

# Load data
old = pd.read_csv(old_path)
new = pd.read_csv(new_path)

# Standardize case columns: both tables get a whitespace-stripped string "case" key
old["case"] = old["task"].astype(str).str.strip()
new["case"] = new["案例名"].astype(str).str.strip()  # Column name comes from the input CSV

# Select needed columns
old_sub = old[["case","peak_user"]].rename(columns={"peak_user":"peak_old"})
new_sub = new[["case","最大缓存驻留容量"]].rename(columns={"最大缓存驻留容量":"peak_new"})  # Column name comes from the input CSV

# Merge datasets for comparison
# Inner join: a case present in only one of the two tables is dropped.
cmp = pd.merge(old_sub, new_sub, on="case", how="inner")
cmp["reduction_bytes"] = (cmp["peak_old"] - cmp["peak_new"]).astype(int)
# Percentage relative to the old peak; NaN where the old peak is not positive.
cmp["reduction_pct"] = np.where(cmp["peak_old"]>0, (cmp["reduction_bytes"]/cmp["peak_old"])*100.0, np.nan)

# Order by reduction percentage in descending order
cmp = cmp.sort_values("reduction_pct", ascending=False).reset_index(drop=True)

# Save comparison table (written to the current working directory)
out_csv = "./old_vs_new_peak_comparison.csv"
cmp.to_csv(out_csv, index=False)

# Plot 1: Side-by-side comparison of old and new peaks
plt.figure(figsize=(9,5))
x = np.arange(len(cmp))
w = 0.35
plt.bar(x - w/2, cmp["peak_old"].values, width=w, label="Old peak")
plt.bar(x + w/2, cmp["peak_new"].values, width=w, label="New peak")
plt.xticks(x, cmp["case"].tolist(), rotation=30, ha="right")
plt.ylabel("Cache Peak (bytes)")
plt.title("")  # title is set to an empty string for this figure
plt.legend()
plt.tight_layout()
p1 = "./plot_peak_old_vs_new.png"
plt.savefig(p1, dpi=220)
plt.show()

# Plot 2: Peak reduction percentage
plt.figure(figsize=(9,5))
plt.bar(cmp["case"].tolist(), cmp["reduction_pct"].values)
plt.xticks(rotation=30, ha="right")
plt.ylabel("Optimization Ratio (%)")
plt.title("")  # title is set to an empty string for this figure
plt.tight_layout()
p2 = "./plot_peak_reduction_pct.png"
plt.savefig(p2, dpi=220)
plt.show()

# Plot 3: Peak reduction in bytes
plt.figure(figsize=(9,5))
plt.bar(cmp["case"].tolist(), cmp["reduction_bytes"].values)
plt.xticks(rotation=30, ha="right")
plt.ylabel("Reduction (bytes)")
plt.title("New vs Old: Peak reduction (bytes)")
plt.tight_layout()
p3 = "./plot_peak_reduction_bytes.png"
plt.savefig(p3, dpi=220)
plt.show()