In [49]:
import subprocess
import re
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_theme(style="whitegrid")

In [50]:
_DUR_RE = re.compile(r"(?P<val>[\d.]+)\s*(?P<unit>ns|µs|us|ms|s|m|h)")
_UNIT_TO_MS = {"ns": 1e-6, "µs": 1e-3, "us": 1e-3, "ms": 1, "s": 1000, "m": 60000, "h": 3600000}

def duration_to_ms(text: str) -> float:
    total_ms = 0.0
    for m in _DUR_RE.finditer(text):
        total_ms += float(m.group("val")) * _UNIT_TO_MS[m.group("unit")]
    return total_ms

In [51]:
SEQ_RE = re.compile(r"(?P<user1>\d+)\s*-\s*(?P<user2>\d+)\s*:\s*(?P<val>[-\d.]+)")
CON_RE = re.compile(r"(?P<user1>\d+)\s*-\s*(?P<user2>\d+)\s*:\s*(?P<val>[-\d.]+)")
TIME_RE = re.compile(r"(?:Sequential|Concurrent)\s+similarities\s+computed\s+in\s+(?P<duration>.+?)(?:\s+\(workers=\d+\))?,\s+pairs=\d+", re.I)

def run_algo(go_file: Path, alg: str, 
            sample_users: int = 0, sample_items: int = 0):
    cmd = [
        "go", "run", str(go_file),
        f"--algorithm={alg}",
        f"--sample_users={sample_users}",
        f"--sample_items={sample_items}",
    ]
    res = subprocess.run(cmd, capture_output=True, text=True, check=True)
    stdout = res.stdout.splitlines()

    seq_pairs, con_pairs = [], []
    seq_time, con_time = None, None
    in_seq_pairs = in_con_pairs = False

    for line in stdout:
        if "Top-10 similar user pairs (seq):" in line:
            in_seq_pairs = True
            in_con_pairs = False
            continue
        elif "Top-10 similar user pairs (con):" in line:
            in_seq_pairs = False
            in_con_pairs = True
            continue
        elif "Time" not in line and (in_seq_pairs or in_con_pairs):
            if m := SEQ_RE.search(line):
                pairs = seq_pairs if in_seq_pairs else con_pairs
                pairs.append({
                    "user1": int(m.group("user1")),
                    "user2": int(m.group("user2")),
                    "value": float(m.group("val"))
                })
        elif (m := TIME_RE.search(line)):
            ms = duration_to_ms(m.group("duration"))
            if "Sequential" in line:
                seq_time = ms
            else:
                con_time = ms

    records = []
    if seq_time is not None and seq_pairs:
        records.append({
            "algorithm": alg,
            "mode": "sequential",
            "value": np.mean([p["value"] for p in seq_pairs]),
            "duration_ms": seq_time,
            "top_pairs": seq_pairs
        })
    if con_time is not None and con_pairs:
        records.append({
            "algorithm": alg,
            "mode": "concurrent",
            "value": np.mean([p["value"] for p in con_pairs]),
            "duration_ms": con_time,
            "top_pairs": con_pairs
        })

    return records, "\n".join(stdout)

In [52]:
def benchmark(go_file, algs, goroutines_list, 
            sample_users=1000, sample_items=0, repeats=20):
    records = []

    for goroutines in goroutines_list:
        for alg in algs:
            for r in range(repeats):
                recs, _ = run_algo(
                    go_file=go_file,
                    alg=alg,
                    sample_users=sample_users,
                    sample_items=sample_items
                )
                for rec in recs:
                    rec["run"] = r
                    records.append(rec)
    return pd.DataFrame(records)

# Example usage:
df = benchmark(
    go_file=Path("tp.go"),
    algs=["cosine", "pearson"],  # Note: jaccard not supported in tp.go
    sample_users=1000,  # Limit to 1000 users as in the example output
    repeats=5  # Reduced repeats since we're reading from CSV
)

TypeError: benchmark() missing 1 required positional argument: 'goroutines_list'

In [None]:
summary = df.groupby(["algorithm", "mode", "dim", "goroutines"]).agg(
    mean_time=("duration_ms", "mean"),
    std_time=("duration_ms", "std"),
    mean_value=("value", "mean"),
).reset_index()

summary

In [None]:
g = sns.relplot(
    data=df,
    kind="line",
    x="goroutines", y="duration_ms",
    hue="mode", style="dim",
    col="algorithm", col_wrap=3,
    markers=True, dashes=False,
    errorbar=("sd")  # show variability from your repeats
)
g.set_axis_labels("Goroutines", "Duration (ms)")
g.fig.suptitle("Runtime vs Goroutines (mean ± sd per condition)", y=1.02)
plt.show()

In [None]:
# Build per-run pivot to pair seq/con, then average to be robust
per_run = df.pivot_table(
    index=["algorithm","dim","goroutines","run"],
    columns="mode",
    values="duration_ms",
    aggfunc="first"
).reset_index()

# Compute ratios per run
per_run["ratio_con_over_seq"] = per_run["concurrent"] / per_run["sequential"]
per_run["pct_con_of_seq"]     = 100 * per_run["ratio_con_over_seq"]
per_run["pct_improve"]        = 100 * (1 - per_run["ratio_con_over_seq"])

# Average across repeats for plotting
pct_summary = per_run.groupby(["algorithm","dim","goroutines"], as_index=False).agg(
    mean_pct_con_of_seq=("pct_con_of_seq","mean"),
    sd_pct_con_of_seq=("pct_con_of_seq","std"),
    mean_pct_improve=("pct_improve","mean"),
    sd_pct_improve=("pct_improve","std"),
)

# A) Concurrent as % of sequential (target < 100%)
g1 = sns.relplot(
    data=pct_summary, kind="line",
    x="goroutines", y="mean_pct_con_of_seq",
    hue="dim", col="algorithm", col_wrap=3,
    markers=True, dashes=False,
    errorbar=None
)
for ax, (alg) in zip(g1.axes.flatten(), pct_summary["algorithm"].unique()):
    ax.axhline(100, ls="--", lw=1, color="gray")  # baseline: equal speed
g1.set_axis_labels("Goroutines", "Concurrent as % of Sequential (↓ better)")
g1.fig.suptitle("Concurrent runtime relative to Sequential (mean across repeats)", y=1.02)
plt.show()

# B) % improvement (target > 0%)
g2 = sns.relplot(
    data=pct_summary, kind="line",
    x="goroutines", y="mean_pct_improve",
    hue="dim", col="algorithm", col_wrap=3,
    markers=True, dashes=False,
    errorbar=None
)
for ax in g2.axes.flatten():
    ax.axhline(0, ls="--", lw=1, color="gray")  # baseline: no improvement
g2.set_axis_labels("Goroutines", "Improvement of Concurrent vs Sequential (%) (↑ better)")
g2.fig.suptitle("Percentage improvement from concurrency (mean across repeats)", y=1.02)
plt.show()


In [None]:
# Analyze top pairs distribution
def analyze_top_pairs(df):
    all_pairs = []
    for _, row in df.iterrows():
        pairs = pd.DataFrame(row["top_pairs"])
        pairs["algorithm"] = row["algorithm"]
        pairs["mode"] = row["mode"]
        pairs["run"] = row["run"]
        pairs["goroutines"] = row["goroutines"]
        all_pairs.append(pairs)
    
    pairs_df = pd.concat(all_pairs, ignore_index=True)
    
    # Calculate pair frequency
    pair_counts = pairs_df.groupby(["algorithm", "mode", "user1", "user2"]).size().reset_index(name="frequency")
    pair_counts["pair"] = pair_counts.apply(lambda x: f"{x['user1']}-{x['user2']}", axis=1)
    
    # Plot top 20 most frequent pairs
    plt.figure(figsize=(15, 6))
    for (alg, mode), group in pair_counts.groupby(["algorithm", "mode"]):
        plt.subplot(1, 2, 1 if mode == "sequential" else 2)
        top_pairs = group.nlargest(20, "frequency")
        sns.barplot(data=top_pairs, x="frequency", y="pair")
        plt.title(f"{alg.title()} - {mode.title()}")
        plt.xlabel("Frequency in Top-10")
        plt.ylabel("User Pair")
    
    plt.tight_layout()
    plt.show()

analyze_top_pairs(df)