In [42]:
import subprocess
import re
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Apply seaborn's "whitegrid" theme globally so the figures drawn afterwards share it.
sns.set_theme(style="whitegrid")

In [43]:
# One "<number><unit>" component of a duration string, e.g. "1.5ms" or the "30s" in "1m30s".
_DUR_RE = re.compile(r"(?P<val>[\d.]+)\s*(?P<unit>ns|µs|us|ms|s|m|h)")

# Factor that converts a quantity expressed in each recognised unit to milliseconds.
_UNIT_TO_MS = {
    "ns": 1e-6,
    "µs": 1e-3,
    "us": 1e-3,
    "ms": 1,
    "s": 1000,
    "m": 60000,
    "h": 3600000,
}


def duration_to_ms(text: str) -> float:
    """Convert a duration string to milliseconds.

    Every "<number><unit>" component found in ``text`` is converted to
    milliseconds and the components are added together, so "1m30s" yields
    90000.0.

    Args:
        text: String to scan for duration components.

    Returns:
        Total of all components in milliseconds; 0.0 when none is found.

    Raises:
        ValueError: If a matched number (digits and dots) is not a valid float.
    """
    elapsed_ms = 0.0
    for component in _DUR_RE.finditer(text):
        magnitude = float(component["val"])
        factor = _UNIT_TO_MS[component["unit"]]
        elapsed_ms += magnitude * factor
    return elapsed_ms

In [44]:
# Go prints durations as one or more "<number><unit>" components (e.g. "48.4ms",
# "1m30.5s"), so the capture group must accept a whole chain of them; capturing
# only the first component would record "1m30.5s" as exactly one minute.
_GO_DURATION = r"(?:[\d.]+(?:ns|µs|us|ms|s|m|h))+"
SEQ_RE   = re.compile(rf"Sequential similarities computed in ({_GO_DURATION}).*pairs=(\d+)", re.I)
CON_RE   = re.compile(rf"Concurrent similarities computed in ({_GO_DURATION}).*pairs=(\d+)", re.I)

# Dataset passed to the Go program when the caller does not supply one.
DEFAULT_DATA_PATH = "/home/giorgio6846/Code/Clases/Programacion-CyD/PCyD-TP/data/steam_reviews.csv"


def run_algo(go_file: Path, alg: str, sample_users: int, sample_items: int, num_cores: int, seed: int,
             data_path=DEFAULT_DATA_PATH):
    """Run the Go program once via ``go run`` and extract its timing lines.

    Args:
        go_file: Go source file handed to ``go run``.
        alg: Value for the ``-algorithm`` flag.
        sample_users: Value for the ``-sample_users`` flag.
        sample_items: Value for the ``-sample_items`` flag.
        num_cores: Value for the ``-num_cores`` flag.
        seed: Accepted so existing callers keep working, but NOT forwarded:
            no seed flag is added to the command built here.
        data_path: Value for the ``-data`` flag (str or Path). Defaults to
            ``DEFAULT_DATA_PATH``.

    Returns:
        A ``(records, output)`` tuple. ``records`` holds at most one dict per
        mode ("sequential" first, then "concurrent") with the keys
        ``algorithm``, ``mode``, ``duration_ms``, ``pairs``, ``sample_users``
        and ``sample_items``. ``output`` is the captured stdout.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    cmd = [
        "go", "run", str(go_file),
        f"-algorithm={alg}",
        f"-sample_users={sample_users}",
        f"-sample_items={sample_items}",
        f"-num_cores={num_cores}",
        f"-data={data_path}",
    ]
    print("\n" + "="*50)
    print(f"Running command: {' '.join(cmd)}")
    print("="*50)

    res = subprocess.run(cmd, capture_output=True, text=True, check=True)
    stdout = res.stdout.splitlines()

    print("\nCommand output:")
    print("-"*30)
    for line in stdout:
        print(line)
    print("-"*30)

    # mode -> (duration in ms, pair count); if a mode is reported on several
    # lines the last one wins.
    timings = {}
    for line in stdout:
        if (m := SEQ_RE.search(line)):
            print(f"Found sequential timing: {m.group(1)}, pairs: {m.group(2)}")
            timings["sequential"] = (duration_to_ms(m.group(1)), int(m.group(2)))
        elif (m := CON_RE.search(line)):
            print(f"Found concurrent timing: {m.group(1)}, pairs: {m.group(2)}")
            timings["concurrent"] = (duration_to_ms(m.group(1)), int(m.group(2)))

    # The log prefix differs per mode (only the sequential one starts with a blank line).
    log_prefixes = {
        "sequential": "\nAdding sequential record",
        "concurrent": "Adding concurrent record",
    }
    records = []
    for mode, prefix in log_prefixes.items():
        if mode not in timings:
            continue
        duration_ms, pairs = timings[mode]
        rec = {
            "algorithm": alg,
            "mode": mode,
            "duration_ms": duration_ms,
            "pairs": pairs,
            "sample_users": sample_users,
            "sample_items": sample_items
        }
        print(f"{prefix}: {rec}")
        records.append(rec)

    print(f"\nReturning {len(records)} records")
    return records, "\n".join(stdout)

In [None]:
# Columns of the frame returned by benchmark(), in the order the records define
# them. Used to give an empty result the same schema as a populated one.
_RESULT_COLUMNS = [
    "algorithm", "mode", "duration_ms", "pairs",
    "sample_users", "sample_items", "run", "num_cores",
]


def benchmark(go_file, algs, sample_users_list, sample_items_list, num_cores_list, seed, repeats=20):
    """Run ``run_algo`` for every parameter combination and collect the records.

    Combinations are visited with ``sample_users`` as the outermost loop, then
    ``sample_items``, ``num_cores``, ``alg`` and finally the repeat index.
    Repeat ``r`` is invoked with ``seed + r``.

    Args:
        go_file: Passed through to ``run_algo``.
        algs: Algorithm names to benchmark.
        sample_users_list: ``sample_users`` values to try.
        sample_items_list: ``sample_items`` values to try.
        num_cores_list: ``num_cores`` values to try.
        seed: Base seed; ``seed + r`` is passed to ``run_algo`` for repeat ``r``.
        repeats: Number of repetitions per combination.

    Returns:
        A DataFrame with one row per record returned by ``run_algo``, extended
        with ``run`` (repeat index) and ``num_cores``. If no record was
        collected, the frame is empty but still has the expected columns so
        downstream concatenation and grouping do not fail on missing columns.

    A run that raises ``subprocess.CalledProcessError`` is reported and skipped;
    any other exception propagates.
    """
    records = []
    print("\nStarting benchmark with parameters:")
    print(f"  Algorithms: {algs}")
    print(f"  Sample users: {sample_users_list}")
    print(f"  Sample items: {sample_items_list}")
    print(f"  Num cores: {num_cores_list}")
    print(f"  Repeats: {repeats}\n")

    # Flattened form of the nested loops; iteration order is unchanged.
    configs = (
        (sample_users, sample_items, num_cores, alg, r)
        for sample_users in sample_users_list
        for sample_items in sample_items_list
        for num_cores in num_cores_list
        for alg in algs
        for r in range(repeats)
    )

    for sample_users, sample_items, num_cores, alg, r in configs:
        try:
            recs, _ = run_algo(
                go_file,
                alg,
                sample_users=sample_users,
                sample_items=sample_items,
                num_cores=num_cores,
                seed=seed + r
            )
        except subprocess.CalledProcessError as e:
            print("Error running benchmark with parameters:")
            print(f"  algorithm: {alg}")
            print(f"  sample_users: {sample_users}")
            print(f"  sample_items: {sample_items}")
            print(f"  num_cores: {num_cores}")
            print(f"  seed: {seed + r}")
            print(f"Error output: {e.stderr}")
        else:
            for rec in recs:
                rec["run"] = r
                rec["num_cores"] = num_cores
                records.append(rec)

    # Only impose the schema on an empty result; a populated result keeps
    # whatever keys the records carry.
    df = pd.DataFrame(records) if records else pd.DataFrame(columns=_RESULT_COLUMNS)
    print(f"\nFinal DataFrame shape: {df.shape}")
    if not df.empty:
        print("\nDataFrame head:")
        print(df.head())
    else:
        print("\nWarning: DataFrame is empty!")
    return df

# Run multiple experiments with different configurations.
# Arguments that are the same for every experiment are defined once here.
TP_GO_FILE = Path("/home/giorgio6846/Code/Clases/Programacion-CyD/PCyD-TP/TP/tp.go")
ALGORITHMS = ["cosine", "pearson"]
shared_args = dict(
    go_file=TP_GO_FILE,
    algs=ALGORITHMS,
    sample_items_list=[0],
    seed=42,
)

# Experiment 1: small dataset, wide range of core counts
print("\n=== Experiment 1: Small dataset, core scaling ===")
df1 = benchmark(
    sample_users_list=[1000],
    num_cores_list=[1, 2, 4, 8, 12, 16, 20, 24],
    repeats=5,
    **shared_args,
)

# Experiment 2: a few core counts (chosen from experiment 1), growing dataset size
print("\n=== Experiment 2: Dataset size scaling ===")
df2 = benchmark(
    sample_users_list=[1000, 2000, 5000, 10000, 20000],
    num_cores_list=[8, 12, 16],
    repeats=3,
    **shared_args,
)

# Experiment 3: medium-sized datasets on the preferred core counts, with more
# repeats per configuration
print("\n=== Experiment 3: Best configurations with more repeats ===")
df3 = benchmark(
    sample_users_list=[5000, 10000],
    num_cores_list=[12, 16],
    repeats=10,
    **shared_args,
)

# Stack the three result frames into one, renumbering the rows
df = pd.concat([df1, df2, df3], ignore_index=True)
print("\n=== Final combined dataset ===")
print(f"Total number of runs: {len(df)}")
print("\nSample of results:")
print(df.head())


=== Experiment 1: Small dataset, core scaling ===

Starting benchmark with parameters:
  Algorithms: ['cosine', 'pearson']
  Sample users: [1000]
  Sample items: [0]
  Num cores: [1, 2, 4, 8, 12, 16, 20, 24]
  Repeats: 5


Running command: go run /home/giorgio6846/Code/Clases/Programacion-CyD/PCyD-TP/TP/tp.go -algorithm=cosine -sample_users=1000 -sample_items=0 -num_cores=1 -data=/home/giorgio6846/Code/Clases/Programacion-CyD/PCyD-TP/data/steam_reviews.csv

Command output:
------------------------------
Reading dataset /home/giorgio6846/Code/Clases/Programacion-CyD/PCyD-TP/data/steam_reviews.csv (sample users=1000 items=0)
Users=1000 Items=315
Train users=1000  Test cases=303
Computing user-user similarities...
Sequential similarities computed in 48.453941ms, pairs=11427
Top-10 similar user pairs (seq):
76561199015514788 - 76561199035823667 : 1.0000
76561198155516904 - 76561198218732443 : 1.0000
76561198015892651 - 76561198381651978 : 1.0000
76561198097540153 - 76561198276713672 : 1.0

In [None]:
# Preview the first rows of the combined benchmark results.
df.head()

'  algorithm        mode  duration_ms  pairs  sample_users  sample_items  run  \\\n0    cosine  sequential    48.313612  13387          1000             0    0   \n1    cosine  concurrent    64.710182  13387          1000             0    0   \n\n   num_cores  \n0         10  \n1         10  '

In [None]:
# Create detailed summary statistics: one row per
# (algorithm, mode, sample_users, num_cores) configuration.
summary = df.groupby(["algorithm", "mode", "sample_users", "num_cores"]).agg({
    "duration_ms": ["count", "mean", "std", "min", "max"],
    "pairs": ["mean", "std"]
})

# Make the column names more readable,
# e.g. ("duration_ms", "mean") -> "duration_ms_mean"
summary.columns = [f"{col[0]}_{col[1]}" for col in summary.columns]

# Add coefficient of variation (CV) to measure consistency.
# It is derived from the unrounded mean/std: computing it from values already
# rounded to two decimals distorts it (or zeroes it) for small durations.
summary["duration_cv_percent"] = summary["duration_ms_std"] / summary["duration_ms_mean"] * 100

# Round for display only after every derived statistic has been computed
summary = summary.round(2).reset_index()

# Sort by algorithm, sample size, mode, and mean duration
summary = summary.sort_values(["algorithm", "sample_users", "mode", "duration_ms_mean"])

print("\nDetailed Summary Statistics:")
print(f"Total configurations tested: {len(summary)}")
print("\nResults:")
summary

KeyError: 'dim'

In [None]:
def _line_panel(ax, x, y, title, xlabel, ylabel, **semantics):
    """Draw one seaborn line plot of the global ``df`` on ``ax`` and label the panel."""
    sns.lineplot(data=df, x=x, y=y, markers=True, dashes=False, ax=ax, **semantics)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    ax.grid(True)


# 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle("Runtime Analysis by Sample Size", y=1.02, fontsize=14)
(ax1, ax2), (ax3, ax4) = axes

# Grouping shared by the three duration-based panels: colour by mode,
# line style by algorithm, standard-deviation error bars.
runtime_semantics = dict(hue="mode", style="algorithm", errorbar="sd")

# 1. Runtime vs number of cores
_line_panel(
    ax1, "num_cores", "duration_ms",
    title="Runtime vs Number of Cores",
    xlabel="Number of Cores",
    ylabel="Duration (ms)",
    **runtime_semantics,
)

# 2. Runtime vs sample size
_line_panel(
    ax2, "sample_users", "duration_ms",
    title="Runtime vs Sample Size",
    xlabel="Number of Users",
    ylabel="Duration (ms)",
    **runtime_semantics,
)

# 3. Pairs vs sample size (coloured by algorithm only, seaborn's default error bars)
_line_panel(
    ax3, "sample_users", "pairs",
    title="Number of Pairs vs Sample Size",
    xlabel="Number of Users",
    ylabel="Number of Pairs",
    hue="algorithm",
)

# 4. Runtime per pair; the derived column is stored on df itself
df["time_per_pair"] = df["duration_ms"] / df["pairs"]
_line_panel(
    ax4, "sample_users", "time_per_pair",
    title="Time per Pair vs Sample Size",
    xlabel="Number of Users",
    ylabel="Time per Pair (ms)",
    **runtime_semantics,
)

plt.tight_layout()
plt.show()

# Speed-up analysis: put the sequential and concurrent durations of each
# (algorithm, sample_users, num_cores, run) side by side and take their ratio.
speedup_data = (
    df.pivot_table(
        index=["algorithm", "sample_users", "num_cores", "run"],
        columns="mode",
        values="duration_ms",
    )
    .reset_index()
    .assign(speedup=lambda paired: paired["sequential"] / paired["concurrent"])
)

speedup_fig, speedup_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=speedup_data,
    x="num_cores", y="speedup",
    hue="sample_users", style="algorithm",
    markers=True, dashes=False,
    errorbar="sd",
    ax=speedup_ax,
)
# Reference line at 1: both modes took the same time
speedup_ax.axhline(y=1, color='r', linestyle='--', alpha=0.5)
speedup_ax.set_title("Speed-up Factor (Sequential/Concurrent)")
speedup_ax.set_ylabel("Speed-up Factor (>1 means concurrent is faster)")
speedup_ax.set_xlabel("Number of Cores")
speedup_ax.grid(True)
plt.show()

In [None]:
# Build per-run pivot to pair seq/con, then average to be robust.
# NOTE(review): aggfunc="first" keeps only the first row when several rows share
# the same (algorithm, sample_users, sample_items, num_cores, run) key, which
# happens when the concatenated experiments overlap in configuration.
per_run = df.pivot_table(
    index=["algorithm", "sample_users", "sample_items", "num_cores", "run"],
    columns="mode",
    values=["duration_ms", "pairs"],
    aggfunc="first"
)

# Flatten the (value, mode) column pairs, e.g. ("duration_ms", "concurrent") ->
# "duration_ms_concurrent", so the named aggregation below can refer to plain
# column names.
per_run.columns = [f"{value}_{mode}" for value, mode in per_run.columns]
per_run = per_run.reset_index()

# Compute ratios per run
per_run["ratio_con_over_seq"] = per_run["duration_ms_concurrent"] / per_run["duration_ms_sequential"]
per_run["pct_con_of_seq"]     = 100 * per_run["ratio_con_over_seq"]
per_run["pct_improve"]        = 100 * (1 - per_run["ratio_con_over_seq"])

# Average across repeats for plotting.
# Named aggregation takes (column, aggfunc) pairs only; a 3-tuple is rejected.
pct_summary = per_run.groupby(["algorithm", "sample_users", "sample_items", "num_cores"], as_index=False).agg(
    mean_pct_con_of_seq=("pct_con_of_seq", "mean"),
    sd_pct_con_of_seq=("pct_con_of_seq", "std"),
    mean_pct_improve=("pct_improve", "mean"),
    sd_pct_improve=("pct_improve", "std"),
    pairs=("pairs_sequential", "first"),  # pair count of the first sequential run in each group
)

# A) Concurrent as % of sequential (target < 100%)
g1 = sns.relplot(
    data=pct_summary, kind="line",
    x="num_cores", y="mean_pct_con_of_seq",
    hue="sample_users", col="algorithm", col_wrap=2,
    markers=True, dashes=False,
    errorbar=None
)
for ax in g1.axes.flatten():
    ax.axhline(100, ls="--", lw=1, color="gray")  # baseline: equal speed
g1.set_axis_labels("Number of Cores", "Concurrent as % of Sequential (↓ better)")
# FacetGrid.figure replaces the deprecated FacetGrid.fig
g1.figure.suptitle("Concurrent runtime relative to Sequential (mean across repeats)", y=1.02)
plt.show()

# B) % improvement (target > 0%)
g2 = sns.relplot(
    data=pct_summary, kind="line",
    x="num_cores", y="mean_pct_improve",
    hue="sample_users", col="algorithm", col_wrap=2,
    markers=True, dashes=False,
    errorbar=None
)
for ax in g2.axes.flatten():
    ax.axhline(0, ls="--", lw=1, color="gray")  # baseline: no improvement
g2.set_axis_labels("Number of Cores", "Improvement of Concurrent vs Sequential (%) (↑ better)")
g2.figure.suptitle("Percentage improvement from concurrency (mean across repeats)", y=1.02)
plt.show()

# C) Scatterplot of runtime vs number of pairs (both axes logarithmic)
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x="pairs", y="duration_ms",
    hue="mode", style="num_cores",
    alpha=0.6
)
plt.xscale("log")
plt.yscale("log")
plt.title("Runtime vs Number of Pairs Computed")
plt.xlabel("Number of Pairs")
plt.ylabel("Duration (ms)")
plt.show()

# D) Sample size analysis
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x="sample_users", y="duration_ms",
    hue="mode", style="num_cores",
    alpha=0.6
)
plt.title("Runtime vs Sample Size")
plt.xlabel("Number of Users Sampled")
plt.ylabel("Duration (ms)")
plt.show()