# WASM Benchmark Comparison

This notebook allows you to compare the performance of WASM benchmarks across two different git revisions.


In [None]:
# Install dependencies if they are missing
import sys
import subprocess

def install_package(package):
    """Install ``package`` with pip into the running interpreter's environment.

    pip is invoked as ``sys.executable -m pip`` so the package is installed for
    the interpreter executing this code rather than for whichever ``pip``
    happens to be first on PATH.

    Args:
        package: Requirement specifier passed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import importlib

    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # The import system caches directory listings; drop those caches so the
    # freshly installed package can be imported without restarting the kernel.
    importlib.invalidate_caches()

# Probe each third-party dependency and install it only when the import fails.
# A successful probe also binds the module name in the notebook namespace.
try:
    import matplotlib
except ImportError:
    install_package("matplotlib")

# When a package had to be installed, it is NOT imported here; the notebook's
# main imports cell performs the actual `matplotlib.pyplot` / `numpy` imports.
try:
    import numpy
except ImportError:
    install_package("numpy")


In [None]:
import os
import subprocess
import re
import csv
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import json

def run_command(command, cwd=None):
    """Run a shell command, echo its output live, and return that output.

    stderr is merged into stdout, so both streams are echoed and captured
    together.

    Args:
        command: Command line interpreted by the system shell (``shell=True``).
            It is executed verbatim, so callers must quote any values they
            interpolate into it.
        cwd: Working directory for the child process; ``None`` inherits the
            current one.

    Returns:
        Everything the command wrote to stdout and stderr, as a single string.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured output is available on the exception's
            ``output`` attribute.
    """
    print(f"Executing: {command}")
    output = []
    # The context manager closes the pipe and waits for the child even when
    # streaming is cut short (for example by interrupting the kernel).
    with subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        cwd=cwd,
    ) as process:
        for line in process.stdout:
            print(line, end='')
            output.append(line)

    captured = "".join(output)
    # Leaving the ``with`` block has already waited for the process to exit.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command, output=captured)
    return captured


In [None]:
def parse_benchmark_results(file_path):
    """Read benchmark medians from a CSV results file.

    The file must have a header row containing the columns ``Test``, ``Mode``
    and ``Median``.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list of dicts with keys ``'test'`` (base name of the ``Test`` column,
        surrounding whitespace stripped), ``'mode'`` (the ``Mode`` column as-is)
        and ``'median'`` (the ``Median`` column as an ``int``). Returns an empty
        list when the file does not exist. Rows with an empty ``Median`` are
        skipped silently; rows that cannot be converted are skipped with a
        printed warning.

    Raises:
        KeyError: If a row with a median lacks the ``Test`` or ``Mode`` column
            entirely (i.e. the header is not the expected one).
    """
    if not os.path.exists(file_path):
        return []

    entries = []
    with open(file_path, 'r', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Rows without a median (including truncated trailing rows) carry
            # no measurement.
            if not row.get('Median'):
                continue
            try:
                entry = {
                    'test': os.path.basename(row['Test'].strip()),
                    'mode': row['Mode'],
                    'median': int(row['Median'])
                }
            except (AttributeError, ValueError) as exc:
                # The file may be partially written if the benchmark run
                # failed; one bad row should not discard the valid ones.
                print(f"Warning: skipping malformed row at line {reader.line_num} of {file_path}: {exc!r}")
                continue
            entries.append(entry)

    return entries


In [None]:
def run_benchmarks_for_revision(revision, test_filter="org.jetbrains.kotlin.wasm.test.FirWasmSpecCodegenBenchmarkTestGenerated"):
    """Check out ``revision``, run the Gradle benchmark tests, and parse the results.

    The branch (or commit, when HEAD is detached) that was checked out on entry
    is checked out again before the function returns or raises.

    Args:
        revision: Anything ``git checkout`` accepts (commit hash, branch, tag).
        test_filter: Value passed to Gradle's ``--tests`` option.

    Returns:
        The list produced by ``parse_benchmark_results`` for
        ``benchmark_results.csv`` in the current working directory. A failing
        Gradle run is reported as a warning and whatever results exist are
        still parsed, so the list may be partial or empty.

    Raises:
        subprocess.CalledProcessError: If ``git rev-parse`` fails.
        Exception: Propagated from ``run_command`` if checking out ``revision``
            or restoring the original checkout fails.
    """
    import shlex

    # 0. Remember the current branch so it can be restored afterwards.
    original_rev = subprocess.check_output("git rev-parse --abbrev-ref HEAD", shell=True, text=True).strip()
    if original_rev == "HEAD":
        # Detached HEAD: --abbrev-ref only yields the literal "HEAD", so fall
        # back to the commit hash.
        original_rev = subprocess.check_output("git rev-parse HEAD", shell=True, text=True).strip()

    # NOTE: local changes are NOT stashed here; commit or stash them manually
    # before running if they could interfere with the checkout.

    try:
        # 1. Check out the revision. Values are shell-quoted because
        #    run_command hands the string to a shell; characters such as
        #    `*`, `^` or `~` must not be expanded.
        run_command(f"git checkout {shlex.quote(revision)}")

        # 2. Remove a stale results file so old numbers are never attributed
        #    to this revision.
        results_file = "benchmark_results.csv"
        if os.path.exists(results_file):
            os.remove(results_file)

        # 3. Run benchmarks.
        # Note: we use --no-daemon to avoid issues with different compiler versions in the same daemon
        try:
            run_command(f"./gradlew :wasm:wasm.tests:test --tests {shlex.quote(test_filter)} --no-daemon")
        except Exception as e:
            # Deliberately best effort: a failing test task may still have
            # written some results.
            print(f"Warning: Benchmark execution failed for revision {revision}.")
            print(f"Error: {e}")
            print("Continuing to parse whatever results were produced.")

        # 4. Parse and return results
        return parse_benchmark_results(results_file)

    finally:
        # Restore the original checkout whether or not the run succeeded.
        run_command(f"git checkout {shlex.quote(original_rev)}")


## Comparison Execution

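Specify the two revisions you want to compare below. They can be commit hashes, branch names, or tags. Each run checks out the revision in the current working tree, and local changes are not stashed automatically, so commit or stash them before running.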


In [None]:
# Revisions to compare; each value is passed to `git checkout`.
REVISION_A = "vsirotkina/benchmark-no-ss"
REVISION_B = "vsirotkina/benchmark-with-ss"
# Value passed to Gradle's `--tests` option to select the benchmark tests.
TEST_FILTER = "org.jetbrains.kotlin.wasm.test.FirWasmSpecCodegenBenchmarkTestGenerated"

# The two runs execute sequentially. Each call checks out its revision, runs
# the Gradle test task, and restores the original checkout before returning.
print(f"Starting benchmark for Revision A: {REVISION_A}")
results_a = run_benchmarks_for_revision(REVISION_A, TEST_FILTER)

print(f"\nStarting benchmark for Revision B: {REVISION_B}")
results_b = run_benchmarks_for_revision(REVISION_B, TEST_FILTER)


In [None]:
def plot_comparison(results_a, results_b, label_a="Revision A", label_b="Revision B"):
    """Plot and tabulate benchmark medians of two result sets side by side.

    Only (test, mode) pairs present in both result sets are compared. A grouped
    bar chart is shown, followed by a printed table with the relative change of
    B against A.

    Args:
        results_a: Iterable of dicts with keys ``'test'``, ``'mode'`` and
            ``'median'`` for the baseline.
        results_b: Same structure for the revision being compared.
        label_a: Legend / column label for ``results_a``.
        label_b: Legend / column label for ``results_b``.

    Returns:
        None. Prints a message and returns early when the two result sets share
        no (test, mode) pair.
    """
    # Index medians by (test, mode); a later duplicate key overwrites an
    # earlier one within the same result set.
    data = {}
    for side, results in (('a', results_a), ('b', results_b)):
        for r in results:
            data.setdefault((r['test'], r['mode']), {})[side] = r['median']

    # Keep only the keys measured in both result sets.
    common_keys = sorted(k for k, v in data.items() if 'a' in v and 'b' in v)

    if not common_keys:
        print("No common results found to compare.")
        return

    labels = [f"{k[0]} ({k[1]})" for k in common_keys]
    # Dividing by 1e6 yields ms assuming medians are nanoseconds — TODO confirm.
    vals_a = [data[k]['a'] / 1_000_000 for k in common_keys]
    vals_b = [data[k]['b'] / 1_000_000 for k in common_keys]

    x = np.arange(len(labels))
    width = 0.35  # bar width; the two bars of a group sit either side of the tick

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(x - width/2, vals_a, width, label=label_a)
    ax.bar(x + width/2, vals_b, width, label=label_b)

    ax.set_ylabel('Median Time (ms)')
    ax.set_title('Benchmark Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.show()

    # Print summary table
    print(f"{ 'Test (Mode)':<50} | {label_a:>12} | {label_b:>12} | { 'Diff (%)':>10}")
    print("-" * 95)
    for label, key, ms_a, ms_b in zip(labels, common_keys, vals_a, vals_b):
        a, b = data[key]['a'], data[key]['b']
        if a:
            diff_text = f"{(b - a) / a * 100:+10.2f}%"
        else:
            # A zero baseline has no defined relative change; avoid dividing by zero.
            diff_text = f"{'n/a':>11}"
        print(f"{label:<50} | {ms_a:12.2f} | {ms_b:12.2f} | {diff_text}")

# Draw the comparison only when both benchmark runs have already produced
# their result lists in this session; otherwise this cell just defines the function.
if {'results_a', 'results_b'} <= locals().keys():
    plot_comparison(results_a, results_b, label_a=REVISION_A, label_b=REVISION_B)
