# Benchmark: V1, V2, V3 vs. Scikit-learn

This notebook benchmarks my Rust-based `MyRustLinearRegression`
(exposed via the `rust_core` module) against scikit-learn:

- `LinearRegression` (closed-form solution)
- `SGDRegressor` (iterative gradient-based optimizer)

The goal is to compare **training + prediction time** on the same synthetic dataset.


## Imports

In [1]:
import importlib
import sys
from pathlib import Path
import time
import platform
import numpy as np

# Ensure repo root is on sys.path so we can import the C++ wrapper
repo_root = Path.cwd().resolve().parents[1]
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

import rust_core
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from wrapper.cpp_linear_regression import LinearRegression as CppLinearRegression

importlib.reload(rust_core)

print("Python version:", platform.python_version())
print("NumPy version:", np.__version__)
print("Platform:", platform.platform())


Python version: 3.11.9
NumPy version: 2.0.1
Platform: Linux-6.14.0-35-generic-x86_64-with-glibc2.39


## Generate synthetic dataset

In [2]:
# Use a reasonably large dataset to see performance differences
rng = np.random.default_rng(0)

n_train = 200_000   
n_test = 10_000    
n_features = 40    
X_train = rng.normal(size=(n_train, n_features))
w_true = rng.normal(size=n_features)
y_train = X_train @ w_true + rng.normal(scale=0.1, size=n_train)

X_test = rng.normal(size=(n_test, n_features))

# Column-wise normalization (same as in functional tests)
norms = np.linalg.norm(X_train, axis=0)
norms[norms == 0.0] = 1.0

X_train_scaled = X_train / norms
X_test_scaled = X_test / norms

print("X_train_scaled:", X_train_scaled.shape)
print("X_test_scaled:", X_test_scaled.shape)
print("y_train:", y_train.shape)


X_train_scaled: (200000, 40)
X_test_scaled: (10000, 40)
y_train: (200000,)


## Benchmark helper

In [3]:
def bench(name, func, repeat=5):
    """Run `func` multiple times and print min/mean runtime."""
    times = []
    for _ in range(repeat):
        start = time.perf_counter()
        func()
        times.append(time.perf_counter() - start)
    times = np.array(times)
    print(f"{name}: min={times.min():.4f}s  mean={times.mean():.4f}s over {repeat} runs")


## scikit-learn: `LinearRegression` (closed-form baseline)

In [4]:
def run_sklearn_linear():
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    _ = model.predict(X_test_scaled)

# Warm-up run (JIT, caching, etc.)
run_sklearn_linear()

bench("sklearn LinearRegression", run_sklearn_linear)


sklearn LinearRegression: min=0.0638s  mean=0.0655s over 5 runs


## scikit-learn: `SGDRegressor` (iterative, closer to your Rust implementation)

In [None]:
def run_sklearn_sgd():
    model = SGDRegressor(
        learning_rate="constant",
        eta0=0.05,
        max_iter=1_000,
        penalty=None,
        random_state=0
    )
    model.fit(X_train_scaled, y_train)
    _ = model.predict(X_test_scaled)

# Warm-up
run_sklearn_sgd()

bench("sklearn SGDRegressor (1k iters)", run_sklearn_sgd)


sklearn SGDRegressor (1k iters): min=2.0431s  mean=2.1191s over 5 runs


## Rust: `MyRustLinearRegression`

In [6]:
def run_rust_linear(iterations=1_000):
    model = rust_core.MyRustLinearRegression(
        learning_rate=0.05,
        iterations=iterations,
        mode=rust_core.Mode.Regression,
    )
    model.fit(X_train_scaled, y_train)
    _ = model.predict(X_test_scaled)

# Warm-up
run_rust_linear(iterations=1_000)

bench("rust_core MyRustLinearRegression (1k iters)", lambda: run_rust_linear(1_000))


rust_core MyRustLinearRegression (1k iters): min=16.9185s  mean=17.1103s over 5 runs


## Optimized C++ implementation (ctypes wrapper)


In [7]:

def run_cpp_linear(iterations=1_000):
    model = CppLinearRegression(
        learning_rate=0.05,
        iterations=iterations,
    )
    model.fit(X_train_scaled, y_train)
    _ = model.predict(X_test_scaled)

run_cpp_linear()  # warm-up
bench("C++ LinearRegression (1000 iters)", run_cpp_linear)


C++ LinearRegression (1000 iters): min=1.1710s  mean=1.1830s over 5 runs
