In [20]:
!pip install requests pandas tqdm ipywidgets



# Collecting Data

In [21]:
# Imports & Setup
import time
import requests
import pandas as pd
import statistics
from pathlib import Path
import logging
from tqdm.notebook import tqdm

# === CONFIGURE THESE ===
CSV_FILE_PATH = Path("data/input/produtos.csv")     # ← your CSV
BASE_URL         = "http://localhost:8083"
ENDPOINTS        = {
    "parallel":   "/upload",
}
BATCH_SIZES      = [250, 500, 1000]
ITERATIONS       = 10
OUTPUT_CSV       = Path("data/output/resultsv1.csv")


In [22]:
# Helper to run one test
def run_test(endpoint: str, batch_size: int) -> float:
    """
    POST the CSV to `endpoint` with X-Batch-Size header,
    return elapsed time in seconds.
    """
    url = BASE_URL + endpoint
    headers = {"X-Batch-Size": str(batch_size)}
    files = {"file": (CSV_FILE_PATH.name, open(CSV_FILE_PATH, "rb"), "text/csv")}
    start = time.perf_counter()
    resp = requests.post(url, files=files, headers=headers)
    elapsed = time.perf_counter() - start

    resp.raise_for_status()  # fail early if something went wrong
    return elapsed


In [23]:
# Execute all tests
# — configure logging to include timestamps —
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S"
)

records = []
total_runs = len(ENDPOINTS) * len(BATCH_SIZES) * ITERATIONS

# wrap all individual POST calls in one overall progress bar
with tqdm(total=total_runs, desc="Running tests") as pbar:
    for mode, ep in ENDPOINTS.items():
        logging.info(f"Starting tests for mode='{mode}' → endpoint '{ep}'")
        for size in BATCH_SIZES:
            logging.info(f"  Batch size: {size}")
            times = []
            for i in range(1, ITERATIONS + 1):
                elapsed = run_test(ep, size)
                times.append(elapsed)
                logging.info(f"    Iter {i}/{ITERATIONS}: {elapsed*1000:.1f} ms")
                pbar.update(1)

            # compute stats for this combination
            records.append({
                "mode":       mode,
                "endpoint":   ep,
                "batch_size": size,
                "iterations": ITERATIONS,
                "mean_s":     statistics.mean(times),
                "variance_s": statistics.pvariance(times),
                "stdev_s":    statistics.pstdev(times),
            })


Running tests:   0%|          | 0/30 [00:00<?, ?it/s]

17:39:48 [INFO] Starting tests for mode='parallel' → endpoint '/upload'
17:39:48 [INFO]   Batch size: 250


ConnectionError: HTTPConnectionPool(host='localhost', port=8083): Max retries exceeded with url: /upload (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7e7093335bf0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [24]:
# Summarize & Save
df = pd.DataFrame.from_records(records)
# Convert seconds → milliseconds for readability
for col in ("mean_s", "variance_s", "stdev_s"):
    df[col.replace("_s","_ms")] = df[col] * 1000

# Reorder columns
df = df[[
    "mode", "endpoint", "batch_size", "iterations",
    "mean_ms", "variance_s", "stdev_ms"
]]
df.columns = [
    "Mode", "Endpoint", "Batch Size", "Iterations",
    "Mean (ms)", "Variance (s²)", "Std Dev (ms)"
]

# Display and write out
print(df)
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nResults written to {OUTPUT_CSV}")


KeyError: 'mean_s'

In [25]:
df = pd.read_csv(OUTPUT_CSV)
df

Unnamed: 0,Batch Size,Iterations,Mean (ms),Variance (s²),Std Dev (ms)
0,250,10,77333.424783,3.333847,1825.882619
1,500,10,69607.899422,2.258335,1502.775626
2,1000,10,64530.726252,1.411442,1188.041297


# Insights

Inspecting Golang API goroutines dump using pprof I came in some insights about what may be slowing down the threads.
Almost all workers are blocked in the HTTP Client waiting on network calls reads (```net/http.(*Transport).roundTrip```, ```crypto/tls```, ```bufio``` and ```compress/gzip calls```). That is clearly a I/O bottleneck. Problably cost of decompressing/parsing JSON.

In the next steps I will optimize with that I/O bound works in mind and collect data for that optimization.