In [19]:
import pandas as pd
import subprocess
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [20]:
# CSV that is read to obtain the list of URLs to probe (only its 'url' column
# is consumed by the measurement code).
INPUT_FILE = "./curl_http3_results.csv"
# Destination CSV for the metrics of every URL whose curl run succeeded.
RESULTS_FILE = "./curl_http2_results.csv"
# Destination CSV for the URLs whose curl run failed, with the failure code.
ERRORS_FILE = "./curl_http2_errors.csv"

In [21]:
# Only the first 1000 rows are probed. Passing `nrows` makes pandas stop
# parsing after those rows instead of loading the whole CSV and then
# discarding everything past row 1000; the resulting frame is the same.
df_input = pd.read_csv(INPUT_FILE, nrows=1000)
df_input

Unnamed: 0,url,time_appconnect,time_connect,time_namelookup,time_pretransfer,time_redirect,time_starttransfer,time_total,remote_ip,remote_port
0,https://www.google.com,0.115064,0.099295,0.080485,0.116441,0.0,0.172869,0.175498,142.251.209.132,443
1,https://www.facebook.com,0.092280,0.092280,0.071201,0.098314,0.0,0.286127,0.306126,157.240.252.35,443
2,https://www.youtube.com,0.088254,0.064662,0.054444,0.089367,0.0,0.174466,0.289153,142.251.209.142,443
3,https://www.googleapis.com,0.034300,0.009475,0.005403,0.035400,0.0,0.147281,0.147385,142.251.209.138,443
4,https://www.instagram.com,0.029055,0.013187,0.004605,0.029923,0.0,0.214219,0.336100,157.240.252.174,443
...,...,...,...,...,...,...,...,...,...,...
995,https://www.google.com.jm,0.227748,0.211538,0.192309,0.229158,0.0,0.295833,0.299376,142.250.181.195,443
996,https://www.repocket.com,0.043787,0.028749,0.021327,0.044902,0.0,0.071118,0.071261,172.67.139.50,443
997,https://www.thunderbird.net,0.075817,0.035113,0.026407,0.077077,0.0,0.573514,0.573551,172.67.74.82,443
998,https://www.bluecava.com,0.111648,0.111648,0.081835,0.116964,0.0,0.155698,0.155743,199.36.158.100,443


In [22]:
# curl --write-out variables recorded for every request. The order here is
# the order in which curl prints them.
metrics_keys = [
    'time_appconnect', 'time_connect', 'time_namelookup',
    'time_pretransfer', 'time_redirect', 'time_starttransfer',
    'time_total',
    'remote_ip', 'remote_port',
]

# Argument vector shared by every curl invocation; the target URL is appended
# by the caller.
# NOTE(review): --http2 asks curl to negotiate HTTP/2 over TCP+TLS; curl can
# still fall back to HTTP/1.1 when the server does not offer h2 -- confirm
# whether the negotiated version should be recorded as well.
base_command = ['curl', '--http2', '-4']      # HTTP/2 requested, IPv4 only
base_command += ['-o', '/dev/null', '-s']     # discard the body, no progress/status output
base_command += ['--max-time', '5']           # cap the whole transfer (not just the connect phase) at 5 s
# One "name: %{name}" entry per metric. The two-character sequence \n is left
# for curl itself to expand into a line break in its --write-out template.
base_command += ['-w', "".join(f"{key}: %{{{key}}}\\n" for key in metrics_keys)]

In [23]:
# Fresh accumulators for this run: metric dicts of successful requests go to
# `results`, failure records (url + errorCode) go to `errors`.
results, errors = [], []

def process_url(row):
    """Run curl once against ``row['url']`` and report the outcome as a dict.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'url'`` entry.

    Returns:
        On success, ``{'url': ..., <metric name>: <value as str>, ...}`` built
        from every ``"name: value"`` line curl printed on stdout.
        When curl exits with a non-zero status,
        ``{'url': ..., 'errorCode': <exit status>}``.
    """
    url = row['url']

    try:
        completed = subprocess.run(
            base_command + [url],
            capture_output=True,
            text=True,
            check=True,  # non-zero exit status -> CalledProcessError
        )
    except subprocess.CalledProcessError as exc:
        return {'url': url, 'errorCode': exc.returncode}

    parsed = {'url': url}
    for raw_line in completed.stdout.splitlines():
        # Split on the first ": " only; lines without that separator are skipped.
        name, separator, value = raw_line.partition(': ')
        if separator:
            parsed[name.strip()] = value.strip()
    return parsed

# Fan the URLs out over a pool of 10 worker threads and sort every outcome
# into `results` (metrics) or `errors` (failure records).
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which URL each future was submitted for. If a worker raises,
    # future.result() returns nothing, so this map is the only reliable way
    # to attribute the failure to the correct URL.
    future_to_url = {
        executor.submit(process_url, row): row.get('url', 'Unknown')
        for _, row in df_input.iterrows()
    }

    for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Processing URLs", leave=True):
        try:
            data = future.result()
        except Exception as e:
            # Unexpected failure inside the worker (anything other than a
            # non-zero curl exit, which process_url reports itself).
            errors.append({
                'url': future_to_url[future],
                'errorCode': str(e),
            })
            continue

        if 'errorCode' in data:
            errors.append(data)
        else:
            results.append(data)

Processing URLs:   0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
# Pin the column layout explicitly. A frame built from an empty list would
# otherwise have no columns at all, so any later access to df['url'] would
# raise KeyError and the exported CSV would lack a header whenever a run
# produces no successes or no failures.
df_results = pd.DataFrame(results, columns=['url'] + metrics_keys)
df_errors = pd.DataFrame(errors, columns=['url', 'errorCode'])

In [25]:
def sort_by_input_order(df, url_order):
    """Return a copy of ``df`` with rows ordered by each URL's input position.

    Args:
        df: Frame with a ``'url'`` column (a column-less empty frame is also
            accepted and returned unchanged apart from a reset index).
        url_order: Mapping of URL -> position in the input file.

    Returns:
        A new frame with a fresh 0..n-1 index; ``df`` itself is not modified.
        URLs missing from ``url_order`` get a NaN key and therefore sort last.
    """
    if 'url' not in df.columns:
        # A frame built from an empty list has no columns; nothing to sort.
        return df.reset_index(drop=True)
    return (
        df.assign(sort_order=df['url'].map(url_order))
        .sort_values('sort_order')
        .drop(columns=['sort_order'])
        .reset_index(drop=True)
    )


# Workers finish in arbitrary order; restore the order of the input file.
url_order_dict = {url: idx for idx, url in enumerate(df_input['url'])}

df_results_sorted = sort_by_input_order(df_results, url_order_dict)
df_errors_sorted = sort_by_input_order(df_errors, url_order_dict)

In [27]:
# Write both sorted frames to their CSV files, without the positional index.
for frame, path in (
    (df_results_sorted, RESULTS_FILE),
    (df_errors_sorted, ERRORS_FILE),
):
    frame.to_csv(path, index=False)