In [1]:
import pandas as pd
import subprocess
from tqdm.auto import tqdm
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# CSV holding the URLs to probe; it is read into df_input by the next cell.
INPUT_FILE = "./top-10k-preprocessed.csv"
# Destination CSVs written by the last cell: one for the parsed curl metrics,
# one for the URLs whose probe failed.
RESULTS_FILE = "./curl_http3_results.csv"
ERRORS_FILE = "./curl_http3_errors.csv"

In [3]:
# Load the URL list; later cells only read its 'url' column.
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which usually
# means the CSV was saved together with its index. If so, index_col=0 would
# drop the redundant column — confirm against the file before changing.
df_input = pd.read_csv(INPUT_FILE)
df_input

Unnamed: 0,url
0,https://www.google.com
1,https://www.microsoft.com
2,https://www.facebook.com
3,https://www.amazonaws.com
4,https://www.root-servers.net
...,...
9995,https://www.xn--l3cb1ea6bib.com
9996,https://www.jenkins.io
9997,https://www.nu.or.id
9998,https://www.doujin-y.com


In [4]:
# curl --write-out variables to capture for every request. The order here is
# the order in which curl prints them.
metrics_keys = [
    'time_appconnect', 'time_connect', 'time_namelookup',
    'time_pretransfer', 'time_redirect', 'time_starttransfer',
    'time_total',
    'remote_ip', 'remote_port',
]

# Argument vector shared by every probe; the target URL is appended per call.
# The -w format holds one "name: %{name}" entry per metric, each followed by
# the two characters backslash + n. Python leaves that sequence untouched and
# curl expands it to a newline when it writes the report.
base_command = [
    'curl',
    '--http3-only',       # HTTP/3 only, no fallback to older versions
    '-4',                 # resolve and connect over IPv4 only
    '-o', '/dev/null',    # throw the response body away; only metrics matter
    '-s',                 # silent: no progress meter or status messages
    '--max-time', '5',    # cap the whole transfer (not just connect) at 5 s
    '-w', "".join(f"{key}: %{{{key}}}\\n" for key in metrics_keys),
]

In [29]:
# Accumulators filled by the thread-pool loop further down in this cell.
# Re-running the cell starts both from empty.
results = []  # one dict per successful probe: 'url' plus the parsed metrics
errors = []   # one dict per failed probe: 'url' and 'errorCode'

def process_url(row):
    """Probe a single URL with curl and return its parsed metrics.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'url' entry.

    Returns:
        On success, a dict with 'url' plus one string-valued entry for every
        "key: value" line curl printed on stdout. When curl exits with a
        non-zero status, a dict with 'url' and 'errorCode' (the exit status).
    """
    target = row['url']

    try:
        completed = subprocess.run(
            base_command + [target],
            capture_output=True,
            text=True,
            check=True,  # non-zero exit raises CalledProcessError
        )
    except subprocess.CalledProcessError as failure:
        return {'url': target, 'errorCode': failure.returncode}

    parsed = {'url': target}
    for raw_line in completed.stdout.splitlines():
        # Split on the first ': ' only, so values containing ': ' stay intact.
        name, separator, value = raw_line.partition(': ')
        if separator:
            parsed[name.strip()] = value.strip()
    return parsed

# Run the probes on a small thread pool. Each process_url call blocks on an
# external curl process, so the threads overlap that waiting time.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which URL every future belongs to. When a worker raises there
    # is no return value to read the URL from, so it has to come from here.
    future_to_url = {
        executor.submit(process_url, row): row['url']
        for _, row in df_input.iterrows()
    }

    for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Processing URLs", leave=True):
        try:
            data = future.result()
        except Exception as e:
            # The worker raised something other than curl's own failure
            # (e.g. curl is not installed). Record it against this future's
            # URL and move on; the remaining probes should still be collected.
            errors.append({
                'url': future_to_url[future],
                'errorCode': str(e),
            })
            continue

        # process_url marks a failed curl run with an 'errorCode' entry.
        if 'errorCode' in data:
            errors.append(data)
        else:
            results.append(data)

Processing URLs:   0%|          | 0/10000 [00:00<?, ?it/s]

In [6]:
# Build both tables with explicit columns. pd.DataFrame([]) has no columns at
# all, so if every probe failed (or every probe succeeded) one of the lists is
# empty and the later df_*['url'] lookups would raise KeyError. Naming the
# columns keeps the 'url' column, and the CSV header row, present either way.
# For non-empty lists the columns match what process_url returns: 'url'
# followed by the metrics in metrics_keys order, or 'url' and 'errorCode'.
df_results = pd.DataFrame(results, columns=['url'] + metrics_keys)
df_errors = pd.DataFrame(errors, columns=['url', 'errorCode'])

In [14]:
# Position of every URL in the input file. Futures complete in arbitrary
# order, so this is what restores the original ordering of the rows.
url_order_dict = {url: idx for idx, url in enumerate(df_input['url'])}


def _sort_by_input_order(frame):
    """Return a copy of `frame` with rows ordered like their URLs in df_input.

    The input frame is left untouched: the temporary 'sort_order' column only
    exists on the intermediate copy. A frame without a 'url' column (which is
    what pd.DataFrame([]) produces when nothing was collected) is returned
    as-is instead of raising KeyError.
    """
    if 'url' not in frame.columns:
        return frame.reset_index(drop=True)
    return (
        frame
        .assign(sort_order=frame['url'].map(url_order_dict))
        .sort_values('sort_order')
        .drop(columns=['sort_order'])
        .reset_index(drop=True)
    )


df_results_sorted = _sort_by_input_order(df_results)
df_errors_sorted = _sort_by_input_order(df_errors)

In [17]:
# Persist both tables to the paths set in the configuration cell.
# index=False keeps the positional row index out of the CSV files.
df_results_sorted.to_csv(RESULTS_FILE, index=False)
df_errors_sorted.to_csv(ERRORS_FILE, index=False)