In [1]:
import time
from urllib.parse import urlsplit

import dask.dataframe as dd
import numpy as np
import pandas as pd
import tldextract

# Read the aggregated domain table from the Hugging Face Hub into a pandas
# DataFrame and report how long the read took (wall-clock seconds).
# NOTE(review): the recorded run took ~1300 s. The only column read by
# create_domain_matrix is `url_host_name`; passing columns=["url_host_name"]
# to read_parquet may shorten this — confirm no other column is needed.
start_time = time.time()
df = pd.read_parquet("hf://datasets/jackbandy/CC_aggregate/all_domains.parquet")
elapsed = time.time() - start_time
print(f"Took {elapsed:.2f}s")

Took 1323.54s


In [None]:
# Short corpus label -> glob pattern of that corpus's URL parquet shards on
# the Hugging Face Hub. Insertion order is the column order used downstream.
datasets = dict(
    zyda_main="hf://datasets/nhagar/zyda_urls/**/*.parquet",
    zyda_fwe3="hf://datasets/nhagar/zyda-2_urls_fwe3/**/*.parquet",
    zyda_dclm_crossdeduped="hf://datasets/nhagar/zyda-2_urls_dclm_crossdeduped/**/*.parquet",
    dclm_baseline_batch4="hf://datasets/nhagar/dclm-baseline-1.0-parquet_urls/batch_4/train-*.parquet",
    dclm_dedup="hf://datasets/nhagar/dclm-dedup_urls/**/*.parquet",
    falcon_refinedweb="hf://datasets/nhagar/falcon-refinedweb_urls/batch*/train-*.parquet",
    falcon_main="hf://datasets/nhagar/falcon_urls/data/train-*.parquet",
    c4_en="hf://datasets/nhagar/c4_en_urls/data/train-*.parquet",
    cultura="hf://datasets/nhagar/cultura_urls/data/train-*.parquet",
)

# Open every corpus with dask.dataframe.read_parquet, keyed by the same label.
dataframes = {}
for name, path in datasets.items():
    print(f"Loading {name}...")
    dataframes[name] = dd.read_parquet(path)

Loading zyda_main...
Loading zyda_fwe3...
Loading zyda_dclm_crossdeduped...
Loading dclm_baseline_batch4...
Loading dclm_dedup...
Loading falcon_refinedweb...
Loading falcon_main...
Loading c4_en...
Loading cultura...


In [None]:
from tqdm.auto import tqdm
def _extract_host(url):
    """Return the lower-cased host name of ``url``, or ``None`` if it has none.

    Non-string values (``None``, NaN, ...) and strings without ``//`` are
    rejected up front. Port, userinfo, path, query and fragment are stripped
    by ``urlsplit``; malformed URLs that make it raise are skipped.
    """
    if not isinstance(url, str) or '//' not in url:
        return None
    try:
        return urlsplit(url).hostname or None
    except ValueError:
        return None


def create_domain_matrix(host_df, dataframes, sample=True, sample_frac=0.01, random_state=None):
    """Build a domain x dataset presence matrix.

    Parameters
    ----------
    host_df : DataFrame
        Must have a ``url_host_name`` column; its unique values become the
        rows of the result.
        NOTE(review): matching is exact against lower-cased hosts without
        port — assumes ``url_host_name`` is stored that way; confirm.
    dataframes : dict
        Mapping of dataset name -> dask-style DataFrame (needs ``.columns``,
        and a ``url`` column supporting ``.sample(...)`` / ``.compute()``).
        One output column per entry, in iteration order.
    sample : bool
        If true, only a ``sample_frac`` fraction of each ``url`` column is
        materialised; otherwise the full column is.
    sample_frac : float
        Fraction forwarded to ``.sample(frac=...)`` when ``sample`` is true.
    random_state : optional
        Forwarded to ``.sample(random_state=...)`` so a sampled run can be
        repeated. ``None`` (default) keeps the sampler's own default.

    Returns
    -------
    pandas.DataFrame
        ``int8`` matrix indexed by the unique host names with one column per
        dataset; ``1`` where the host was seen in that dataset's URLs, else
        ``0``. A dataset without a ``url`` column is skipped with a warning
        and its column stays all zeros.
    """
    # Compute the unique hosts once; it is reused for the message, the
    # matrix shape, the membership test and the result index.
    all_domains = host_df['url_host_name'].unique()
    print(f"Processing {len(dataframes)} datasets for {len(all_domains)} unique domains")
    domain_index = pd.Index(all_domains)
    matrix = np.zeros((len(all_domains), len(dataframes)), dtype=np.int8)

    for col_idx, (dataset_name, df) in enumerate(tqdm(dataframes.items(), desc="Datasets")):
        start_time = time.time()
        if 'url' not in df.columns:
            print(f"  WARNING: No URL column in {dataset_name}, skipping")
            continue

        urls = df['url']
        if sample:
            urls = urls.sample(frac=sample_frac, random_state=random_state)
        url_sample = urls.compute()
        print(f"  {'Sample' if sample else 'Full dataset'}: {len(url_sample)} URLs from {dataset_name}")

        domains_set = set()
        for url in tqdm(url_sample, desc="  Extracting domains", unit="URL"):
            host = _extract_host(url)
            if host is not None:
                domains_set.add(host)

        # Vectorised membership test replaces a Python loop over every host.
        found = domain_index.isin(domains_set)
        matrix[:, col_idx] = found
        domains_found = int(found.sum())

        elapsed_time = time.time() - start_time
        print(f"  ✓ {dataset_name}: Found {domains_found} domains in {elapsed_time:.2f} seconds")

    return pd.DataFrame(matrix, index=all_domains, columns=list(dataframes.keys()))

# Build the presence matrix for the host table `df` against every entry in
# `dataframes`, using a 1% sample of each dataset's URLs, and report the
# total wall-clock time of the call.
print("Starting domain matrix creation...")
start_time = time.time()
domain_matrix = create_domain_matrix(host_df=df, dataframes=dataframes, sample=True, sample_frac=0.01)
total_time = time.time() - start_time
print(f"Domain matrix creation finished in {total_time:.2f} seconds!")