In [2]:
import dask.dataframe as dd
import pandas as pd
import time

datasets = {
    "zyda_main": "hf://datasets/nhagar/zyda_urls/**/*.parquet",
    "zyda_fwe3": "hf://datasets/nhagar/zyda-2_urls_fwe3/**/*.parquet",
    "zyda_dclm_crossdeduped": "hf://datasets/nhagar/zyda-2_urls_dclm_crossdeduped/**/*.parquet",
    "dclm_baseline_batch4": "hf://datasets/nhagar/dclm-baseline-1.0-parquet_urls/batch_4/train-*.parquet",
    "dclm_dedup": "hf://datasets/nhagar/dclm-dedup_urls/**/*.parquet",
    "falcon_refinedweb": "hf://datasets/nhagar/falcon-refinedweb_urls/batch*/train-*.parquet",
    "falcon_main": "hf://datasets/nhagar/falcon_urls/data/train-*.parquet",
    "c4_en": "hf://datasets/nhagar/c4_en_urls/data/train-*.parquet",
    "cultura": "hf://datasets/nhagar/cultura_urls/data/train-*.parquet"
}

dataframes = {}
for name, path in datasets.items():
    start_time = time.time()
    print(f"Loading {name}...")
    dataframes[name] = dd.read_parquet(path)
    elapsed = time.time() - start_time
    print(f"  {name} took {elapsed:.2f}s")

Loading zyda_main...
  zyda_main took 14.77s
Loading zyda_fwe3...
  zyda_fwe3 took 1.89s
Loading zyda_dclm_crossdeduped...
  zyda_dclm_crossdeduped took 3.44s
Loading dclm_baseline_batch4...
  dclm_baseline_batch4 took 3.58s
Loading dclm_dedup...
  dclm_dedup took 1.46s
Loading falcon_refinedweb...
  falcon_refinedweb took 11.47s
Loading falcon_main...
  falcon_main took 20.29s
Loading c4_en...
  c4_en took 8.82s
Loading cultura...
  cultura took 149.78s


In [3]:
for name, df in dataframes.items():
    print(f"{name}:")
    print(f"  Rows: {df.shape[0].compute()}")  # Compute needed for dask dataframes
    print(f"  Columns: {df.shape[1]}")
    #print(f"  Memory usage: {df.memory_usage(deep=True).sum().compute() / 1024**2:.2f} MB")

zyda_main:
  Rows: 1174929887
  Columns: 2
zyda_fwe3:
  Rows: 1279083175
  Columns: 2
zyda_dclm_crossdeduped:
  Rows: 2590493536
  Columns: 2
dclm_baseline_batch4:
  Rows: 194668059
  Columns: 2
dclm_dedup:
  Rows: 615197319
  Columns: 2
falcon_refinedweb:
  Rows: 968000015
  Columns: 2
falcon_main:
  Rows: 968000015
  Columns: 2
c4_en:
  Rows: 365233500
  Columns: 2
cultura:
  Rows: 7184124703
  Columns: 2


In [4]:
# get labeled domains
domain_labels = pd.read_csv('https://raw.githubusercontent.com/LazerLab/DomainDemo/refs/heads/main/data/existing_labels/news_local_natioanl_classification.csv')

In [5]:
domain_labels.classification.value_counts()

classification
local           12102
national          763
INCONSISTENT       40
Name: count, dtype: int64

In [6]:
domain_labels.sample(5)


Unnamed: 0,domain,classification
12547,jewishinseattle.org,local
10611,edmondlifeandleisure.com,local
643,jg-tc.com,local
8162,dvcinquirer.com,local
3229,grafwv.com,local


In [7]:
# Dictionary to store results
classification_counts = {}

# Process each dataframe
for name, df in dataframes.items():
    start_time = time.time()
    print(f"Processing {name}...")
    
    # Take a 1% random sample
    print("  Sampling...")
    #sample_size = int(df.shape[0].compute() * 0.01)
    sample_size = 100000
    sampled_df = df.head(sample_size)#.compute()
    #sampled_df = df.compute()
    
    if 'url' in sampled_df.columns:
        print("  Extracting domains...")
        sampled_df['domain'] = sampled_df['url'].str.extract(r'https?://(?:www\.)?([^/]+)')
        
        print("  Merging domains...")
        merged_df = pd.merge(sampled_df[['domain']], domain_labels, on='domain', how='left')
        
        print("  Counting domains...")
        classification_counts[name] = merged_df['classification'].value_counts(normalize=True) * 100
        
        print(f"  Sample size: {len(sampled_df)} rows")
        for classification, percentage in classification_counts[name].items():
            print(f"    {classification}: {percentage:.2f}%")
        
        unmatched = merged_df['classification'].isna().sum()
        if unmatched > 0:
            print(f"    Unmatched: {unmatched} ({unmatched/len(merged_df)*100:.2f}%)")
    else:
        print(f"  No URL column found in {name}")
    
    elapsed_time = time.time() - start_time
    print(f"  Time taken: {elapsed_time:.2f} seconds\n")

# Create a summary dataframe
summary_df = pd.DataFrame(classification_counts).fillna(0)
print("Overall summary (%):")
print(summary_df)

Processing zyda_main...
  Sampling...
  Extracting domains...
  Merging domains...
  Counting domains...
  Sample size: 100000 rows
    local: 54.45%
    national: 43.81%
    INCONSISTENT: 1.74%
    Unmatched: 94426 (94.43%)
  Time taken: 15.60 seconds

Processing zyda_fwe3...
  Sampling...
  Extracting domains...
  Merging domains...
  Counting domains...
  Sample size: 100000 rows
    national: 51.98%
    local: 47.03%
    INCONSISTENT: 0.99%
    Unmatched: 89960 (89.96%)
  Time taken: 16.52 seconds

Processing zyda_dclm_crossdeduped...
  Sampling...
  Extracting domains...
  Merging domains...
  Counting domains...
  Sample size: 100000 rows
    national: 56.54%
    local: 39.87%
    INCONSISTENT: 3.59%
    Unmatched: 94648 (94.65%)
  Time taken: 16.63 seconds

Processing dclm_baseline_batch4...
  Sampling...
  Extracting domains...
  Merging domains...
  Counting domains...
  Sample size: 100000 rows
    national: 57.71%
    local: 39.28%
    INCONSISTENT: 3.02%
    Unmatched: 8473