# load the model

In [1]:
import dask.dataframe as dd
import pandas as pd
import joblib
import time
import os

from dask.diagnostics import ProgressBar

hf_cache_dir = "/Volumes/JackBackup/HF_Datasets/hf_cache"
path_to_vectorizer = "saved_vectorizers/Full Path (Char).joblib"
path_to_model = "saved_models/Full Path (Char)_LogReg.joblib"
path_to_labels = "../data/combined_domain_labels_16k_splits.csv"

# Load saved vectorizer and model
print("Loading vectorizer and model...")
vectorizer = joblib.load(path_to_vectorizer)
logreg_model = joblib.load(path_to_model)
ground_truth_labels = pd.read_csv(path_to_labels)
print("Done!")

Loading vectorizer and model...
Done!


In [2]:
print("Starting load...")
start_time = time.time()

df = pd.read_parquet("hf://datasets/jackbandy/CC_aggregate/all_domains.parquet")

elapsed = time.time() - start_time
print(f"Loaded in {elapsed:.2f} seconds. Shape: {df.shape}")

Starting load...


  from .autonotebook import tqdm as notebook_tqdm


Loaded in 1721.83 seconds. Shape: (581590717, 2)


In [9]:
merged_df = ground_truth_labels.merge(
    df,
    left_on='domain',
    right_on='url_host_name',
    how='left'
)

[                                        ] | 0% Completed | 95.27 sms


KeyboardInterrupt: 

# Load Datasets

In [2]:
dataset_dict = {}
folders = sorted(os.listdir(hf_cache_dir))
total_folders = len(folders)

for idx, folder in enumerate(folders, start=1):
    folder_path = os.path.join(hf_cache_dir, folder)

    if not os.path.isdir(folder_path):
        continue

    # Recursively collect all .parquet files in the folder
    parquet_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet"):
                parquet_files.append(os.path.join(root, file))

    if not parquet_files:
        print(f"[{idx}/{total_folders}] Skipping {folder}: No Parquet files found.")
        continue

    try:
        df = dd.read_parquet(parquet_files)
        dataset_dict[folder] = df

        total_size_mb = sum(os.path.getsize(f) for f in parquet_files) / (1024 ** 2)
        print(f"[{idx}/{total_folders}] Loaded {folder} ({total_size_mb:.2f} MB, {len(parquet_files)} files)")

    except Exception as e:
        print(f"[{idx}/{total_folders}] Error loading {folder}: {str(e)}")

print(f"\n✅ Successfully loaded {len(dataset_dict)} datasets from {total_folders} folders.")

[1/25] Skipping .locks: No Parquet files found.
[2/25] Loaded datasets--nhagar--101_billion_arabic_words_dataset_urls (2137.77 MB, 1 files)
[3/25] Loaded datasets--nhagar--c4-chinese-zhtw_urls (153.28 MB, 1 files)
[4/25] Loaded datasets--nhagar--c4_urls_en (6756.48 MB, 51 files)
[5/25] Loaded datasets--nhagar--c4_urls_en.noblocklist (26034.57 MB, 197 files)
[6/25] Loaded datasets--nhagar--c4_urls_en.noclean (13051.02 MB, 95 files)
[7/25] Loaded datasets--nhagar--c4_urls_multilingual (71800.31 MB, 129 files)
[8/25] Loaded datasets--nhagar--c4_urls_realnewslike (985.97 MB, 7 files)
[9/25] Loaded datasets--nhagar--clean_mc4_it_urls (6980.00 MB, 1 files)
[10/25] Loaded datasets--nhagar--culturax_urls (93737.66 MB, 7 files)
[11/25] Loaded datasets--nhagar--dclm-baseline-1.0-parquet_urls (171937.27 MB, 15 files)
[12/25] Loaded datasets--nhagar--dclm-dedup_urls (17352.67 MB, 2 files)
[13/25] Loaded datasets--nhagar--dolma_urls_v1_5 (107439.17 MB, 9 files)
[14/25] Loaded datasets--nhagar--falc

# Apply Classifier Labels

In [3]:
# create dictionary, domain -> label
# load/add ground truth to dictionary
# for each url in each dataset...
    # if ground truth exists, use that
    # if classification label exists, use that
    # otherwise, add url to stash
    # if stash has seven urls, classify all 7, label domain based on majority

In [None]:
from collections import defaultdict, Counter
import numpy as np
import time

# Normalize ground truth labels to binary (1 = News, 0 = Not News)
print("Loading and converting ground truth labels...")
ground_truth_labels['is_news'] = ground_truth_labels['label'].apply(lambda x: 1 if 'News' in x else 0)
ground_truth_lookup = dict(zip(ground_truth_labels["domain"], ground_truth_labels["is_news"]))
print(f"Loaded {len(ground_truth_lookup)} ground truth domain labels.\n")

domain_label_dict = {}
classification_stash = defaultdict(list)
total_domains_prev = 0
progress_interval = 10_000  # print update every 10k domains

for dataset_idx, (folder, ddf) in enumerate(dataset_dict.items(), start=1):
    print(f"[{dataset_idx}/{len(dataset_dict)}] Processing dataset: {folder}")
    start_time = time.time()

    try:
        if "url" not in ddf.columns or "domain" not in ddf.columns:
            print(f"  Skipping: Missing 'url' or 'domain' column.")
            continue

        for part_idx in range(ddf.npartitions):
            part_df = ddf.partitions[part_idx][["url", "domain"]].dropna().compute()
            for url, domain in zip(part_df["url"], part_df["domain"]):
                if domain in domain_label_dict:
                    continue

                if domain in ground_truth_lookup:
                    domain_label_dict[domain] = ground_truth_lookup[domain]
                else:
                    classification_stash[domain].append(url)

                if len(classification_stash[domain]) == 7:
                    urls = classification_stash.pop(domain)
                    X = vectorizer.transform(urls)
                    preds = logreg_model.predict(X)
                    domain_label_dict[domain] = Counter(preds).most_common(1)[0][0]

                if len(domain_label_dict) % progress_interval == 0 and len(domain_label_dict) > total_domains_prev:
                    print(f"    ⏱️ Progress: {len(domain_label_dict):,} domains labeled...")
                    total_domains_prev = len(domain_label_dict)

            if (part_idx + 1) % 10 == 0 or part_idx == ddf.npartitions - 1:
                print(f"  Processed partition {part_idx + 1}/{ddf.npartitions} | Total labeled: {len(domain_label_dict):,}")

    except Exception as e:
        print(f"  ❌ Error in {folder}, partition {part_idx}: {e}")

    elapsed = time.time() - start_time
    print(f"  ✅ Finished {folder} in {elapsed:.1f}s | Total domains labeled: {len(domain_label_dict):,}")

print(f"\n🏁 All datasets processed. Final total labeled domains: {len(domain_label_dict):,}")

Loading and converting ground truth labels...
Loaded 16756 ground truth domain labels.

[1/24] Processing dataset: datasets--nhagar--101_billion_arabic_words_dataset_urls
    ⏱️ Progress: 10,000 domains labeled...
    ⏱️ Progress: 20,000 domains labeled...
    ⏱️ Progress: 30,000 domains labeled...
    ⏱️ Progress: 40,000 domains labeled...
    ⏱️ Progress: 50,000 domains labeled...
    ⏱️ Progress: 60,000 domains labeled...
    ⏱️ Progress: 70,000 domains labeled...
  Processed partition 10/19 | Total labeled: 72,844
    ⏱️ Progress: 80,000 domains labeled...
    ⏱️ Progress: 90,000 domains labeled...
  Processed partition 19/19 | Total labeled: 93,910
  ✅ Finished datasets--nhagar--101_billion_arabic_words_dataset_urls in 258.5s | Total domains labeled: 93,910
[2/24] Processing dataset: datasets--nhagar--c4-chinese-zhtw_urls
    ⏱️ Progress: 100,000 domains labeled...
    ⏱️ Progress: 110,000 domains labeled...
    ⏱️ Progress: 120,000 domains labeled...
    ⏱️ Progress: 130,000 doma

In [5]:
part_df

Unnamed: 0,url,domain
0,https://alertdiver.eu/en_US/articles/finding-w...,alertdiver.eu
1,https://aliendjinnromances.blogspot.com/2019/0...,blogspot.com
2,https://allaccesspasstojack.blogspot.com/2010/...,blogspot.com
3,https://alovelettertorome.com/2014/11/,alovelettertorome.com
4,https://antiracistorg.bethechangeconsulting.co...,bethechangeconsulting.com
...,...,...
5203613,https://chessarbiter.info/en/rules/article1,chessarbiter.info
5203614,https://coconutcreamcare.com/manual-therapy/do...,coconutcreamcare.com
5203615,https://collegebasketball.nbcsports.com/2012/0...,nbcsports.com
5203616,https://collegetribune.ie/letter-to-the-editor...,collegetribune.ie


In [6]:
domain_label_dict

{'airbnb.com': 0,
 'dhgate.com': 0,
 'alibaba.com': 0,
 'euronews.com': 1,
 'iherb.com': 0,
 'france24.com': 1,
 'watanserb.com': 1,
 'rt.com': 1,
 'sputniknews.com': 1,
 'tripadvisor.com': 0,
 'web.app': 0.0,
 'wikipedia.org': 0.0,
 'motorsport.com': 1,
 'hm.com': 0.0,
 'watan.com': 1,
 'cnn.com': 1,
 'worldbank.org': 1,
 'alarab.com': 0.0,
 'wordpress.com': 1,
 'beiruttimes.com': 1,
 'popsugar.com': 0,
 'alkafeel.net': 0.0,
 'islamweb.net': 0.0,
 'almasryalyoum.com': 1.0,
 '.': 0.0,
 'blogspot.com': 1,
 'un.org': 0,
 'sa.com': 1.0,
 'microsoft.com': 0,
 'mine.nu': 0.0,
 'therecordofwilkes.com': 1,
 'wikihow.com': 1,
 'ksu.edu.sa': 0.0,
 'boots.com': 1.0,
 'encycolorpedia.com': 0.0,
 'player.fm': 0,
 'marriott.com': 0,
 'pnn.ps': 0.0,
 'nyc.gov': 1,
 'ahlamontada.com': 1.0,
 'from-yemen.com': 1.0,
 'mu.edu.sa': 0.0,
 'ahram.org.eg': 1.0,
 'khayma.com': 0.0,
 'enabbaladi.net': 0.0,
 'opendemocracy.net': 1,
 'globalvoices.org': 0,
 'assabah.ma': 0.0,
 'yoo7.com': 1.0,
 'jawwal.ps': 0.0,

In [7]:
len(domain_label_dict)

45060199

In [9]:
import json
with open('domain_label_dict.json', 'w') as f:
    json.dump(domain_label_dict, f)

In [10]:
part_df.to_csv('part_df.csv')