In [1]:
from huggingface_hub import hf_hub_download, list_repo_files
from tqdm import tqdm
import dask.dataframe as dd
import glob
import os
import pandas as pd

# Domain -> category labels, with a train/val/test assignment in the `set` column.
labels = pd.read_csv(filepath_or_buffer='../data/combined_domain_labels_16k_splits.csv')
# Preview a random handful of rows as a sanity check on the loaded columns.
labels.sample(n=10)

Unnamed: 0,domain,label,label_source,set
1650,nulatotribe.net,General Information & Education,data_provenance_init,train
15896,yourglenrosetx.com,News,northeastern_domain_demo,train
4608,crisismagazine.com,News,northeastern_domain_demo,train
1365,autosport.com,News,data_provenance_init,train
6529,cortlandstandard.net,News,northeastern_domain_demo,train
425,plantcell.org,"Science, Academia, & Technology",data_provenance_init,train
1424,tumgir.com,Social Media/Forums,data_provenance_init,train
5829,adelnews.com,News,northeastern_domain_demo,test
14629,kxl.com,News,northeastern_domain_demo,val
508,occupywallst.org,Entertainment & Culture,data_provenance_init,train


In [2]:
# Glob patterns (hf:// URLs) for the parquet shards of each URL dataset.
# Nothing is loaded here; this only maps a short dataset name to its shard pattern.
datasets = dict(
    # zyda_main="hf://datasets/nhagar/zyda_urls/**/*.parquet",
    zyda_fwe3="hf://datasets/nhagar/zyda-2_urls_fwe3/**/*.parquet",
    zyda_dclm_crossdeduped="hf://datasets/nhagar/zyda-2_urls_dclm_crossdeduped/**/*.parquet",
    dclm_baseline_batch4="hf://datasets/nhagar/dclm-baseline-1.0-parquet_urls/batch_4/train-*.parquet",
    dclm_dedup="hf://datasets/nhagar/dclm-dedup_urls/**/*.parquet",
    falcon_refinedweb="hf://datasets/nhagar/falcon-refinedweb_urls/batch*/train-*.parquet",
    falcon_main="hf://datasets/nhagar/falcon_urls/data/train-*.parquet",
    c4_en="hf://datasets/nhagar/c4_en_urls/data/train-*.parquet",
    cultura="hf://datasets/nhagar/cultura_urls/data/train-*.parquet",
)

In [3]:
# Hugging Face dataset repos whose parquet shards are downloaded and loaded.
DATASETS = [
    "nhagar/zyda-2_urls_zyda_crossdeduped-filtered",
    # "nhagar/falcon_urls",
]

# One DataFrame per successfully loaded repo. They are combined into `df` after
# the loop so that a later repo no longer overwrites the rows of an earlier one.
loaded_frames = []

for dataset in tqdm(DATASETS):
    try:
        # Get files list from repo
        files = [f for f in list_repo_files(dataset, repo_type="dataset")
                 if f.endswith('.parquet')]

        if not files:
            print(f"No parquet files found for {dataset}, skipping")
            continue

        print(f"Found {len(files)} parquet files for {dataset}")
        print("Downloading all parquet files...")

        # hf_hub_download returns the local path of each file inside cache_dir.
        downloaded_files = [
            hf_hub_download(
                repo_id=dataset,
                filename=file,
                repo_type="dataset",
                cache_dir="hf_cache"
            )
            for file in files
        ]

        print(f"Downloaded {len(downloaded_files)} files")
        print(f"First file path: {downloaded_files[0]}")

        # Use the actual downloaded paths directly
        print("Processing...")
        loaded_frames.append(dd.read_parquet(downloaded_files).compute())

    except Exception as e:
        # Best effort per repo: report the failure and continue with the next one.
        print(f"Error with {dataset}: {e}")

if not loaded_frames:
    # Fail here with a clear message instead of leaving `df` undefined (or stale
    # from an earlier run), which would only surface as a confusing error later.
    raise RuntimeError(f"None of the datasets could be loaded: {DATASETS}")

# A single frame is used as-is (no copy); several frames are stacked row-wise.
df = loaded_frames[0] if len(loaded_frames) == 1 else pd.concat(loaded_frames, ignore_index=True)
# Drop the extra references so the per-repo frames can be garbage collected.
del loaded_frames

  0%|                                                       | 0/1 [00:00<?, ?it/s]

Found 1 parquet files for nhagar/zyda-2_urls_zyda_crossdeduped-filtered
Downloading all parquet files...
Downloaded 1 files
First file path: hf_cache/datasets--nhagar--zyda-2_urls_zyda_crossdeduped-filtered/snapshots/695209cf7133a596fc999304fa623e802439281f/batch_1.parquet
Processing...


100%|███████████████████████████████████████████████| 1/1 [00:07<00:00,  7.13s/it]


In [4]:
# Peek at the first rows of the loaded URL table before filtering it to labeled domains.
df.head()

Unnamed: 0,url,domain
0,https://www.hennsnoxlaw.com/faqs,hennsnoxlaw.com
1,https://store.basscentral.com/dingwall/dingwal...,basscentral.com
2,http://theplayfullife.polarnopyretusa.com/name...,polarnopyretusa.com
3,http://www.katephillipsevents.com/contact,katephillipsevents.com
4,https://www.littleroomunderthestairs.com/2015/...,littleroomunderthestairs.com


In [13]:
import time
from tqdm.notebook import tqdm

def filter_with_progress(df, domain_set, batch_size=100000):
    """Keep only the rows of ``df`` whose ``domain`` value is in ``domain_set``.

    The frame is processed in positional slices of ``batch_size`` rows so that a
    tqdm progress bar can be shown while filtering.

    Args:
        df: DataFrame with a ``domain`` column.
        domain_set: Collection of domain values to keep (passed to ``Series.isin``).
        batch_size: Number of rows handled per progress-bar step; must be positive.

    Returns:
        A new DataFrame containing the matching rows in their original order,
        with a fresh 0..n-1 index. An empty ``df`` yields an empty frame with
        the same columns.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_rows = len(df)
    if total_rows == 0:
        # pd.concat([]) raises ValueError, so return an empty frame directly.
        return df.iloc[0:0].reset_index(drop=True)

    filtered_batches = []
    for start in tqdm(range(0, total_rows, batch_size), desc="Filtering domains"):
        # iloc slicing clips at the end of the frame, so the last batch may be short.
        batch = df.iloc[start:start + batch_size]
        filtered_batches.append(batch[batch['domain'].isin(domain_set)])

    return pd.concat(filtered_batches, ignore_index=True)

# Restrict the URL table to domains that belong to the training split of the labels.
domain_set = set(labels.loc[labels['set'] == 'train', 'domain'])
filtered_df = filter_with_progress(df=df, domain_set=domain_set)

Filtering domains:   0%|          | 0/1912 [00:00<?, ?it/s]

In [14]:
# Get up to 3 URLs from each domain

In [7]:
from collections import defaultdict

# Maximum number of example URLs kept for each labeled domain.
MAX_URLS_PER_DOMAIN = 3

# groupby(...).head(n) keeps the first n rows of every domain, in original row
# order, in one vectorized pass. The previous row-by-row loop stopped as soon as
# every domain *seen so far* had 3 URLs, so it could exit after the first few
# rows and miss almost all domains.
first_urls = filtered_df.groupby('domain', sort=False).head(MAX_URLS_PER_DOMAIN)

# domain -> list of up to MAX_URLS_PER_DOMAIN URLs, keyed in order of first appearance.
# first_urls has at most MAX_URLS_PER_DOMAIN rows per domain, so this loop is cheap.
domain_urls = defaultdict(list)
for domain, url in zip(first_urls['domain'], first_urls['url']):
    domain_urls[domain].append(url)

# Convert the dictionary to a dataframe (one row per kept URL, grouped by domain).
# Explicit columns keep the schema stable even when no rows were kept.
result_df = pd.DataFrame(
    [{'domain': domain, 'url': url}
     for domain, urls in domain_urls.items()
     for url in urls],
    columns=['domain', 'url'],
)

In [15]:
# Number of URL rows that remain after restricting to labeled training domains.
filtered_df.shape[0]

36526523

In [16]:
# Random preview of the filtered URL rows.
filtered_df.sample(n=10)

Unnamed: 0,url,domain
36300902,http://www.screendaily.com/pliny-porter-named-...,screendaily.com
3815605,http://morrisbeginningart.weebly.com/painting....,weebly.com
3044769,https://www.marketwatch.com/investing/stock/SQ...,marketwatch.com
6838924,https://www.wvpublic.org/post/consol-selling-f...,wvpublic.org
4923515,https://textilelearner.blogspot.com/2018/10/le...,blogspot.com
21065383,http://www.tcpalm.com/entertainment/tcpalmsoci...,tcpalm.com
1237846,http://joykenney.blogspot.com/p/my-books.html,blogspot.com
24879872,http://www.itv.com/news/update/2012-12-07/char...,itv.com
30900689,http://www.marketwatch.com/story/yahoo-profit-...,marketwatch.com
12300656,http://thehockeywriters.com/fantasy-hockey-you...,thehockeywriters.com


In [9]:
# Classify