In [13]:
from collections import Counter

from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Load the C4 URL index from the Hugging Face Hub. No split is requested, so
# the result is indexed by split name below (e.g. ds["train"]).
ds = load_dataset(
    "nhagar/c4_urls_en",
)

Downloading data: 100%|██████████| 183/183 [00:22<00:00,  8.03files/s]
Generating train split: 364868892 examples [00:50, 7239038.11 examples/s]


# Top sites

In [22]:
def get_top_n_sites(dataset, n=20, batch_size=10_000):
    """Return the ``n`` most frequent values of the "domain" column.

    The dataset is streamed in batches so that the full column never has to
    be materialised at once.

    Args:
        dataset: Object exposing ``num_rows`` and ``iter(batch_size=...)``,
            where each yielded batch maps "domain" to a list of values.
        n: Number of (domain, count) pairs to return.
        batch_size: Number of rows requested per batch.

    Returns:
        A list of ``(domain, count)`` tuples, most frequent first.
    """
    counter = Counter()
    # Ceiling division: the trailing partial batch is yielded too, so floor
    # division under-reports the total and the progress bar overruns it.
    total_batches = (dataset.num_rows + batch_size - 1) // batch_size
    for batch in tqdm(dataset.iter(batch_size=batch_size), total=total_batches):
        counter.update(batch["domain"])

    return counter.most_common(n)

In [23]:
# Count domains over the whole train split and list the 20 most frequent.
# NOTE(review): the "20" in the heading string is hard-coded separately from
# the n passed to get_top_n_sites; keep the two in sync when changing either.
topn = get_top_n_sites(ds["train"], 20)
print("Top 20 sites in C4:")
for site, count in topn:
    print(f"{site}: {count}")

36487it [08:31, 71.31it/s]                           


Top 20 sites in C4:
blogspot.com: 4576497
wordpress.com: 4125704
weebly.com: 483941
wikipedia.org: 353633
fanpop.com: 287426
livejournal.com: 253547
stackexchange.com: 219556
tripod.com: 217419
typepad.com: 213164
google.com: 209826
forumotion.com: 200420
reuters.com: 175151
yahoo.com: 174477
nytimes.com: 169965
b00kmedia.ru: 163084
thefreedictionary.com: 154103
fandom.com: 150738
indiatimes.com: 146598
latimes.com: 143282
microsoft.com: 137602


# NYT proportion - total

In [27]:
def filter_domains(batch, target_domain="nytimes.com"):
    """Build a keep/drop mask selecting rows from a single domain.

    Written for batched filtering: it receives a batch of columns and returns
    one boolean per row.

    Args:
        batch: Mapping whose "domain" entry is a list of domain values.
        target_domain: Domain to keep. The comparison is exact equality, so
            the value must match the "domain" column verbatim.

    Returns:
        A list of booleans, ``True`` where the row's domain equals
        ``target_domain``, in the same order as the input rows.
    """
    return [domain == target_domain for domain in batch["domain"]]

In [28]:
# Keep only the train rows that filter_domains marks True, evaluating the
# predicate on batches of 10,000 rows across 4 worker processes.
nyt_slice = ds["train"].filter(
    filter_domains,
    batched=True,
    batch_size=10_000,
    num_proc=4,
)

# Report the slice size and its share of the full train split.
print(f"NYT slice size: {nyt_slice.num_rows} (pct: {nyt_slice.num_rows / ds['train'].num_rows:.2%})")

Filter (num_proc=4): 100%|██████████| 364868892/364868892 [01:13<00:00, 4988332.49 examples/s]


NYT slice size: 169965 (pct: 0.05%)


# NYT proportion - sections

In [None]:
def extract_date_and_section(batch):
    """Add "date" and "section" columns parsed from each row's URL.

    Only URLs whose path starts with ``YYYY/MM/DD/<section>`` directly after
    the host are parsed, e.g. ``https://www.nytimes.com/2017/01/05/us/...``
    gives date ``"2017-01-05"`` and section ``"us"``. Every other URL (topic
    pages, archive indexes, missing values, ...) gets ``None`` in both
    columns, so a row is either fully parsed or not parsed at all.

    Args:
        batch: Mapping whose "url" entry is a list of URL strings (entries
            that are not strings are treated as unparseable).

    Returns:
        The same ``batch`` object, with "date" and "section" lists added,
        each aligned with the input rows.
    """
    dates = []
    sections = []
    for url in batch["url"]:
        date = section = None
        if isinstance(url, str):
            # Drop fragment and query first so they cannot leak into the
            # section name ("us?x=1") or be mistaken for path segments.
            path = url.split("#", 1)[0].split("?", 1)[0]
            # Text after the host's ".com/"; empty if the marker is absent.
            tail = path.partition(".com/")[2]
            parts = tail.split("/")
            if len(parts) >= 4 and parts[3]:
                year, month, day = parts[:3]
                digits = year + month + day
                # Require a real YYYY/MM/DD prefix; without this check paths
                # such as "topic/person/name/x" produced bogus dates.
                looks_like_date = (
                    (len(year), len(month), len(day)) == (4, 2, 2)
                    and digits.isascii()
                    and digits.isdigit()
                )
                if looks_like_date:
                    date = f"{year}-{month}-{day}"
                    section = parts[3]
        # Append exactly once per row so both columns stay aligned.
        dates.append(date)
        sections.append(section)
    batch["date"] = dates
    batch["section"] = sections
    return batch

In [None]:
# Apply extract_date_and_section to the NYT slice in batches of 10,000 rows
# across 4 worker processes.
# NOTE(review): this rebinds nyt_slice to the mapped result, so the
# pre-map slice is no longer reachable under this name after the cell runs.
nyt_slice = nyt_slice.map(
    extract_date_and_section,
    batched=True,
    batch_size=10_000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/169965 [00:00<?, ? examples/s]

Error parsing URL https://www.nytimes.com/topic/destination/sudan?scp=1-spot&amp;sq=Sudan&amp;st=cse: list index out of rangeError parsing URL https://www.nytimes.com/topic/organization/national-education-association?inline=nyt-org: list index out of range

Error parsing URL https://www.nytimes.com/topic/organization/rutgers-the-state-university-of-new-jersey?inline=nyt-org: list index out of range
Error parsing URL https://www.nytimes.com/topic/person/frida-kahlo?inline=nyt-per: list index out of range
Error parsing URL https://www.nytimes.com/topic/person/john-mccain: list index out of range
Error parsing URL https://www.nytimes.com/topic/organization/pace-university?inline=nyt-org: list index out of range
Error parsing URL https://spiderbites.nytimes.com/1873/articles_1873_01_00001.html: list index out of range
Error parsing URL https://scientistatwork.blogs.nytimes.com/author/mark-hay/: list index out of rangeError parsing URL https://www.nytimes.com/topic/person/joseph-kabila: lis

# NYT proportion - time