In [1]:
import sys
from pathlib import Path
from datetime import datetime, timedelta
import json
from collections import defaultdict
import pandas as pd
import xxhash
from datasets import load_dataset
from tqdm import tqdm

# Treat the current working directory as the workspace and put its `src/`
# folder at the front of the import path so the project package resolves.
workspace_root = Path.cwd()
sys.path.insert(0, str(workspace_root.joinpath('src')))

from thesis_pipeline.io.config import load_all_configs

# Record when and where this run started.
print("CC-NEWS download started: " + datetime.now().isoformat())
print("Workspace: " + str(workspace_root))

CC-NEWS download started: 2025-12-18T20:05:42.749405
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment


## 1. Load Configuration and Setup

In [2]:
# Load every config file once, then pick out the two sections used here.
configs = load_all_configs(workspace_root / 'configs')
news_cfg, global_cfg = configs['news'], configs['global']

# Silver-layer output location; created on demand.
silver_news_dir = workspace_root / 'data' / '01_silver' / 'news'
silver_news_dir.mkdir(parents=True, exist_ok=True)

# Inclusive validation window, parsed from YYYY-MM-DD config strings.
start_date, end_date = (
    datetime.strptime(global_cfg['validation_run'][bound], '%Y-%m-%d')
    for bound in ('period_start', 'period_end')
)
total_days = (end_date - start_date).days + 1

# Parameters of the deterministic sampling scheme.
daily_limit = news_cfg['sampling']['daily_limit_n']
seed = global_cfg['validation_run']['seed_string']
hash_function = news_cfg['sampling']['hash_function']

# Echo the effective configuration so the run is self-describing.
print(f"Dataset: {news_cfg['source']['hf_dataset']}")
print(f"Date range: {start_date.date()} to {end_date.date()} ({total_days} days)")
print(f"Daily limit: {daily_limit:,} articles")
print(f"Seed: {seed}")
print(f"Hash function: {hash_function}")
print(f"Output: {silver_news_dir.relative_to(workspace_root)}")

Dataset: stanford-oval/ccnews
Date range: 2016-09-01 to 2016-10-31 (61 days)
Daily limit: 10,000 articles
Seed: thesis_sep_oct_2016_v1
Hash function: xxhash64
Output: data/01_silver/news


## 2. Define Hash-Based Sampling Function

In [3]:
def compute_hash(seed: str, url: str, date: str) -> int:
    """Return the xxHash64 integer digest of ``seed||url||date``.

    The three parts are joined with a literal ``||`` separator and encoded as
    UTF-8 before hashing, so the same inputs always give the same integer.
    """
    payload = "{}||{}||{}".format(seed, url, date).encode('utf-8')
    hasher = xxhash.xxh64(payload)
    return hasher.intdigest()

def parse_article_date(article):
    """Extract an article's calendar date as a ``YYYY-MM-DD`` string.

    Reads ``article['date']``, which this notebook expects to look like
    ``YYYY-MM-DD`` or ``YYYY-MM-DD HH:MM:SS``; only the first 10 characters
    (after stripping surrounding whitespace) are considered.

    Args:
        article: Mapping-like record that supports ``.get``.

    Returns:
        The canonical ``YYYY-MM-DD`` string, or ``None`` when the field is
        missing, empty, of an unsupported type, shorter than 10 characters,
        or does not start with a valid calendar date.
    """
    raw_date = article.get('date', '')
    if not raw_date:
        return None

    # Accept date/datetime-like objects directly instead of dropping them.
    if hasattr(raw_date, 'strftime'):
        return raw_date.strftime('%Y-%m-%d')

    if not isinstance(raw_date, str):
        return None

    candidate = raw_date.strip()[:10]
    if len(candidate) != 10:
        return None

    # Validate the prefix: callers compare the result lexicographically with
    # YYYY-MM-DD strings, so garbage such as "not-a-date" must not pass through.
    try:
        parsed = datetime.strptime(candidate, '%Y-%m-%d')
    except ValueError:
        return None
    return parsed.date().isoformat()

# Smoke test: hash a fixed example URL/date with the configured seed
test_hash = compute_hash(
    seed=seed,
    url="https://example.com/article",
    date="2016-09-01",
)
print("\nTest hash: {}".format(test_hash))
print("Hash function ready.")


Test hash: 12131673047118061681
Hash function ready.


## 3. Load HuggingFace Dataset (Streaming Mode)

**Note:** This will stream the dataset without downloading everything. We'll filter for our date range.

In [4]:
print(
    "Loading CC-NEWS dataset in streaming mode...",
    "This will take a moment to initialize...\n",
    sep="\n",
)

# Open the dataset lazily (streaming=True), requesting the "2016" configuration.
dataset = load_dataset(
    path=news_cfg['source']['hf_dataset'],
    name="2016",
    streaming=True,
)

print("✓ Dataset loaded in streaming mode")
print(
    "\nDataset info:",
    f"  Name: {news_cfg['source']['hf_dataset']}",
    "  Year subset: 2016",
    "  Mode: Streaming (memory efficient)",
    sep="\n",
)

Loading CC-NEWS dataset in streaming mode...
This will take a moment to initialize...



Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

✓ Dataset loaded in streaming mode

Dataset info:
  Name: stanford-oval/ccnews
  Year subset: 2016
  Mode: Streaming (memory efficient)


## 4. Inspect Sample Articles

In [None]:
# Peek at the first few articles to understand the record structure.
# Fields are read defensively: a key that is present but holds None would
# otherwise break the slicing / len() calls below.
print("Sample articles from dataset:\n")

for i, article in enumerate(dataset["train"].take(3)):
    sample_url = str(article.get('url') or 'N/A')
    sample_title = str(article.get('title') or 'N/A')
    sample_text = article.get('text') or ''

    print(f"Article {i+1}:")
    print(f"  Date: {article.get('date', 'N/A')}")
    print(f"  URL: {sample_url[:80]}...")
    print(f"  Title: {sample_title[:80]}...")
    print(f"  Text length: {len(sample_text)} chars")
    # The key list shows the actual schema; check it against the field names
    # ('date', 'url', 'language') that the streaming cell relies on.
    print(f"  Fields: {list(article.keys())}")
    print()

print("=" * 80)

Sample articles from dataset:



## 5. Stream, Filter, and Sample Articles

**Process:**
1. Stream through dataset
2. Filter articles in our date range (2016-09-01 to 2016-10-31)
3. Compute hash for each article
4. Group by date (the top-10k smallest hashes per day are then selected in section 6)

In [None]:
# Generate target dates (inclusive) as YYYY-MM-DD strings
target_dates = set()
current = start_date
while current <= end_date:
    target_dates.add(current.strftime('%Y-%m-%d'))
    current += timedelta(days=1)

# These bounds never change during streaming, so compute them once instead of
# calling max() on the set for every out-of-range article.
first_target_date = min(target_dates)
last_target_date = max(target_dates)

# Number of streamed articles between progress refreshes / early-stop checks
PROGRESS_EVERY = 10_000

print(f"Target dates: {len(target_dates)} days from {first_target_date} to {last_target_date}")
print(f"\nStreaming and sampling articles...")
print(f"This may take 10-20 minutes depending on network speed.\n")

# Data structure: {date: [(hash, article), ...]}
# All matching articles are kept here; the per-day top-k cut happens in the
# next section, so memory grows with the number of in-range English articles.
daily_articles = defaultdict(list)

articles_seen = 0
articles_in_range = 0
articles_english = 0

# NOTE(review): a captured run of this notebook reported 0 available articles
# for every date — confirm that the dataset records really expose the
# 'date', 'url' and 'language' fields read below.
progress_bar = tqdm(dataset["train"], desc="Processing articles", unit=" articles")
for article in progress_bar:
    articles_seen += 1

    article_date = parse_article_date(article)

    if article_date in target_dates:
        articles_in_range += 1

        # `or ''` guards against a 'language' key that is present but None
        language = (article.get('language') or '').lower()
        if language == 'en':
            articles_english += 1

            # Articles without a URL cannot be hashed deterministically
            url = article.get('url', '')
            if url:
                article_hash = compute_hash(seed, url, article_date)
                daily_articles[article_date].append((article_hash, article))

    # Everything below runs only at checkpoints. It sits outside the filters
    # above so that progress is reported for every PROGRESS_EVERY-th streamed
    # article, not only when that article happens to be kept.
    if articles_seen % PROGRESS_EVERY != 0:
        continue

    progress_bar.set_postfix(
        in_range=f"{articles_in_range:,}",
        english=f"{articles_english:,}",
        dates=f"{len(daily_articles)}/{len(target_dates)}",
    )

    # Early stop: only once the stream is past the end date AND every target
    # date already holds at least `daily_limit` candidates.
    # NOTE(review): this assumes the stream is roughly date-ordered; if any
    # day never reaches the quota the whole dataset is streamed.
    if article_date and article_date > last_target_date:
        if all(len(daily_articles.get(day, ())) >= daily_limit for day in target_dates):
            tqdm.write(f"\n✓ All {len(target_dates)} dates have {daily_limit:,}+ articles. Stopping.")
            break

progress_bar.close()

print("\n" + "=" * 80)
print(f"Total articles seen: {articles_seen:,}")
print(f"Articles in date range: {articles_in_range:,}")
print(f"English articles: {articles_english:,}")
print(f"Dates with articles: {len(daily_articles)}/{len(target_dates)}")
print("=" * 80)

Target dates: 61 days from 2016-09-01 to 2016-10-31

Streaming and sampling articles...
This may take 10-20 minutes depending on dataset size and network speed.



Processing articles: 11315334 articles [1:42:49, 4265.96 articles/s]'The read operation timed out' thrown while requesting GET https://huggingface.co/datasets/stanford-oval/ccnews/resolve/d733e654c9a506df519e1a166a86c118c7657ce4/2017_0009.parquet
Retrying in 1s [Retry 1/5].
'The read operation timed out' thrown while requesting GET https://huggingface.co/datasets/stanford-oval/ccnews/resolve/d733e654c9a506df519e1a166a86c118c7657ce4/2017_0009.parquet
Retrying in 1s [Retry 1/5].
'[Errno 54] Connection reset by peer' thrown while requesting GET https://huggingface.co/datasets/stanford-oval/ccnews/resolve/d733e654c9a506df519e1a166a86c118c7657ce4/2017_0009.parquet
Retrying in 1s [Retry 1/5].
'The read operation timed out' thrown while requesting GET https://huggingface.co/datasets/stanford-oval/ccnews/resolve/d733e654c9a506df519e1a166a86c118c7657ce4/2017_0009.parquet
Retrying in 2s [Retry 2/5].
'The read operation timed out' thrown while requesting GET https://huggingface.co/datasets/stanfo

ReadTimeout: The read operation timed out

## 6. Select Top-K Articles per Day

In [9]:
print("\nSelecting top-k articles per day (smallest hashes)...\n")

daily_stats = []

for date in sorted(target_dates):
    # .get() does not create keys on the defaultdict, and a falsy result
    # covers both a missing date and a date stored with an empty list.
    articles = daily_articles.get(date)
    if not articles:
        print(f"⚠ Warning: No articles found for {date}")
        daily_stats.append({
            'date': date,
            'available': 0,
            'sampled': 0
        })
        continue

    # Take the `daily_limit` entries with the smallest hashes. sorted() is
    # stable and leaves the original candidate list untouched.
    selected = sorted(articles, key=lambda pair: pair[0])[:daily_limit]

    # Keep only the selection; the writing cell reads daily_articles[date].
    # NOTE: because of this overwrite, re-running this cell reports the
    # already-truncated count as 'available'.
    daily_articles[date] = selected

    daily_stats.append({
        'date': date,
        'available': len(articles),
        'sampled': len(selected)
    })

# Display stats
df_stats = pd.DataFrame(daily_stats)
print("Daily sampling summary:")
print(df_stats.to_string(index=False))

print("\n" + "=" * 80)
print(f"Total articles sampled: {df_stats['sampled'].sum():,}")
print(f"Average per day: {df_stats['sampled'].mean():.0f}")
print(f"Days with full quota ({daily_limit:,}): {(df_stats['sampled'] == daily_limit).sum()}/{len(target_dates)}")
print("=" * 80)


Selecting top-k articles per day (smallest hashes)...

Daily sampling summary:
      date  available  sampled
2016-09-01          0        0
2016-09-02          0        0
2016-09-03          0        0
2016-09-04          0        0
2016-09-05          0        0
2016-09-06          0        0
2016-09-07          0        0
2016-09-08          0        0
2016-09-09          0        0
2016-09-10          0        0
2016-09-11          0        0
2016-09-12          0        0
2016-09-13          0        0
2016-09-14          0        0
2016-09-15          0        0
2016-09-16          0        0
2016-09-17          0        0
2016-09-18          0        0
2016-09-19          0        0
2016-09-20          0        0
2016-09-21          0        0
2016-09-22          0        0
2016-09-23          0        0
2016-09-24          0        0
2016-09-25          0        0
2016-09-26          0        0
2016-09-27          0        0
2016-09-28          0        0
2016-09-29          0

## 7. Write Daily Parquet Files

In [None]:
print("\nWriting daily Parquet files...\n")

files_written = []

for date in tqdm(sorted(target_dates), desc="Writing files"):
    # Skip days that have no entry or an empty selection.
    if not daily_articles.get(date):
        continue

    # Each stored entry is a (hash, article) pair; only the article is written.
    articles = [pair[1] for pair in daily_articles[date]]

    df = pd.DataFrame(articles)

    # Overwrite/add the date column with the plain YYYY-MM-DD key.
    df['date'] = date

    # Put `date` first and keep the remaining columns in their existing order.
    cols = [name for name in df.columns if name != 'date']
    cols.insert(0, 'date')
    df = df[cols]

    # One snappy-compressed Parquet file per day, named after the date.
    output_file = silver_news_dir / (date + ".parquet")
    df.to_parquet(output_file, compression='snappy', index=False)
    files_written.append(output_file)

print("\n" + "=" * 80)
print(f"✓ Files written: {len(files_written)}")
print(f"✓ Output directory: {silver_news_dir.relative_to(workspace_root)}")
print("=" * 80)

## 8. Verify Sample File

In [None]:
if files_written:
    # Read back the file in the middle of the written list as a spot check
    sample_file = files_written[len(files_written)//2]

    print(f"Sample file: {sample_file.name}")
    print("=" * 80)

    df_sample = pd.read_parquet(sample_file)

    print(f"Shape: {df_sample.shape}")
    print(f"Columns: {df_sample.columns.tolist()}")

    # Preview only the columns that exist, so a schema without 'title' or
    # 'url' yields a shorter preview instead of a KeyError.
    wanted_cols = ['date', 'title', 'url']
    preview_cols = [col for col in wanted_cols if col in df_sample.columns]
    missing_cols = [col for col in wanted_cols if col not in df_sample.columns]
    if missing_cols:
        print(f"⚠ Columns missing from file: {missing_cols}")

    print(f"\nFirst 3 rows:")
    print(df_sample[preview_cols].head(3).to_string(index=False))

    print("\n" + "=" * 80)
else:
    print("⚠ No files written")

## 9. Save Processing Metadata

In [None]:
# Assemble a JSON-serialisable record of this run: source, period, filters,
# sampling parameters, stream counters, and what was written to disk.
metadata = {
    'timestamp': datetime.now().isoformat(),
    'source': dict(
        dataset=news_cfg['source']['hf_dataset'],
        access_method='streaming',
        year_subset='2016',
    ),
    'period': dict(
        start=start_date.strftime('%Y-%m-%d'),
        end=end_date.strftime('%Y-%m-%d'),
        days=total_days,
    ),
    'filters': dict(language='en'),
    'sampling': dict(
        method=news_cfg['sampling']['method'],
        hash_function=hash_function,
        seed=seed,
        daily_limit=daily_limit,
    ),
    'processing': dict(
        articles_streamed=int(articles_seen),
        articles_in_date_range=int(articles_in_range),
        articles_english=int(articles_english),
        total_sampled=int(df_stats['sampled'].sum()),
    ),
    'output': dict(
        directory=str(silver_news_dir.relative_to(workspace_root)),
        files_written=len(files_written),
        # File stems are the YYYY-MM-DD dates the files were named after.
        dates_covered=sorted(path.stem for path in files_written),
        missing_dates=sorted(target_dates - {path.stem for path in files_written}),
    ),
    'daily_statistics': df_stats.to_dict('records'),
}

# Write the record next to the daily Parquet files.
metadata_file = silver_news_dir / 'processing_metadata.json'
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✓ Metadata saved: {metadata_file.relative_to(workspace_root)}")

## Summary

**CC-NEWS Sampling Complete!**

- ✓ Streamed HuggingFace dataset (memory efficient)
- ✓ Applied deterministic hash-based sampling
- ✓ Selected top-10k articles per day (smallest hashes)
- ✓ Wrote daily Parquet files
- ✓ Saved processing metadata

**Next Step:** 
- Filtering, deduplication, and enrichment (notebook 21)