In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# --- Run configuration -------------------------------------------------------
# Number of synthetic rows to produce; lower it for quick test runs.
NUM_RECORDS = 50_000
# Destination CSV file for the generated dataset.
OUTPUT_PATH = "babel_briefings_synthetic.csv"

# --- Value pools the generator samples from ----------------------------------
# Language codes assigned to records.
LANGUAGES = [
    "en",
    "es",
    "fr",
    "de",
    "pt",
    "it",
    "nl",
    "pl",
    "ru",
    "ar",
]

# News outlets used for the "source" field.
SOURCES = [
    "Reuters",
    "BBC",
    "CNN",
    "Al Jazeera",
    "DW",
    "France24",
    "EFE",
    "ANSA",
    "Xinhua",
    "AFP",
]

# Topic labels used for the "category" field.
CATEGORIES = [
    "politics",
    "business",
    "technology",
    "sports",
    "entertainment",
    "health",
    "science",
    "world",
]

# Sample headline texts keyed by language code. Only some of the codes in
# LANGUAGES have an entry here.
HEADLINES = {
    "en": [
        "Global markets surge amid economic optimism",
        "New climate agreement reached at summit",
        "Tech giant announces major acquisition",
        "Sports team wins championship title",
        "Scientists discover high breakthrough",
    ],
    "es": [
        "Los mercados globales suben en medio del optimismo",
        "Nuevo acuerdo climático alcanzado en cumbre",
        "Gigante tecnológico anuncia adquisición importante",
    ],
    "fr": [
        "Les marchés mondiaux en hausse",
        "Nouvel accord climatique atteint au sommet",
        "Le géant de la technologie annonce une acquisition majeure",
    ],
    "de": [
        "Globale Märkte steigen bei wirtschaftlichem Optimismus",
        "Neues Klimaabkommen auf Gipfel erreicht",
        "Tech-Riese kündigt große Übernahme an",
    ],
}

def generate_synthetic_data(num_records: int) -> pd.DataFrame:
    """Build a DataFrame of randomly generated news-article records.

    Every record gets a sequential ``id`` and ``url``, a language drawn from
    ``LANGUAGES``, headline text drawn from ``HEADLINES`` for that language
    (falling back to the English list when the language has no entry), a
    source from ``SOURCES``, a category from ``CATEGORIES`` and two timestamp
    strings. Values come from the module-level ``random`` generator, so seed
    it beforehand for reproducible output.

    Args:
        num_records: Number of rows to generate. Zero or a negative value
            yields an empty frame that still carries every column.

    Returns:
        A DataFrame with one row per record and the columns ``id``,
        ``title``, ``headline``, ``source``, ``language``, ``published_at``,
        ``collected_at``, ``url`` and ``category``, in that order.
    """
    # Explicit column list so an empty result keeps the same schema as a
    # populated one (a DataFrame built from an empty list has no columns).
    columns = [
        "id",
        "title",
        "headline",
        "source",
        "language",
        "published_at",
        "collected_at",
        "url",
        "category",
    ]
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    seconds_per_day = 24 * 60 * 60

    start_date = datetime(2021, 1, 1)
    end_date = datetime(2021, 12, 31)
    # Loop-invariant: the largest day offset, which lands exactly on end_date.
    max_day_offset = (end_date - start_date).days

    records = []
    for i in range(num_records):
        lang = random.choice(LANGUAGES)
        headlines = HEADLINES.get(lang, HEADLINES["en"])

        # Random publication time in 2021. randrange() excludes its upper
        # bound, so the second-of-day offset is at most 86399 and the last
        # day can never roll over into 2022-01-01T00:00:00.
        random_days = random.randint(0, max_day_offset)
        random_seconds = random.randrange(seconds_per_day)
        published = start_date + timedelta(days=random_days, seconds=random_seconds)
        # Collection happens 1-24 hours after publication, so collected_at
        # may legitimately fall on the day after published_at.
        collected = published + timedelta(hours=random.randint(1, 24))

        records.append({
            "id": f"article_{i:08d}",
            # title and headline are sampled independently of each other.
            "title": random.choice(headlines) + f" - Story {i}",
            "headline": random.choice(headlines),
            "source": random.choice(SOURCES),
            "language": lang,
            "published_at": published.strftime(timestamp_format),
            "collected_at": collected.strftime(timestamp_format),
            "url": f"https://news.example.com/article/{i}",
            "category": random.choice(CATEGORIES),
        })

    return pd.DataFrame(records, columns=columns)

if __name__ == "__main__":
    # Generate the dataset and write it out before any helper column is added,
    # so the CSV contains only the generated fields.
    print(f"Generating {NUM_RECORDS} synthetic records...")
    df = generate_synthetic_data(NUM_RECORDS)
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"Saved to {OUTPUT_PATH}")

    # Show the first few rows as a quick sanity check.
    print(f"\nSample:\n{df.head()}")

    # Summarise how many records were published in each calendar month; the
    # derived "month" column stays on the in-memory frame only.
    print("\nDistribution by month:")
    df["month"] = pd.to_datetime(df["published_at"]).dt.month
    month_counts = df["month"].value_counts().sort_index()
    print(month_counts)