### Imports and Configs

In [None]:
from pathlib import Path
import pandas as pd
from datetime import datetime
from src.gdelt_pipeline.config import load_config
from src.gdelt_pipeline.utils import daterange_chunks, ensure_dir
from src.gdelt_pipeline.gdelt_api import events_query
from src.gdelt_pipeline.io import save_csv

# Load the pipeline settings and project paths, then read the country table
# stored at <root>/config/countries.csv. Later cells read its `country_name`
# and `iso3` columns.
settings, paths = load_config()
countries_csv = paths.root / "config" / "countries.csv"
countries = pd.read_csv(countries_csv)
# Show the first rows as the cell output.
countries.head()

#### Helpers

In [None]:
def dtfmt(dt):
    """Render *dt* as a compact ``YYYYMMDDHHMMSS`` string.

    Args:
        dt: An object exposing ``strftime`` (e.g. a ``datetime``).

    Returns:
        The 14-digit timestamp: year, month, day, hour, minute, second,
        with no separators.
    """
    stamp_layout = "%Y%m%d%H%M%S"
    return dt.strftime(stamp_layout)

In [None]:
from time import sleep

# Download article lists for every country in `countries`, one request per
# date chunk, and write each non-empty result to its own CSV file under
# paths.artlist_dir.
ensure_dir(paths.artlist_dir)

for _, row in countries.iterrows():
    country = row["country_name"]
    print(f"=== {country} ===")
    for s, e in daterange_chunks(settings.startdate, settings.enddate, settings.chunk_days):
        # NOTE(review): the country name is interpolated unquoted; multi-word
        # names may need quoting to match as a phrase -- confirm against
        # events_query and the GDELT query syntax.
        df = events_query(
            query=f'({settings.query_template}) AND {country}',
            # Widen the chunk to whole days: 00:00:00 on the first day
            # through 23:59:59 on the last.
            startdatetime=dtfmt(s.replace(hour=0, minute=0, second=0)),
            enddatetime=dtfmt(e.replace(hour=23, minute=59, second=59)),
            mode=settings.mode,
            maxrecords=settings.maxrecords,
            fmt=settings.format,
            timeout=settings.timeout_seconds,
        )
        if df is not None and not df.empty:
            # Normalize common column names (GDELT artlist returns e.g.
            # url, title, domain, language, ...).
            df = df.rename(columns=str.lower)
            # Write one file per chunk: <iso3>_<start date>_<end date>.csv
            out = paths.artlist_dir / f"{row['iso3']}_{s.date()}_{e.date()}.csv"
            save_csv(df, out)
        # Pause after every request, including ones that returned nothing,
        # so runs of empty chunks do not fire back-to-back API calls.
        sleep(settings.sleep_seconds_between_calls)


In [None]:
from glob import glob

# Combine every per-chunk artlist CSV into a single interim file.
# glob() returns paths in arbitrary order; sorting keeps the row order of the
# combined file reproducible between runs.
files = sorted(glob(str(paths.artlist_dir / "*.csv")))
if not files:
    # pd.concat([]) would otherwise fail with an opaque
    # "No objects to concatenate" message.
    raise ValueError(f"No CSV files found in {paths.artlist_dir}; nothing to concatenate")
dfs = [pd.read_csv(f) for f in files]
art = pd.concat(dfs, ignore_index=True)
# GDELT artlist is expected to deliver the URL column as `url` already, so no
# column rename is applied here.
# Make sure the output directory exists, as is done for paths.artlist_dir.
ensure_dir(paths.interim_dir)
out = paths.interim_dir / "artlist_all.csv"
save_csv(art, out)
# Show the total number of combined rows as the cell output.
len(art)
