# Fetch Metadata from GDELT

This notebook demonstrates how to query and fetch metadata from the [GDELT Project](https://www.gdeltproject.org/).  
It includes simple steps to define keywords, time periods, and locations, and then retrieve structured metadata for further analysis.  

## How to use
1. Run the setup cells to load the required libraries.  
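2. Set your search parameters (`KEYWORD`, `COUNTRY`, `START_DATE`, `END_DATE`) in the configuration section of the script cell.  
3. Run the script cell; its `main()` function retrieves the metadata from GDELT one day at a time.  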
4. Save or analyze the results directly within the notebook.  

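> Tip: This notebook is written for [Google Colab](https://colab.research.google.com/): the setup cell mounts Google Drive and the resulting CSV files are copied there. To run it elsewhere, skip the Drive-mount cell and point `GDRIVE_DEST` at a local folder.  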


In [None]:
# ========================================
#  SETUP: Google Drive & Installations
# ========================================
# Colab-only step: mount Google Drive at /content/drive so that files copied
# to a path under that mount point (see GDRIVE_DEST in the fetch script) end
# up in the user's Drive.
from google.colab import drive
drive.mount('/content/drive')


In [5]:
# Install the gdeltdoc package, which provides the GdeltDoc and Filters
# classes imported by the fetch script.
!pip install gdeltdoc



In [6]:
"""
gdelt_fetch_exclusive_end.py
----------------------------
A script to fetch news articles from the GDELT Doc API based on:
- A keyword (e.g., "Climate Change")
- A specific country filter (e.g., "UK")
- A date range (start_date -> end_date, inclusive)
Uses an exclusive daily fetch approach (end_date = start_date + 1 day)

Features:
 - Logging + print statements for day-by-day fetch info
 - Rate limiting (REQUEST_PAUSE_SECONDS between requests, plus LONG_PAUSE_DURATION after every LONG_PAUSE_AFTER_REQUESTS requests)
 - Retry logic for transient API failures
 - Monthly partial saves & final save (all with "gdelt" prefix)
 - Graceful handling of 0-article days
"""

import logging
import time
import shutil
from datetime import date, timedelta
import pandas as pd
from gdeltdoc import GdeltDoc, Filters
import re

# -------------------------------------------------------------------------------
# USER-CONFIGURABLE PARAMETERS
# -------------------------------------------------------------------------------
# Search term, passed to Filters(keyword=...) for every daily query.
KEYWORD = "volleyball"
# Passed to Filters(country=...) for every daily query.
# NOTE(review): confirm the accepted format (country name vs. country code)
# against the gdeltdoc documentation.
COUNTRY = "Italy"
START_DATE = date(2025, 9, 1)   # inclusive start date
END_DATE = date(2025, 9, 10)    # inclusive end date

# Passed to Filters(num_records=...) for each single-day query.
MAX_RECORDS_PER_DAY = 250
# Folder that the monthly and final CSV files are copied into.
GDRIVE_DEST = "/content/drive/MyDrive/sport/"

# Rate-limit constants
# Seconds slept after every daily query.
REQUEST_PAUSE_SECONDS = 1
# An extra pause is taken whenever the request count is a multiple of this
# value. NOTE(review): with a value of 1 the extra pause fires after every
# single request.
LONG_PAUSE_AFTER_REQUESTS = 1
# Length of that extra pause, in seconds.
LONG_PAUSE_DURATION = 1

# Retry logic
# Total number of attempts per daily query (first try included).
MAX_RETRIES = 2
# Seconds slept between a failed attempt and the next one.
RETRY_SLEEP_SECONDS = 2

# -------------------------------------------------------------------------------
def generate_date_list(start_d: date, end_d: date):
    """
    Build the list of consecutive days from start_d through end_d.

    Both endpoints are included. The result is empty when start_d is later
    than end_d.
    """
    # Number of days in the inclusive span; zero or negative yields no days.
    span_days = (end_d - start_d).days + 1
    return [start_d + timedelta(days=offset) for offset in range(span_days)]

def clean_filename_part(text: str) -> str:
    """
    Turn text into a fragment that is safe to embed in a filename.

    Surrounding whitespace is trimmed, inner spaces become underscores, and
    every remaining character that is not a word character, hyphen, or
    underscore is dropped.
    """
    underscored = text.strip().replace(" ", "_")
    return re.sub(r"[^\w\-_]+", "", underscored)

def fetch_articles_for_date(
    gd: GdeltDoc,
    keyword: str,
    country: str,
    start_str: str,
    end_str: str,
    max_records: int
) -> "pd.DataFrame | None":
    """
    Fetch articles from GDELT for one date window, retrying on failure.

    The window is intended as half-open, [start_str, end_str); whether the
    API itself excludes end_str is not verified here.

    Args:
        gd: Client whose article_search() performs the query.
        keyword: Search term for the query.
        country: Country filter for the query.
        start_str: Start of the window (the caller passes "YYYY-MM-DD").
        end_str: End of the window (the caller passes "YYYY-MM-DD").
        max_records: Forwarded as the num_records filter.

    Returns:
        Whatever gd.article_search() returns on the first successful attempt,
        or None once MAX_RETRIES attempts have all raised.
    """
    # Built once, outside the retry loop: an error raised while constructing
    # the filters is not retried and propagates to the caller.
    filters = Filters(
        keyword=keyword,
        start_date=start_str,
        end_date=end_str,
        num_records=max_records,
        country=country
    )

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return gd.article_search(filters)
        except Exception as e:
            # Any failure is treated as transient: log it and try again
            # rather than aborting the whole multi-day run.
            logging.error(
                f"[fetch_articles_for_date] Attempt {attempt}/{MAX_RETRIES} failed. "
                f"Date range: {start_str} -> {end_str}, keyword='{keyword}', country='{country}'\n"
                f"Error: {e}"
            )
            if attempt < MAX_RETRIES:
                logging.info(f"Retrying after {RETRY_SLEEP_SECONDS}s...")
                print(f"Retrying after {RETRY_SLEEP_SECONDS}s...")
                time.sleep(RETRY_SLEEP_SECONDS)

    # Reached only when no attempt succeeded (including MAX_RETRIES < 1), so
    # the skip is always reported instead of returning None silently.
    logging.error(
        f"All {MAX_RETRIES} attempts failed for {start_str} -> {end_str}. Skipping."
    )
    print(f"All {MAX_RETRIES} attempts failed for {start_str} -> {end_str}. Skipping.")
    return None

def save_to_drive(local_filename: str, drive_folder: str):
    """
    Copy a local file into a Google Drive folder using shutil.copy.

    Best-effort: any failure is logged and printed, never raised.

    Args:
        local_filename: Path of the file to copy.
        drive_folder: Destination folder. If it does not exist yet, it is
            created, provided its parent directory exists.
    """
    import os  # local import keeps this block self-contained

    try:
        # Without this, shutil.copy would treat a missing folder path as a
        # destination *file* name, so every copy would overwrite the previous
        # one. os.mkdir is deliberately non-recursive: if the parent is
        # missing (e.g. Drive is not mounted) this fails and is reported
        # below instead of quietly building a local directory tree.
        if not os.path.isdir(drive_folder):
            os.mkdir(drive_folder)
        shutil.copy(local_filename, drive_folder)
        logging.info(f"File '{local_filename}' copied to '{drive_folder}'.")
        print(f"File '{local_filename}' copied to Google Drive folder: {drive_folder}")
    except Exception as e:
        logging.error(f"Error copying file to Google Drive: {e}")
        print(f"Error copying file to Google Drive: {e}")

def _log_and_print(msg: str, level: int = logging.INFO) -> None:
    """
    Send msg to the root logger at the given level and echo it with print.
    """
    logging.log(level, msg)
    print(msg)

def main():
    """
    Run the whole fetch: one query per day, monthly partial CSVs, one final CSV.

    Steps:
     - Build the inclusive list of days from START_DATE to END_DATE.
     - Query each day as (day -> day + 1) and keep the non-empty results.
     - Sleep REQUEST_PAUSE_SECONDS after every query, plus LONG_PAUSE_DURATION
       after every LONG_PAUSE_AFTER_REQUESTS queries (the extra pause is
       disabled when LONG_PAUSE_AFTER_REQUESTS is 0 or negative).
     - At each month boundary and on the last day, de-duplicate that month's
       rows on (url, seendate), write a monthly CSV and copy it to GDRIVE_DEST.
     - Finally de-duplicate all rows, write the combined CSV and copy it too.

    Returns early, without writing a final CSV, when no day produced any rows.
    """

    # Set up logging.
    # NOTE(review): basicConfig does nothing if the root logger already has
    # handlers, which is presumably why every message is also printed.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Build short "safe" versions for filenames
    safe_keyword = clean_filename_part(KEYWORD)
    safe_country = clean_filename_part(COUNTRY)
    date_range_label = f"{START_DATE.strftime('%Y%m%d')}_to_{END_DATE.strftime('%Y%m%d')}"

    # Final CSV name, including "gdelt" prefix
    final_csv_filename = f"gdelt_{safe_keyword}_{date_range_label}_{safe_country}.csv"

    _log_and_print("Starting GDELT Fetch Script...")

    gd = GdeltDoc()
    dates_to_fetch = generate_date_list(START_DATE, END_DATE)
    total_days = len(dates_to_fetch)

    _log_and_print(
        f"Fetching data from {START_DATE} to {END_DATE} (inclusive), "
        f"keyword='{KEYWORD}', country='{COUNTRY}'. Total days: {total_days}"
    )

    all_daily_data = []   # every non-empty daily frame, for the final CSV
    monthly_data = []     # frames of the month in progress, reset after each save
    request_count = 0     # one per day queried; retries are not counted
    current_month = START_DATE.month

    # Loop over dates
    for idx, single_date in enumerate(dates_to_fetch, start=1):
        start_str = single_date.strftime("%Y-%m-%d")
        next_day = single_date + timedelta(days=1)
        end_str = next_day.strftime("%Y-%m-%d")

        _log_and_print(f"[Day {idx}/{total_days}] Querying {start_str} -> {end_str} (exclusive).")

        # Fetch
        day_df = fetch_articles_for_date(
            gd=gd,
            keyword=KEYWORD,
            country=COUNTRY,
            start_str=start_str,
            end_str=end_str,
            max_records=MAX_RECORDS_PER_DAY
        )
        request_count += 1

        # Evaluate result: None means every attempt failed, empty means no hits.
        if day_df is None or day_df.empty:
            _log_and_print(
                f"No data returned for {start_str}. (Day {idx} of {total_days})",
                logging.WARNING
            )
        else:
            _log_and_print(f"Day {start_str}: fetched {len(day_df)} articles.")
            all_daily_data.append(day_df)
            monthly_data.append(day_df)

        # Rate-limit pause (the log and print wording differ, so they are
        # emitted separately).
        logging.info(f"Pausing {REQUEST_PAUSE_SECONDS}s to respect rate limits...")
        print(f"Pausing {REQUEST_PAUSE_SECONDS}s...")
        time.sleep(REQUEST_PAUSE_SECONDS)

        # The > 0 guard lets a user set LONG_PAUSE_AFTER_REQUESTS = 0 to turn
        # the extra pause off instead of hitting a ZeroDivisionError.
        if LONG_PAUSE_AFTER_REQUESTS > 0 and request_count % LONG_PAUSE_AFTER_REQUESTS == 0:
            _log_and_print(
                f"Reached {request_count} requests. Extra pause {LONG_PAUSE_DURATION}s..."
            )
            time.sleep(LONG_PAUSE_DURATION)

        # Check if we crossed a month boundary or are at the end
        if (next_day.month != current_month) or (single_date == END_DATE):
            if monthly_data:
                # Combine monthly portion
                month_df = pd.concat(monthly_data, ignore_index=True)
                before_dedup = len(month_df)
                month_df.drop_duplicates(subset=["url", "seendate"], inplace=True)
                after_dedup = len(month_df)

                _log_and_print(
                    f"Monthly dedup for month={current_month}: "
                    f"removed {before_dedup - after_dedup}, final count={after_dedup}."
                )

                # single_date is still inside the month being closed, so its
                # year is the right one for the filename.
                partial_filename = (
                    f"gdelt_{safe_keyword}_"
                    f"{single_date.strftime('%Y')}_"
                    f"{str(current_month).zfill(2)}_"
                    f"{safe_country}.csv"
                )
                month_df.to_csv(partial_filename, index=False)
                _log_and_print(f"Saved monthly file '{partial_filename}'.")

                save_to_drive(partial_filename, GDRIVE_DEST)

                monthly_data = []

            current_month = next_day.month

    if not all_daily_data:
        _log_and_print("No articles fetched at all for the specified range.", logging.WARNING)
        return

    # Combine final
    combined_df = pd.concat(all_daily_data, ignore_index=True)
    pre_final = len(combined_df)
    combined_df.drop_duplicates(subset=["url", "seendate"], inplace=True)
    post_final = len(combined_df)
    _log_and_print(
        f"Final dedup removed {pre_final - post_final}. "
        f"Final total: {post_final} articles."
    )

    # Save final CSV
    combined_df.to_csv(final_csv_filename, index=False)
    _log_and_print(f"Saved final CSV to '{final_csv_filename}'.")

    save_to_drive(final_csv_filename, GDRIVE_DEST)

    _log_and_print("All done!")

# -------------------------------------------------------------------------------
# Run the fetch only when this code is executed directly (as a script or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Starting GDELT Fetch Script...
Fetching data from 2025-09-01 to 2025-09-10 (inclusive), keyword='volleyball', country='Italy'. Total days: 10
[Day 1/10] Querying 2025-09-01 -> 2025-09-02 (exclusive).
Day 2025-09-01: fetched 31 articles.
Pausing 1s...
Reached 1 requests. Extra pause 1s...
[Day 2/10] Querying 2025-09-02 -> 2025-09-03 (exclusive).
Day 2025-09-02: fetched 27 articles.
Pausing 1s...
Reached 2 requests. Extra pause 1s...
[Day 3/10] Querying 2025-09-03 -> 2025-09-04 (exclusive).
Day 2025-09-03: fetched 36 articles.
Pausing 1s...
Reached 3 requests. Extra pause 1s...
[Day 4/10] Querying 2025-09-04 -> 2025-09-05 (exclusive).
Day 2025-09-04: fetched 49 articles.
Pausing 1s...
Reached 4 requests. Extra pause 1s...
[Day 5/10] Querying 2025-09-05 -> 2025-09-06 (exclusive).
Day 2025-09-05: fetched 38 articles.
Pausing 1s...
Reached 5 requests. Extra pause 1s...
[Day 6/10] Querying 2025-09-06 -> 2025-09-07 (exclusive).
Day 2025-09-06: fetched 94 articles.
Pausing 1s...
Reached 6 requ