# Incremental ETL Orchestration & Storage Monitoring

**What this notebook does**  
This notebook orchestrates a daily ETL run over GDELT data, performing for each date:  
1. Routing article processing to the correct MongoDB cluster based on date.  
2. Invoking `process_articles_for_day` to ingest that day's news through the S3 → Mongo pipeline.  
3. Logging inserted vs. duplicate counts and elapsed time.  
4. Checking MongoDB storage usage per cluster.  
5. Halting once reported storage exceeds 490 MB, i.e. before the 512 MB storage limit is reached.


# 1. Imports & Environment Setup  
Import required libraries and load environment variables.


In [None]:
from datetime import datetime, timedelta
from pymongo import MongoClient
import time
import os
from dotenv import load_dotenv
from gdelt_loader import process_articles_for_day

# Load variables from a local .env file into the process environment (python-dotenv),
# so the os.getenv("MONGO_ATLAS_URI_*") lookups in the cluster configuration can resolve.
load_dotenv()


# 2. MongoDB Cluster Configuration  
Define three date‑sharded clusters with their connection URIs.


In [None]:
# Date-sharded MongoDB clusters. Each entry pairs a connection URI (read from the
# environment, so it is None when the variable is unset) with the start/end
# timestamps of the date window that is routed to that cluster.
CLUSTERS = dict(
    cluster1=dict(
        uri=os.getenv("MONGO_ATLAS_URI_1"),
        start_date="2024-01-01T00:00:00Z",
        end_date="2024-03-01T23:59:59Z",
    ),
    cluster2=dict(
        uri=os.getenv("MONGO_ATLAS_URI_2"),
        start_date="2024-03-02T00:00:00Z",
        end_date="2024-04-26T23:59:59Z",
    ),
    cluster3=dict(
        uri=os.getenv("MONGO_ATLAS_URI_3"),
        start_date="2024-04-27T00:00:00Z",
        end_date="2024-06-30T23:59:59Z",
    ),
)


# 3. Date‑Based Client Selection  
Return a new `MongoClient` for the cluster whose date window covers `date_obj`; dates outside every configured window fall back to `cluster1`.


In [None]:
def get_client_for_date(date_obj):
    """Return a new MongoClient for the cluster whose date window contains `date_obj`.

    The entries of ``CLUSTERS`` are checked in order and the first one whose
    inclusive ``start_date``..``end_date`` window contains the date is used.
    Dates outside every configured window fall back to ``cluster1``.

    Args:
        date_obj: ``datetime`` to route. Naive values are compared as-is against
            the configured timestamps; timezone-aware values are converted to
            UTC first (the configured timestamps carry a ``Z`` suffix).
            Sub-second precision is ignored so that a timestamp inside the last
            second of a window still matches that window.

    Returns:
        A freshly constructed ``MongoClient``. The caller owns it and is
        responsible for closing it.

    Raises:
        ValueError: if the selected cluster has no URI configured (its
            environment variable was unset or empty).
    """
    # Function-scope import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    stamp_format = "%Y-%m-%dT%H:%M:%SZ"

    moment = date_obj
    if moment.utcoffset() is not None:
        # Aware vs. naive datetimes cannot be ordered; normalise to naive UTC.
        moment = moment.astimezone(timezone.utc).replace(tzinfo=None)
    # Windows end at hh:mm:59 and the next starts at 00:00:00, so fractional
    # seconds would otherwise fall into the gap between two windows.
    moment = moment.replace(microsecond=0)

    # Default to first cluster when no window matches.
    chosen_name = "cluster1"
    for name, cluster in CLUSTERS.items():
        window_start = datetime.strptime(cluster["start_date"], stamp_format)
        window_end = datetime.strptime(cluster["end_date"], stamp_format)
        if window_start <= moment <= window_end:
            chosen_name = name
            break

    uri = CLUSTERS[chosen_name]["uri"]
    if not uri:
        # MongoClient(None) would silently target its default host instead of
        # the intended cluster, so fail loudly on a missing configuration.
        raise ValueError(
            f"No MongoDB URI configured for {chosen_name}; "
            "check the MONGO_ATLAS_URI_* environment variables."
        )
    return MongoClient(uri)


# 4. Storage Usage Check  
Run `dbstats` against the correct cluster to retrieve current storage size (MB).


In [None]:
def check_storage_limit(date_obj):
    """Return the current storage size, in MB, of the cluster serving `date_obj`.

    Opens a client via ``get_client_for_date``, runs the ``dbstats`` command
    against the ``gdelt_news`` database and converts the reported
    ``storageSize`` from bytes to megabytes (1 MB = 1024 * 1024 bytes).

    Despite the name, this function does not compare against any limit; the
    caller decides what to do with the returned figure.

    Args:
        date_obj: ``datetime`` used to pick the cluster.

    Returns:
        float: ``storageSize`` of ``gdelt_news`` in MB.

    Raises:
        Whatever ``get_client_for_date`` or the ``dbstats`` command raises; the
        client is closed in either case.
    """
    client = get_client_for_date(date_obj)
    try:
        stats = client["gdelt_news"].command("dbstats")
        # NOTE(review): only `storageSize` is read here; confirm this is the
        # metric the cluster's storage quota is actually measured against.
        return stats["storageSize"] / (1024 * 1024)
    finally:
        # Always release the connection, even when the command fails.
        client.close()


# 5. Daily ETL Loop  
Iterate from `start_date` to `end_date`, processing each day’s files, logging results, and checking storage.


In [None]:
def load_articles_for_date_range(bucket, start_date, end_date,
                                 storage_threshold_mb=490,
                                 log_path="etl_log.txt"):
    """Run the daily ETL for every date from `start_date` through `end_date` inclusive.

    For each day the function:
      1. calls ``process_articles_for_day(bucket, "YYYYMMDD")``,
      2. prints and appends the inserted/duplicate counts and elapsed minutes
         to the log file,
      3. queries the storage size of the cluster serving that date and stops
         the whole run once it exceeds ``storage_threshold_mb``.

    The storage check happens after a day has been processed, so the day that
    crosses the threshold is still fully loaded.

    Args:
        bucket: bucket name passed through to ``process_articles_for_day``.
        start_date: first ``datetime`` to process.
        end_date: last ``datetime`` to process (inclusive).
        storage_threshold_mb: stop once reported storage exceeds this many MB.
            Defaults to 490, leaving headroom below the 512 MB limit.
        log_path: file the per-day summary lines are appended to.

    Returns:
        None.
    """
    current_date = start_date

    while current_date <= end_date:
        date_str = current_date.strftime("%Y%m%d")
        print(f"\n🚀 Processing {date_str}...")
        day_start = time.time()

        inserted, duplicates = process_articles_for_day(bucket, date_str)

        # Elapsed wall-clock time for this day, in minutes.
        elapsed = round((time.time() - day_start) / 60, 2)
        print(f"✅ Finished {date_str}: Inserted {inserted}, Skipped {duplicates}, Time: {elapsed} min")

        # Append to ETL log (opened per day so earlier entries survive a later crash).
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"{date_str}: Inserted={inserted}, Duplicates={duplicates}, Time={elapsed} min\n")

        # Monitor storage on the cluster that serves the day just processed.
        storage_mb = check_storage_limit(current_date)
        print(f"📦 MongoDB storage used: {storage_mb:.2f} MB")
        if storage_mb > storage_threshold_mb:
            print("🛑 WARNING: Approaching 512 MB limit—stopping ETL.")
            break

        current_date += timedelta(days=1)


# 6. Main Execution  
Set bucket and date range, then run the ETL loop.


In [None]:
if __name__ == "__main__":
    # Bucket name handed through to the per-day loader.
    bucket = "gdelt-peace-speech"

    # Inclusive processing window: 2024-01-11 through 2024-10-31, both at midnight.
    # NOTE(review): the last configured cluster window ends on 2024-06-30, so storage
    # checks for later dates use the cluster1 fallback; confirm that is intended.
    start = datetime(2024, 1, 11)
    end = datetime(2024, 10, 31)

    load_articles_for_date_range(bucket=bucket, start_date=start, end_date=end)
