Cell 1: Load environment variables and connect to MongoDB

In [None]:
import os
from dotenv import load_dotenv
import pymongo

# Pull variables from a .env file into the process environment
load_dotenv()

# MongoDB connection string: a plain URI string, not a JSON blob
MONGODB_URI = os.environ.get("MONGODB_URI")

# Fail fast when the URI is missing or empty
if MONGODB_URI is None or MONGODB_URI == "":
    raise ValueError("❌ MONGODB_URI not found in .env")

# Client -> "prod" database -> "PRJ-16" collection; later cells use `collection`
client = pymongo.MongoClient(MONGODB_URI)
db = client["prod"]
collection = db["PRJ-16"]


In [None]:
# Sanity check that the connection works: display the collection's
# estimated (not exact) document count as the cell output.
collection.estimated_document_count()

In [None]:
# === Cell 4: Compute previous ISO week range (UTC safe) ===

from datetime import datetime, date, timedelta, UTC

# Current date in UTC
today = datetime.now(UTC).date()

# ISO year/week of today
iso_year, iso_week, _ = today.isocalendar()

# Previous ISO week
prev_week = iso_week - 1
prev_year = iso_year

# Handle wrap to previous year
if prev_week == 0:
    prev_year -= 1
    prev_week = date(prev_year, 12, 28).isocalendar()[1]

# Start of previous week (Monday)
start_prev_week = date.fromisocalendar(prev_year, prev_week, 1)

# Start of current week (Monday)
start_current_week = date.fromisocalendar(iso_year, iso_week, 1)

# Convert to timezone-aware datetimes
start_dt = datetime.combine(start_prev_week, datetime.min.time(), tzinfo=UTC)
end_dt   = datetime.combine(start_current_week, datetime.min.time(), tzinfo=UTC)

print(f"Today:              {today}")
print(f"Current ISO week:   {iso_year}-W{iso_week}")
print(f"Previous ISO week:  {prev_year}-W{prev_week}")
print()
print(f"Week start:         {start_dt}")
print(f"Week end:           {end_dt}  <-- start of current week")


In [None]:
# === Updated Cell 5: Daily fetch with custom weekly naming ===

from datetime import datetime, timedelta, UTC
import json

def fetch_one_day(collection, day, iso_year, iso_week, day_index):
    """
    Fetch one calendar day of documents and save them as JSONL.

    Output file format: {year}W{week}_{day_index}.jsonl
    Example: 2025W49_1.jsonl

    Args:
        collection: Object exposing ``find(query, batch_size=...)`` that returns
            an iterable cursor with a ``close()`` method (e.g. a pymongo
            collection).
        day: The day to fetch, as a ``datetime.date``. A ``datetime.datetime``
            is also accepted and reduced to its calendar date, so both query
            bounds are always well-formed ``YYYY-MM-DDT00:00:00`` strings.
        iso_year: Year component used in the output filename.
        iso_week: Week component used in the output filename.
        day_index: Day suffix used in the output filename.

    Returns:
        Tuple ``(filename, count)``: the file written (relative to the current
        working directory) and the number of documents saved.
    """

    # A datetime would render as "YYYY-MM-DD HH:MM:SS..." in the lower bound and
    # silently break the range query; keep only its calendar date.
    if isinstance(day, datetime):
        day = day.date()

    # Half-open range [day 00:00, next day 00:00) expressed as ISO8601 strings.
    # NOTE(review): comparing strings assumes "time.datetime" is stored as an
    # ISO8601 string in the documents; confirm against the collection schema.
    day_start_iso = f"{day.isoformat()}T00:00:00"
    next_day_iso  = f"{(day + timedelta(days=1)).isoformat()}T00:00:00"

    query = {
        "time.datetime": {
            "$gte": day_start_iso,
            "$lt": next_day_iso
        }
    }

    filename = f"{iso_year}W{iso_week}_{day_index}.jsonl"

    print(f"\nFetching {day} → {filename}")

    cursor = collection.find(query, batch_size=5000)

    count = 0
    try:
        # Explicit encoding so the output does not depend on the platform default.
        with open(filename, "w", encoding="utf-8") as f:
            for doc in cursor:
                # default=str stringifies values json cannot encode natively
                f.write(json.dumps(doc, default=str) + "\n")
                count += 1
    finally:
        # Release the cursor even if iteration or writing fails midway.
        cursor.close()

    print(f"Saved {count} documents → {filename}")
    return filename, count


In [None]:
# Fetch the first day (Monday) of the previous ISO week.
monday_date = start_dt.date()   # datetime.date

# NOTE: rebinds iso_year / iso_week to the previous week's values so the
# output filename carries the week being fetched.
iso_year = prev_year
iso_week = prev_week

# Derive the filename's day suffix from the date itself (ISO weekday, Monday == 1).
# It was hard-coded to 7, which labelled Monday's data as day 7.
day_idx  = monday_date.isoweekday()

monday_file, monday_count = fetch_one_day(
    collection,
    monday_date,
    iso_year,
    iso_week,
    day_idx
)


In [None]:
# === Inspect sample document ===
# Fetch a single document (no filter) and print it to see the field layout.
# The result may be None if the collection is empty.
# NOTE(review): this reads from the "prod" database; check the printed output
# for sensitive data before sharing or committing the notebook.
sample = collection.find_one()
print(sample)
