In [1]:
!pip install pymongo pyarrow python-dateutil




In [8]:
import os
USER = "mongouser"
PWD  = "mongopassword"
HOST = "mongodb"
DB   = "taxi_logs"   # your target DB for writes

# Try admin as the auth DB (common when the root user was created via MONGO_INITDB_* envs)
os.environ["MONGO_URL"] = f"mongodb://{USER}:{PWD}@{HOST}:27017/{DB}?authSource=admin"
print(os.environ["MONGO_URL"])

mongodb://mongouser:mongopassword@mongodb:27017/taxi_logs?authSource=admin


In [9]:
from pymongo import MongoClient
client = MongoClient(os.environ["MONGO_URL"])
print(client.admin.command("ping"))  # should print {'ok': 1.0}

{'ok': 1.0}


In [None]:
# parquet_to_mongo.py
import os
from datetime import timezone
from pymongo import MongoClient, InsertOne
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.compute as pc
from dateutil import parser as dtparse
import re

os.environ["PARQUET_ROOT"] = "/home/jovyan/work/data/nyc-taxi/partitioned/year=2019"

# If Mongo has no auth:
os.environ["MONGO_URL"] = "mongodb://mongouser:mongopassword@mongodb:27017/taxi_logs?authSource=admin"


MONGO_URL = os.getenv("MONGO_URL", "mongodb://localhost:27017/")
PARQUET_ROOT = os.getenv("PARQUET_ROOT", "/data/taxi")  # e.g., mounted host path or HDFS copy to local
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "5000"))

client = MongoClient(MONGO_URL)
db = client["taxi_logs"]
col = db["trips"]

# Define a tiny transformer from an Arrow batch to list[dict]
def batch_to_docs(batch: pa.RecordBatch, partition_values: dict[str, str] | None = None):
    # Columns expected in NYC taxi parquet
    cols = {name: batch.column(i) for i, name in enumerate(batch.schema.names)}
    # Some datasets have timestamps as strings; some as timestamp types. Normalize to ISO strings then to datetimes.
    pick_col = cols.get("tpep_pickup_datetime")
    drop_col = cols.get("tpep_dropoff_datetime")
    pu_col   = cols.get("PULocationID")
    do_col   = cols.get("DOLocationID")
    pc_col   = cols.get("passenger_count")
    fare_col = cols.get("fare_amount")

    n = batch.num_rows
    docs = []
    for i in range(n):
        # Extract values safely (handle nulls)
        # New:
        def get_value(col, i):
            # Returns None automatically for nulls
            if col is None:
                return None
            return col[i].as_py()
        
        def to_dt_utc(x):
            # Normalize to timezone-aware UTC
            if x is None:
                return None
            # x can be a Python datetime (from Arrow) or a string
            from datetime import datetime, timezone
            if isinstance(x, str):
                from dateutil import parser as dtparse
                dt = dtparse.parse(x)
            else:
                dt = x
            if getattr(dt, "tzinfo", None) is None:
                dt = dt.replace(tzinfo=timezone.utc)
            else:
                dt = dt.astimezone(timezone.utc)
            return dt

        doc = {
            "pickup": {
                "time": to_dt_utc(get_value(pick_col, i)),
                "location_id": get_value(pu_col, i)
            },
            "dropoff": {
                "time": to_dt_utc(get_value(drop_col, i)),
                "location_id": get_value(do_col, i)
            },
            "passenger_count": get_value(pc_col, i),
            "fare_amount": get_value(fare_col, i),
        }


        # Optional: attach partition info for traceability (e.g., year/month)
        if partition_values:
            doc.setdefault("meta", {})["partition"] = partition_values

        docs.append(doc)
    return docs

def main():
    dataset = ds.dataset(PARQUET_ROOT, format="parquet", partitioning="hive")  # understands year=2019/month=01/...

    # Iterate by fragments (files/partitions) → smaller Arrow tables → record batches
    for fragment in dataset.get_fragments():
        # Extract year/month from a path like .../year=2019/month=01/....
        m = re.search(r"year=(\d{4})/month=(\d{2})", fragment.path)
        part_vals = {"year": m.group(1), "month": m.group(2)} if m else None
    
        table = fragment.to_table()  # or .to_table(columns=[...]) to project fewer cols
        for batch in table.to_batches(max_chunksize=BATCH_SIZE):
            docs = batch_to_docs(batch, partition_values=part_vals)
            if docs:
                col.bulk_write([InsertOne(d) for d in docs], ordered=False)
                print(f"Inserted {len(docs)} docs from {fragment.path}")

if __name__ == "__main__":
    main()


In [4]:
# import os, glob, itertools

# BASES = [
#     "/home/jovyan/work/data",
#     "/data/taxi",                          # in case you also mounted this
# ]

# print("Checking common bases…")
# for b in BASES:
#     print(b, "exists?" , os.path.exists(b))

# # Find any parquet files (limit output)
# candidates = []
# for base in BASES:
#     for p in itertools.islice(glob.iglob(base + "/**/*.parquet", recursive=True), 50):
#         candidates.append(p)
# print(f"\nFound {len(candidates)} parquet files (showing up to 50):")
# for p in candidates[:50]:
#     print(" -", p)


In [7]:
#!/usr/bin/env python3
from datetime import datetime, timedelta, timezone
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, AutoReconnect
import time

# ------------ toggles ------------
SKIP_CREATE_TIME_INDEX = True  # build at the very end instead
USE_MANUAL_RANGE = True        # start small to see output quickly
MANUAL_MIN = datetime(2019, 1, 1, tzinfo=timezone.utc)
MANUAL_MAX = datetime(2019, 1, 2, tzinfo=timezone.utc)  # just 1 day
# MANUAL_MIN = datetime(2019, 1, 1, tzinfo=timezone.utc)
# MANUAL_MAX = datetime(2019, 2, 1, tzinfo=timezone.utc)  # one month
CREATE_UNIQUE_INDEX = True
# ---------------------------------

MONGO_URL = "mongodb://mongouser:mongopassword@mongodb:27017/taxi_logs?authSource=admin"
DB_NAME, COLL_NAME = "taxi_logs", "trips"

print("Connecting…", flush=True)
client = MongoClient(
    MONGO_URL,
    serverSelectionTimeoutMS=60000,
    connectTimeoutMS=60000,
    socketTimeoutMS=0,
    retryWrites=True,
    retryReads=True
)
col = client[DB_NAME][COLL_NAME]

def with_retries(fn, what, attempts=5):
    for i in range(attempts):
        try:
            return fn()
        except AutoReconnect as e:
            sleep = 1.5 * (2 ** i)
            print(f"⚠️  AutoReconnect during {what}. Retry {i+1}/{attempts} in {sleep:.1f}s …", flush=True)
            time.sleep(sleep)
    # last try without catching to surface the error
    return fn()

def utc(dt):  # ensure tz-aware
    return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)

def month_bounds(start_dt, end_dt):
    start = utc(datetime(start_dt.year, start_dt.month, 1))
    cur = start
    while cur < end_dt:
        nxt = utc(datetime(cur.year + 1, 1, 1)) if cur.month == 12 else utc(datetime(cur.year, cur.month + 1, 1))
        yield cur, min(nxt, end_dt)
        cur = nxt

def get_time_range():
    print("Computing min/max pickup.time (can be slow on large collections)…", flush=True)
    pipe = [
        {"$match": {"pickup.time": {"$type": "date"}}},
        {"$group": {"_id": None, "minT": {"$min": "$pickup.time"}, "maxT": {"$max": "$pickup.time"}}}
    ]
    agg = with_retries(lambda: list(col.aggregate(pipe, allowDiskUse=True)), "aggregate(min/max)")
    if not agg or agg[0]["minT"] is None or agg[0]["maxT"] is None:
        raise SystemExit("No documents with pickup.time as a date were found.")
    # make max exclusive by adding 1 day to ensure final month closes
    return agg[0]["minT"], agg[0]["maxT"] + timedelta(days=1)

def count_month(start, end):
    return with_retries(lambda: col.count_documents({"pickup.time": {"$gte": start, "$lt": end}}),
                        "count_documents(month)")

def count_missing_time_bucket(start, end):
    q = {"pickup.time": {"$gte": start, "$lt": end, "$type": "date"},
         "meta.time_bucket": {"$exists": False}}
    return with_retries(lambda: col.count_documents(q), "count TB_missing")

def count_missing_trip_id(start, end):
    q = {"pickup.time": {"$gte": start, "$lt": end}, "trip_id": {"$exists": False}}
    return with_retries(lambda: col.count_documents(q), "count ID_missing")

def backfill_time_bucket(start, end):
    filt = {"pickup.time": {"$gte": start, "$lt": end, "$type": "date"},
            "meta.time_bucket": {"$exists": False}}
    t0 = time.perf_counter()
    res = with_retries(lambda: col.update_many(
        filt,
        [{"$set": {"meta.time_bucket": {"$dateTrunc": {"date": "$pickup.time", "unit": "hour"}}}}]
    ), "update_many(time_bucket)")
    return res.matched_count, res.modified_count, time.perf_counter() - t0

def backfill_trip_id(start, end):
    filt = {"pickup.time": {"$gte": start, "$lt": end}, "trip_id": {"$exists": False}}
    t0 = time.perf_counter()
    res = with_retries(lambda: col.update_many(
        filt,
        [{"$set": {"trip_id": {
            "$concat": [
                {"$dateToString": {"date": "$pickup.time", "format": "%Y-%m-%dT%H:%M:%S.%LZ", "timezone": "UTC"}},
                ":", {"$toString": {"$ifNull": ["$pickup.location_id", "na"]}},
                ":", {"$toString": {"$ifNull": ["$dropoff.location_id", "na"]}},
                ":", {"$toString": {"$ifNull": ["$fare_amount", "na"]}}
            ]}}}]
    ), "update_many(trip_id)")
    return res.matched_count, res.modified_count, time.perf_counter() - t0

def main():
    # 0) (Optional) defer index until the end so we see output sooner
    if not SKIP_CREATE_TIME_INDEX:
        print("Creating index on pickup.time …", flush=True)
        with_retries(lambda: col.create_index([("pickup.time", 1)], name="idx_pickup_time"),
                     "create_index(pickup.time)")
        print("Index on pickup.time created.", flush=True)

    # 1) Determine window(s)
    if USE_MANUAL_RANGE:
        minT, maxT = MANUAL_MIN, MANUAL_MAX
        print(f"Using manual window: {minT.isoformat()} → {maxT.isoformat()}", flush=True)
    else:
        minT, maxT = get_time_range()
        print(f"Discovered window:   {minT.isoformat()} → {maxT.isoformat()}", flush=True)

    print("\n=== Month-by-month Backfill (by pickup.time) ===", flush=True)
    print("time_bucket → rounds pickup.time to the hour for faster grouping", flush=True)
    print("trip_id     → synthetic unique key to prevent duplicates\n", flush=True)
    print(f"{'MonthStart(UTC)':<20}{'Total':>10} | "
          f"{'TB_missing':>11} -> {'TB_mod':>7} ({'s':>5}) | "
          f"{'ID_missing':>11} -> {'ID_mod':>7} ({'s':>5})", flush=True)

    grand_total = grand_tb_mod = grand_id_mod = 0
    job_t0 = time.perf_counter()

    for mstart, mend in month_bounds(minT, maxT):
        total = count_month(mstart, mend)
        grand_total += total
        if total == 0:
            print(f"{mstart.isoformat():<20}{0:>10} | {0:>11} -> {0:>7} ({0:>5}) | {0:>11} -> {0:>7} ({0:>5})",
                  flush=True)
            continue

        tb_miss = count_missing_time_bucket(mstart, mend)
        tb_matched, tb_mod, tb_s = backfill_time_bucket(mstart, mend)

        id_miss = count_missing_trip_id(mstart, mend)
        id_matched, id_mod, id_s = backfill_trip_id(mstart, mend)

        grand_tb_mod += tb_mod
        grand_id_mod += id_mod

        print(f"{mstart.isoformat():<20}{total:>10} | "
              f"{tb_miss:>11} -> {tb_mod:>7} ({tb_s:>5.1f}) | "
              f"{id_miss:>11} -> {id_mod:>7} ({id_s:>5.1f})",
              flush=True)

    # 2) Build unique index at the end (with retry)
    if CREATE_UNIQUE_INDEX:
        print("\nCreating partial unique index on trip_id …", flush=True)
        try:
            with_retries(lambda: col.create_index(
                [("trip_id", 1)],
                unique=True,
                partialFilterExpression={"trip_id": {"$type": "string"}},
                name="uniq_trip_id_partial"
            ), "create_index(trip_id unique)")
            print("Unique index created on trip_id (partial).", flush=True)
        except DuplicateKeyError as e:
            print("⚠️  DuplicateKeyError while creating unique index. Investigate duplicates.", flush=True)
            print(e, flush=True)

    elapsed = time.perf_counter() - job_t0
    print("\n=== Summary ===", flush=True)
    print(f"Docs scanned (sum of month totals): {grand_total:,}", flush=True)
    print(f"time_bucket modified: {grand_tb_mod:,}", flush=True)
    print(f"trip_id modified:    {grand_id_mod:,}", flush=True)
    print(f"Elapsed: {elapsed/60:.1f} minutes", flush=True)

main()


Connecting…
Using manual window: 2019-01-01T00:00:00+00:00 → 2019-01-02T00:00:00+00:00

=== Month-by-month Backfill (by pickup.time) ===
time_bucket → rounds pickup.time to the hour for faster grouping
trip_id     → synthetic unique key to prevent duplicates

MonthStart(UTC)          Total |  TB_missing ->  TB_mod (    s) |  ID_missing ->  ID_mod (    s)
2019-01-01T00:00:00+00:00    189432 |           0 ->       0 (  0.4) |      189432 ->  189432 (  7.4)

Creating partial unique index on trip_id …
⚠️  DuplicateKeyError while creating unique index. Investigate duplicates.
Index build failed: 26362b9a-1aac-40ee-82e7-93011b855bee: Collection taxi_logs.trips ( 81f13904-2100-4505-8de9-a4728b198a32 ) :: caused by :: E11000 duplicate key error collection: taxi_logs.trips index: uniq_trip_id_partial dup key: { trip_id: "2019-01-01T00:25:12.000Z:141:263:5" }, full error: {'ok': 0.0, 'errmsg': 'Index build failed: 26362b9a-1aac-40ee-82e7-93011b855bee: Collection taxi_logs.trips ( 81f13904-2100-4

In [16]:
from pymongo import MongoClient, ASCENDING, GEOSPHERE
import time, sys, threading

MONGO_URI = "mongodb://mongouser:mongopassword@mongodb:27017/taxi_logs?authSource=admin"
DB_NAME = "taxi_logs"
COLL_NAME = "trips"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
c = db[COLL_NAME]

# ---- UI helpers ----
def draw_bar(done, total, label):
    if total and total > 0:
        frac = min(1.0, float(done) / float(total))
        bar_len = 30
        filled = int(bar_len * frac)
        bar = "█" * filled + "-" * (bar_len - filled)
        sys.stdout.write(f"\r[{bar}] {done}/{total}  {label}")
    else:
        # Unknown total => show count only
        sys.stdout.write(f"\r… {done} (scanned)  {label}")
    sys.stdout.flush()

def spinner_frame(i):
    return ["|","/","-","\\"][i % 4]

def poll_index_progress(coll, label, stop_event):
    """
    Poll $currentOp for the running createIndexes on this collection.
    If progress is present, render bar; otherwise render spinner.
    """
    i = 0
    dbname = coll.database.name
    collname = coll.name

    # Build a small helper to fetch currentOp rows for this createIndexes
    def get_ops():
        # Use aggregation $currentOp (works across versions, avoids deprecated currentOp)
        # Filter for this DB's $cmd namespace and createIndexes on our collection
        pipeline = [
            {"$currentOp": {"allUsers": True, "idleConnections": False}},
            {"$match": {
                "ns": f"{dbname}.$cmd",
                "command.createIndexes": collname
            }}
        ]
        try:
            return list(client.admin.aggregate(pipeline))
        except Exception:
            return []

    last_output = ""
    while not stop_event.is_set():
        ops = get_ops()
        rendered = False
        for op in ops:
            # Try to read progress.{done,total} if available
            prog = op.get("progress") or {}
            done = prog.get("done")
            total = prog.get("total")
            if done is not None:
                draw_bar(done, total, f"building {label}")
                rendered = True
                break

        if not rendered:
            # No progress exposed -> spinner heartbeat
            sys.stdout.write("\r" + spinner_frame(i) + f" building {label}")
            sys.stdout.flush()
            i += 1

        time.sleep(0.15)

    # Clear the line when stopping
    sys.stdout.write("\r" + " " * 80 + "\r")
    sys.stdout.flush()

# ---- Index build driver ----
indexes = [
    ([("pickup.time", ASCENDING)], "pickup.time"),
    ([("dropoff.time", ASCENDING)], "dropoff.time"),
    ([("pickup.location_id", ASCENDING), ("pickup.time", ASCENDING)], "pickup.location_id + pickup.time"),
    ([("dropoff.location_id", ASCENDING), ("dropoff.time", ASCENDING)], "dropoff.location_id + dropoff.time"),
    ([("meta.time_bucket", ASCENDING)], "meta.time_bucket"),
    ([("pickup.loc", GEOSPHERE)], "pickup.loc (geospatial)"),
]

total = len(indexes)
for i, (spec, label) in enumerate(indexes, start=1):
    stop_event = threading.Event()
    t = threading.Thread(target=poll_index_progress, args=(c, label, stop_event))
    t.start()

    try:
        # Kick off the index build (blocking)
        name = c.create_index(spec)
    finally:
        # Stop the progress poller
        stop_event.set()
        t.join()

    sys.stdout.write(f"[✓] {i}/{total} {label} created (name: {name}).\n")
    sys.stdout.flush()

print("All indexes created.")


[✓] 1/6 pickup.time created (name: pickup.time_1).                              
[✓] 2/6 dropoff.time created (name: dropoff.time_1).                            
[✓] 3/6 pickup.location_id + pickup.time created (name: pickup.location_id_1_pickup.time_1).
[✓] 4/6 dropoff.location_id + dropoff.time created (name: dropoff.location_id_1_dropoff.time_1).
[✓] 5/6 meta.time_bucket created (name: meta.time_bucket_1).                    
[✓] 6/6 pickup.loc (geospatial) created (name: pickup.loc_2dsphere).            
All indexes created.
/ Creating: pickup.location_id + pickup.time

In [10]:
from pymongo import MongoClient
from pprint import pprint

client = MongoClient("mongodb://mongouser:mongopassword@mongodb:27017/taxi_logs?authSource=admin")
c = client.taxi_logs.trips

expected = {
    "pickup.time_1": [("pickup.time", 1)],
    "dropoff.time_1": [("dropoff.time", 1)],
    "pickup.location_id_1_pickup.time_1": [("pickup.location_id", 1), ("pickup.time", 1)],
    "dropoff.location_id_1_dropoff.time_1": [("dropoff.location_id", 1), ("dropoff.time", 1)],
    "meta.time_bucket_1": [("meta.time_bucket", 1)],
    "pickup.loc_2dsphere": [("pickup.loc", "2dsphere")],
}

present = {ix["name"]: list(ix["key"].items()) for ix in c.list_indexes()}

print("== structure ==")
missing = [name for name in expected if name not in present]
mismatch = [name for name in expected if name in present and present[name] != expected[name]]
print("missing:", missing or "None")
print("mismatch (spec differs):", mismatch or "None")

def explain_with_fallback(coll, filt, proj, hint_spec):
    """
    Try cursor.explain() with verbosity (PyMongo >=4),
    then without args (PyMongo 3.x),
    then fall back to db.command('explain', ...).
    """
    cur = coll.find(filt, proj).hint(hint_spec).limit(0)
    try:
        return cur.explain(verbosity="queryPlanner")
    except TypeError:
        # Older PyMongo — no verbosity kwarg
        try:
            return cur.explain()
        except Exception:
            pass
    except Exception:
        pass

    # Last resort: direct explain command (works across versions)
    try:
        return coll.database.command(
            "explain",
            {"find": coll.name, "filter": filt, "projection": proj, "hint": hint_spec},
            verbosity="queryPlanner"
        )
    except TypeError:
        # Some drivers want a single dict
        return coll.database.command({
            "explain": {"find": coll.name, "filter": filt, "projection": proj, "hint": hint_spec},
            "verbosity": "queryPlanner"
        })

print("\n== hint usability ==")
for name, spec in expected.items():
    try:
        plan = explain_with_fallback(c, {}, {"_id": 0}, spec)
        # If we got a dict back, the hint was accepted and a plan was produced
        if isinstance(plan, dict):
            print(f"[OK] {name} hint accepted")
        else:
            print(f"[FAIL] {name} plan not produced (unexpected response)")
    except Exception as e:
        print(f"[FAIL] {name} hint rejected: {e}")

# Geo sanity with modern pipeline (no 'limit' inside $geoNear)
if "pickup.loc_2dsphere" in present:
    print("\n== geo sanity ==")
    try:
        pipeline = [
            {"$geoNear": {
                "near": {"type": "Point", "coordinates": [-73.9855, 40.7580]},
                "distanceField": "d",
                "spherical": True
            }},
            {"$limit": 1}
        ]
        list(c.aggregate(pipeline))
        print("[OK] $geoNear ran (index usable).")
    except Exception as e:
        print("[FAIL] $geoNear failed:", e)

print("\nDone.")


== structure ==
missing: None
mismatch (spec differs): None

== hint usability ==


KeyboardInterrupt: 

In [13]:
# Step 7 — Query & Validate

from datetime import timedelta
from pprint import pprint

# ----- filters/projection -----
jan1 = datetime(2019, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
day2 = jan1 + timedelta(days=1)
flt  = {"pickup.location_id": 132, "pickup.time": {"$gte": jan1, "$lt": day2}}
proj = {"_id": 0}

# ----- always-works EXPLAIN (bypasses Cursor.explain API quirks) -----
try:
    explain = c.database.command(
        "explain",
        {"find": c.name, "filter": flt, "projection": proj},
        verbosity="executionStats"   # keyword; works on modern MongoDB
    )
except TypeError:
    # Some driver combos want a single dict payload
    explain = c.database.command({
        "explain": {"find": c.name, "filter": flt, "projection": proj},
        "verbosity": "executionStats"
    })

# ----- summarize -----
def summarize_explain(exp):
    qp = exp.get("queryPlanner", {})
    winning = qp.get("winningPlan", {})
    execstats = exp.get("executionStats", {})

    def find_stages(node, hits=None):
        if hits is None: hits = []
        if isinstance(node, dict):
            if node.get("stage") in ("IXSCAN", "FETCH"):
                hits.append(node["stage"])
            for k in ("inputStage","inputStages","outerStage","innerStage","leftChild","rightChild","shards","winningPlan"):
                if k in node:
                    child = node[k]
                    if isinstance(child, list):
                        for ch in child: find_stages(ch, hits)
                    else:
                        find_stages(child, hits)
        elif isinstance(node, list):
            for ch in node: find_stages(ch, hits)
        return hits

    stages = find_stages(winning)
    print("Winning stage:", winning.get("stage"))
    print("Stages seen:", stages)
    print("nReturned:", execstats.get("nReturned"))
    print("totalKeysExamined:", execstats.get("totalKeysExamined"))
    print("totalDocsExamined:", execstats.get("totalDocsExamined"))
    if "IXSCAN" in stages:
        print("✅ Index scan detected (good).")
    else:
        print("⚠️ No IXSCAN detected—query may not be using your index.")

summarize_explain(explain)


Winning stage: PROJECTION_DEFAULT
Stages seen: ['FETCH', 'IXSCAN']
nReturned: 7477
totalKeysExamined: 7477
totalDocsExamined: 7477
✅ Index scan detected (good).


In [14]:
from datetime import datetime, timezone
jan1 = datetime(2019,1,1,tzinfo=timezone.utc)
feb1 = datetime(2019,2,1,tzinfo=timezone.utc)
mongo_jan = c.count_documents({"pickup.time": {"$gte": jan1, "$lt": feb1}})
print("Mongo Jan 2019:", mongo_jan)


Mongo Jan 2019: 7696390


In [15]:
from datetime import datetime, timezone
from pprint import pprint

jan1 = datetime(2019,1,1,tzinfo=timezone.utc)
feb1 = datetime(2019,2,1,tzinfo=timezone.utc)

# Match Jan-2019 by partition metadata, supporting both int and string storage
jan_meta = {"$or": [
    {"meta.year": 2019,  "meta.month": 1},
    {"meta.year": "2019","meta.month": "01"}
]}

# 1) Total docs tagged as Jan by metadata (should ≈ Hive count)
total_meta = c.count_documents(jan_meta)
print("Mongo (by meta Jan 2019):", total_meta)

# 2) Type breakdown of pickup.time within those Jan docs
pipeline = [
    {"$match": jan_meta},
    {"$project": {
        "_id": 0,
        "t_type": {"$type": "$pickup.time"},
        "in_range": {"$and": [
            {"$gte": ["$pickup.time", jan1]},
            {"$lt":  ["$pickup.time", feb1]}
        ]}
    }},
    {"$group": {
        "_id": "$t_type",                         # e.g. "date", "missing", "string"
        "total": {"$sum": 1},
        "in_range": {"$sum": {"$cond": ["$in_range", 1, 0]}}
    }}
]
buckets = list(c.aggregate(pipeline))
pprint(buckets)

# 3) Compute the reconciliation figures
totals = {d["_id"]: d for d in buckets}  # keyed by type name
date_total = totals.get("date", {}).get("total", 0)
date_in_range = totals.get("date", {}).get("in_range", 0)
date_out_of_range = date_total - date_in_range
missing = totals.get("missing", {}).get("total", 0)

# Everything that's NOT a proper BSON date gets excluded by your time filter.
non_date = sum(d["total"] for k,d in totals.items() if k not in ("date",))

print("date_out_of_range:", date_out_of_range)
print("missing_time:", missing)
print("non_date_time:", non_date)
print("Sum of excluded:", date_out_of_range + non_date)

# Sanity: time-range count you already ran
in_range_count = c.count_documents({"pickup.time": {"$gte": jan1, "$lt": feb1}})
print("Mongo (by time range Jan 2019):", in_range_count)


Mongo (by meta Jan 2019): 0
[]
date_out_of_range: 0
missing_time: 0
non_date_time: 0
Sum of excluded: 0
Mongo (by time range Jan 2019): 7696390


In [16]:
for path in ["meta.year","meta.month","year","month","meta.time_bucket","pickup.time"]:
    print(path, c.count_documents({path: {"$exists": True}}))

print("Sample doc keys:", list(c.find_one({}, {"_id":0}).keys()))


meta.year 0
meta.month 0
year 0
month 0
meta.time_bucket 84598444


KeyboardInterrupt: 

In [17]:
# Count Jan 2019 in NYC local time (to mirror Hive’s notion of months)
jan_local = list(c.aggregate([
  {"$match": {"pickup.time": {"$type": "date"}}},
  {"$project": {"p": {"$dateToParts": {"date":"$pickup.time", "timezone":"America/New_York"}}}},
  {"$match": {"p.year": 2019, "p.month": 1}},
  {"$count": "n"}
]))
print("Mongo (NYC local month Jan 2019):", jan_local[0]["n"] if jan_local else 0)


KeyboardInterrupt: 