In [6]:
import sys, psutil, os
# Environment sanity check: which interpreter runs this kernel, whether psutil
# imports (and its version), and the kernel's current working directory.
print(sys.executable)
print(f"psutil ok: {psutil.__version__}")
print(f"CWD: {os.getcwd()}")


/Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog/.venv/bin/python
psutil ok: 7.0.0
CWD: /Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog/notebooks


In [8]:
from pathlib import Path
import os

# Show the working directory, then for each expected data location print the
# absolute path it resolves to and whether it exists from here.
print(f"CWD: {os.getcwd()}")
for label, target in (
    ("data dir exists?  ", Path("data")),
    ("sample_books.json:", Path("data/sample_books.json")),
    ("sample_data.json: ", Path("data/sample_data.json")),
):
    print(label, target.resolve(), target.exists())


CWD: /Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog/notebooks
data dir exists?   /Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog/notebooks/data False
sample_books.json: /Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog/notebooks/data/sample_books.json False
sample_data.json:  /Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog/notebooks/data/sample_data.json False


In [9]:
from pathlib import Path
import os

# Relative paths such as "data/..." are written against the repo root, so when
# the kernel starts inside the notebooks/ folder, step up one level.
started_in_notebooks = Path.cwd().name == "notebooks"
if started_in_notebooks:
    os.chdir(os.pardir)

# The first line should end with mini-NoSQL-Library_Catalog.
print(f"CWD → {Path.cwd()}")
print(f"data exists: {Path('data').exists()}")


CWD → /Users/Marc/Documents/GitHub/mini-NoSQL-Library_Catalog
data exists: True


In [12]:
# ── Diagnose + Reseed from data/sample_data.json (fallback to data/sample_books.json) ──
from pymongo import MongoClient
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path
import json, os, re

# Connection settings: read from the environment (after loading a .env file if
# one is present), falling back to a local MongoDB with default names.
load_dotenv()
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
DB_NAME = os.environ.get("DB_NAME", "library_db")
COLL_NAME = os.environ.get("COLLECTION", "books")

def parse_date_maybe(v):
    """Best-effort conversion of a loosely formatted date value to a naive datetime.

    Parameters
    ----------
    v : Any
        ``None``, ``""`` or ``"null"`` (all treated as "no date"), a
        ``datetime`` (returned unchanged), or any value whose ``str()`` form is
        a date in one of the supported layouts.

    Returns
    -------
    datetime | None
        A timezone-naive ``datetime``, or ``None`` when the value is empty or
        matches no supported layout. ISO-8601 strings that carry a UTC offset
        (including a trailing ``Z``) are converted to UTC before the offset is
        dropped, so the returned naive value denotes the same instant in UTC.
    """
    # Local import: the surrounding cell only imports `datetime` itself.
    from datetime import timezone

    if v in (None, "", "null"):
        return None
    if isinstance(v, datetime):
        return v

    s = str(v).strip()
    # Formats are tried in order; for ambiguous day/month input the first match
    # wins (day-first "%d/%m/%Y" is tried before month-first "%m/%d/%Y").
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d", "%Y.%m.%d", "%d-%m-%Y", "%Y"):
        try:
            if fmt == "%Y":
                # A bare year maps to January 1st of that year.
                return datetime(int(s), 1, 1)
            return datetime.strptime(s, fmt)
        except Exception:
            continue

    try:
        # fromisoformat() on older Python versions rejects a trailing "Z".
        parsed = datetime.fromisoformat(s.replace("Z", "+00:00"))
        if parsed.tzinfo is not None:
            # Normalise to UTC *before* dropping the offset; stripping tzinfo
            # directly would keep the local wall-clock time and shift the instant.
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed
    except Exception:
        return None

def split_author(full):
    """Split a full author name into ``(first_token, remaining_tokens)``.

    The value is stringified, stripped, and split on runs of whitespace. The
    first token becomes the first name; all remaining tokens are re-joined with
    single spaces as the last name. A falsy input yields ``("", "")``.
    """
    if not full:
        return ("", "")
    head, *tail = re.split(r"\s+", str(full).strip())
    return (head, " ".join(tail))

def adapt_schema(rec):
    """Map one source record onto the catalog's document layout.

    For each target field, several alternative source keys are tried in order
    and the first one holding a value other than ``None`` / ``""`` is used.
    When neither a first nor a last name is found under the dedicated keys, a
    combined author field is split with ``split_author``. Both date fields go
    through ``parse_date_maybe``. The untouched input record is kept under
    ``"_raw"``.

    Returns a dict with the keys ``title``, ``author_first_name``,
    ``author_last_name``, ``published_date``, ``out_of_print_date``,
    ``country`` and ``_raw``.
    """
    def pick(*names, default=None):
        # First candidate key that is present and holds a usable value.
        usable = (rec[name] for name in names if name in rec and rec[name] not in (None, ""))
        return next(usable, default)

    first_name = pick("author_first_name", "first_name")
    last_name = pick("author_last_name", "last_name", "surname", "family_name")
    if not first_name and not last_name:
        combined = pick("author_full_name", "author", "authors", "writer")
        first_name, last_name = split_author(combined)

    published = parse_date_maybe(pick("published_date", "publication_date", "pub_date", "year"))
    out_of_print = parse_date_maybe(pick("out_of_print_date", "oop_date", "end_of_print"))
    origin = pick("country", "origin_country", "author_country")

    adapted = {
        "title": pick("title", "book_title", "name", default=""),
        "author_first_name": first_name or "",
        "author_last_name": last_name or "",
        "published_date": published,
        "out_of_print_date": out_of_print,
        "country": origin or None,
        "_raw": rec,
    }
    return adapted

# pick file: the first candidate that exists on disk, in priority order
seed_candidates = [Path("data/sample_data.json"), Path("data/sample_books.json")]
SEED_PATH = next((p for p in seed_candidates if p.exists()), None)
print("DB:", DB_NAME, "Collection:", COLL_NAME)
print("Seed file:", SEED_PATH if SEED_PATH else "NOT FOUND")

client = MongoClient(MONGO_URI)
col = client[DB_NAME][COLL_NAME]

# show current count (printed even when no seed file was found, as a diagnostic)
print("Current doc count:", col.estimated_document_count())

if not SEED_PATH:
    raise SystemExit("No seed file found. Put your data in data/sample_data.json or data/sample_books.json")

# load JSON/JSONL: text starting with "[" is parsed as one JSON array,
# anything else as one JSON document per non-blank line
raw = SEED_PATH.read_text(encoding="utf-8").strip()
items = json.loads(raw) if raw.startswith("[") else [json.loads(l) for l in raw.splitlines() if l.strip()]
print("Loaded from file:", len(items), "items (before adaptation)")

# adapt to our schema
docs = [adapt_schema(r) for r in items]
print("Adapted docs:", len(docs))
print("Doc keys sample:", sorted(docs[0].keys()) if docs else "N/A")

# Refuse to reseed from an empty file: dropping first and then failing on the
# insert (insert_many() rejects an empty document list) would leave the
# collection wiped with nothing to replace it.
if not docs:
    raise SystemExit(f"Seed file {SEED_PATH} contains no records; collection left unchanged.")

# drop + insert (destructive: replaces the whole collection with the seed data)
col.drop()
res = col.insert_many(docs, ordered=False)
print("Inserted:", len(res.inserted_ids))
print("New doc count:", col.estimated_document_count())

# show a quick preview
print("One doc preview:", col.find_one({}, {"_id":0}))


DB: library_db Collection: books
Seed file: data/sample_books.json
Current doc count: 7
Loaded from file: 7 items (before adaptation)
Adapted docs: 7
Doc keys sample: ['_raw', 'author_first_name', 'author_last_name', 'country', 'out_of_print_date', 'published_date', 'title']
Inserted: 7
New doc count: 7
One doc preview: {'title': 'The Hobbit', 'author_first_name': 'J.R.R.', 'author_last_name': 'Tolkien', 'published_date': datetime.datetime(1937, 9, 21, 0, 0), 'out_of_print_date': None, 'country': 'UK', '_raw': {'title': 'The Hobbit', 'author_first_name': 'J.R.R.', 'author_last_name': 'Tolkien', 'country': 'UK', 'published_date': '1937-09-21', 'out_of_print_date': None}}


In [13]:
# Library Catalog — All tasks, rich output (auto-import if empty)

from pymongo import MongoClient, UpdateOne
from dotenv import load_dotenv
from datetime import datetime
from collections import Counter
from pathlib import Path
import os, json
import pandas as pd
from IPython.display import display, HTML
from pprint import pprint

# ---------------- Config ----------------
# Connection settings come from the environment (after loading a .env file if
# one is present); the defaults point at a local MongoDB instance.
load_dotenv()
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
DB_NAME = os.environ.get("DB_NAME", "library_db")
COLL_NAME = os.environ.get("COLLECTION", "books")

# Fields handled by the date-conversion step.
DATE_FIELDS = ["published_date", "out_of_print_date"]

# auto-import if empty
SEED_PATH = Path("data/sample_books.json")
# ----------------------------------------

def h(title):
    """Render ``title`` as an HTML ``<h3>`` section heading in the cell output."""
    markup = f"<h3 style='margin-top:1.2em'>{title}</h3>"
    display(HTML(markup))

def get_col():
    """Return the ``DB_NAME.COLL_NAME`` collection handle.

    A new ``MongoClient`` for ``MONGO_URI`` is created on every call.
    """
    database = MongoClient(MONGO_URI)[DB_NAME]
    return database[COLL_NAME]

def parse_date_maybe(v):
    """Best-effort conversion of a loosely formatted date value to a naive datetime.

    Parameters
    ----------
    v : Any
        ``None`` or a ``datetime`` (both returned unchanged), or any value
        whose ``str()`` form is a date in one of the supported layouts.

    Returns
    -------
    datetime | None
        A timezone-naive ``datetime``, or ``None`` when the value is blank or
        matches no supported layout. ISO-8601 strings that carry a UTC offset
        (including a trailing ``Z``) are converted to UTC before the offset is
        dropped, so the returned naive value denotes the same instant in UTC.
    """
    # Local import: the surrounding cell only imports `datetime` itself.
    from datetime import timezone

    if v is None or isinstance(v, datetime):
        return v
    s = str(v).strip()
    if not s:
        return None

    # Formats are tried in order; for ambiguous day/month input the first match
    # wins (day-first "%d/%m/%Y" is tried before month-first "%m/%d/%Y").
    fmts = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d", "%Y.%m.%d", "%d-%m-%Y", "%Y"]
    for fmt in fmts:
        try:
            if fmt == "%Y":
                # A bare year maps to January 1st of that year.
                return datetime(int(s), 1, 1)
            return datetime.strptime(s, fmt)
        except Exception:
            continue

    try:
        # fromisoformat() on older Python versions rejects a trailing "Z".
        parsed = datetime.fromisoformat(s.replace("Z", "+00:00"))
        if parsed.tzinfo is not None:
            # Normalise to UTC *before* dropping the offset; stripping tzinfo
            # directly would keep the local wall-clock time and shift the instant.
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed
    except Exception:
        return None

def type_counts(col, field):
    """Count documents in ``col`` by how ``field`` is stored.

    Parameters
    ----------
    col
        Object exposing ``count_documents(filter)`` (a pymongo collection in
        this notebook).
    field : str
        Name of the field to inspect.

    Returns
    -------
    dict
        Counts under the keys ``"date"``, ``"string"``, ``"null"`` and
        ``"missing"``.
    """
    return {
        "date"   : col.count_documents({field: {"$type": "date"}}),
        "string" : col.count_documents({field: {"$type": "string"}}),
        # An equality match on None also matches documents where the field is
        # absent, which would double-count the "missing" bucket below; matching
        # on the BSON null type counts only explicit nulls.
        "null"   : col.count_documents({field: {"$type": "null"}}),
        "missing": col.count_documents({field: {"$exists": False}})
    }

# Connect
col = get_col()

# Auto-import seed if empty: only when the collection reports zero documents
# and the seed file is present. Text starting with "[" is read as one JSON
# array; anything else as one JSON document per non-blank line.
collection_is_empty = col.estimated_document_count() == 0
if collection_is_empty and SEED_PATH.exists():
    raw = SEED_PATH.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        docs = json.loads(raw)
    else:
        docs = [json.loads(line) for line in raw.splitlines() if line.strip()]
    if docs:
        col.insert_many(docs, ordered=False)
        print(f"Seeded {len(docs)} docs into {DB_NAME}.{COLL_NAME}")

h("1) Remove books with empty/null author_last_name (preview then delete)")
# Matches an empty-string last name or a None last name.
flt = {"$or":[{"author_last_name": ""}, {"author_last_name": None}]}
n_del = col.count_documents(flt)
print(f"Candidates to delete: {n_del}")
if not n_del:
    print("Nothing to delete.")
else:
    # Preview a handful of the matches before the destructive delete.
    preview_fields = {"_id":0,"title":1,"author_first_name":1,"author_last_name":1}
    sample = list(col.find(flt, preview_fields).limit(5))
    print("Sample to be deleted (up to 5):")
    for d in sample:
        pprint(d)
    res = col.delete_many(flt)
    print(f"Deleted: {res.deleted_count}")

h("2) Unique author first names")
# Distinct values from the server, minus None / empty string, sorted
# case-insensitively.
distinct_firsts = col.distinct("author_first_name")
firsts = sorted({name for name in distinct_firsts if name not in (None,"")}, key=str.lower)
print(f"Unique author first names: {len(firsts)}")
display(pd.DataFrame({"author_first_name": firsts}))

h("3) Convert date strings → datetime (before/after)")
before = {f: type_counts(col, f) for f in DATE_FIELDS}
display(pd.DataFrame(before).T)

# Queue one update per string-typed date value and flush in batches of 500.
ops, updated, skipped = [], 0, 0
for f in DATE_FIELDS:
    for d in col.find({f: {"$type":"string"}}, {"_id":1, f:1}):
        raw_val = d.get(f)
        new_val = parse_date_maybe(raw_val)
        if new_val is None and str(raw_val).strip():
            # Non-blank text that no known format matched: leave the stored
            # string alone instead of overwriting it with None, which would
            # silently discard the original value. Blank strings still become None.
            skipped += 1
            continue
        ops.append(UpdateOne({"_id": d["_id"]}, {"$set": {f: new_val}}))
        if len(ops) >= 500:
            updated += col.bulk_write(ops, ordered=False).modified_count
            ops.clear()
if ops:
    updated += col.bulk_write(ops, ordered=False).modified_count

after = {f: type_counts(col, f) for f in DATE_FIELDS}
print(f"Updated documents: {updated}")
if skipped:
    # Leftovers also show up in the "string" column of the table below.
    print(f"Left unchanged (unparseable date strings): {skipped}")
display(pd.DataFrame(after).T)

h("4) 10 oldest books (by published_date)")
# Only documents whose published_date is stored as a date are considered.
oldest_filter = {"published_date":{"$type":"date"}}
oldest_projection = {"_id":0,"title":1,"author_first_name":1,"author_last_name":1,"country":1,"published_date":1}
oldest_cursor = col.find(oldest_filter, oldest_projection).sort("published_date", 1).limit(10)
oldest = list(oldest_cursor)

df_oldest = pd.DataFrame(oldest)
if not df_oldest.empty:
    # NOTE(review): pd.to_datetime may reject dates outside the nanosecond
    # Timestamp range (roughly 1677–2262) on some pandas versions; confirm if
    # the catalog can hold older books.
    df_oldest = df_oldest.assign(published_date=pd.to_datetime(df_oldest["published_date"]))
display(df_oldest)

h("5) Count by author country — Python")
# Counter is already imported at the top of this cell. Any falsy country
# (missing, None, "") is tallied under "Unknown".
countries = ((doc.get("country") or "Unknown") for doc in col.find({}, {"country":1}))
cnt = Counter(countries)
# Highest count first; ties broken alphabetically by country.
ranked = sorted(cnt.items(), key=lambda pair: (-pair[1], pair[0]))
df_py = pd.DataFrame(ranked, columns=["country","count"])
display(df_py)

h("6) Count by author country — Aggregation pipeline")
# Group key: "Unknown" when country is missing, null, or an empty string, so
# the buckets agree with the Python count, which folds every falsy country
# into "Unknown". $ifNull alone would leave "" as its own group.
country_or_unknown = {
    "$cond": [
        {"$eq": [{"$ifNull": ["$country", ""]}, ""]},
        "Unknown",
        "$country",
    ]
}
pipe = [
    {"$group": {"_id": country_or_unknown, "count": {"$sum": 1}}},
    # Highest count first; ties broken alphabetically by country.
    {"$sort": {"count": -1, "_id": 1}},
]
df_agg = pd.DataFrame(list(col.aggregate(pipe))).rename(columns={"_id":"country"})
display(df_agg)

# Verify both methods match: outer-join the two count tables on country, treat
# a country absent from one side as a count of 0, and list mismatches first.
if not (df_py.empty or df_agg.empty):
    merged = pd.merge(df_py, df_agg, on="country", suffixes=("_py","_agg"), how="outer").fillna(0)
    merged["match"] = merged["count_py"].astype(int).eq(merged["count_agg"].astype(int))
    h("Match check: Python vs Aggregation")
    display(merged.sort_values(["match","country"], ascending=True))

print("Done ✅")


Candidates to delete: 2
Sample to be deleted (up to 5):
{'author_first_name': 'Ada', 'author_last_name': '', 'title': 'Untitled Draft'}
{'author_first_name': 'Test', 'author_last_name': '', 'title': 'Placeholder'}
Deleted: 2


Unique author first names: 5


Unnamed: 0,author_first_name
0,Antoine
1,Chinua
2,Franz
3,Gabriel
4,J.R.R.


Unnamed: 0,date,string,null,missing
published_date,5,0,0,0
out_of_print_date,0,0,5,0


Updated documents: 0


Unnamed: 0,date,string,null,missing
published_date,5,0,0,0
out_of_print_date,0,0,5,0


Unnamed: 0,title,author_first_name,author_last_name,published_date,country
0,The Trial,Franz,Kafka,1925-04-26,Czechia
1,The Hobbit,J.R.R.,Tolkien,1937-09-21,UK
2,The Little Prince,Antoine,de Saint-Exupéry,1943-04-06,France
3,Things Fall Apart,Chinua,Achebe,1958-06-17,Nigeria
4,One Hundred Years of Solitude,Gabriel,García Márquez,1967-05-30,Colombia


Unnamed: 0,country,count
0,Colombia,1
1,Czechia,1
2,France,1
3,Nigeria,1
4,UK,1


Unnamed: 0,country,count
0,Colombia,1
1,Czechia,1
2,France,1
3,Nigeria,1
4,UK,1


Unnamed: 0,country,count_py,count_agg,match
0,Colombia,1,1,True
1,Czechia,1,1,True
2,France,1,1,True
3,Nigeria,1,1,True
4,UK,1,1,True


Done ✅
