In [None]:
# NEW NOTEBOOK: load saved nlptown model and score ALL products in product_catalog

import os, re, html, numpy as np, pandas as pd, pymongo, torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---------- paths / config ----------
# Local folder the tokenizer and model are loaded from further down; the
# optional stars_tensor.joblib file is also looked up inside it.
LOAD_DIR = r"nlptown_model"
# NOTE(review): the fallback URI below embeds a username and password in
# source. Prefer supplying MONGO_URI through the environment and rotating
# this credential before the notebook is shared.
MONGO_URI = os.getenv(
    "MONGO_URI",
    "mongodb+srv://admin:adminpassword@assignment3.dhfn7vh.mongodb.net/?retryWrites=true&w=majority&appName=Assignment3"
)
DB_NAME = "Assignment3"
PRODUCT_COLL = "product_catalog"

# Defaults for predict_expected_stars: texts per forward pass and the
# tokenizer truncation length.
BATCH_SZ = 256    # drop to 128/64 if you hit OOM
MAX_LEN  = 128

# ---------- device ----------
# Drop any inherited GPU-selection variables before the first CUDA query
# (presumably so a restrictive CUDA_VISIBLE_DEVICES cannot hide the GPU --
# TODO confirm this is the intent).
for k in ("CUDA_VISIBLE_DEVICES", "CUDA_DEVICE_ORDER"):
    os.environ.pop(k, None)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", torch.cuda.get_device_name(0) if DEVICE=="cuda" else "CPU")
if DEVICE == "cuda":
    # Enable TF32 matmuls and the cuDNN autotuner when running on GPU.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

# ---------- mongo ----------
# `db` is the handle score_product_catalog reads PRODUCT_COLL from.
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]

# ---------- helpers ----------
def clean_text(s):
    """Return *s* with HTML entities decoded and whitespace normalised.

    Falsy input (``None``, ``""``) yields ``""``. Otherwise entities such as
    ``&amp;`` are unescaped, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.
    """
    if s:
        decoded = html.unescape(s)
        collapsed = re.sub(r"\s+", " ", decoded)
        return collapsed.strip()
    return ""

def stars_tensor_from_id2label(model, device):
    """Build a float32 tensor holding the star value of each class index.

    For every index in ``range(model.config.num_labels)`` the label from
    ``model.config.id2label`` (or ``LABEL_<i>`` when missing) is lower-cased
    and stripped, then mapped as follows:

    * ``"<n> star"`` / ``"<n> stars"`` -> ``n``
    * ``"label_<n>"`` (separator optional) -> ``n + 1``
    * anything else -> ``index + 1``

    The result is created on *device*.
    """
    labels = getattr(model.config, "id2label", {})

    def _star_value(idx):
        text = str(labels.get(idx, f"LABEL_{idx}")).lower().strip()
        explicit = re.match(r"^\s*(\d+)\s*stars?$", text)
        if explicit:
            return int(explicit.group(1))
        generic = re.match(r"^\s*label[_\s-]?(\d+)\s*$", text)
        if generic:
            # Generic labels are zero-based; star values start at 1.
            return int(generic.group(1)) + 1
        return idx + 1

    values = [_star_value(idx) for idx in range(model.config.num_labels)]
    return torch.tensor(values, dtype=torch.float32, device=device)

def eval_metrics(y_true, y_pred):
    """Compute regression-style error metrics between two rating sequences.

    Both inputs are converted to float32 arrays and the predictions are
    clipped to the range [1.0, 5.0] before comparison.

    Returns a dict with:
        ``MAE``        mean absolute error
        ``RMSE``       root mean squared error
        ``Within_0.5`` fraction of items with absolute error <= 0.5
        ``Within_1.0`` fraction of items with absolute error <= 1.0
    """
    truth = np.array(y_true, dtype=np.float32)
    clipped = np.clip(np.array(y_pred, dtype=np.float32), 1.0, 5.0)
    diff = truth - clipped
    abs_diff = np.abs(diff)

    metrics = {}
    metrics["MAE"] = float(abs_diff.mean())
    metrics["RMSE"] = float(np.sqrt((diff ** 2).mean()))
    for tol, key in ((0.5, "Within_0.5"), (1.0, "Within_1.0")):
        metrics[key] = float((abs_diff <= tol).mean())
    return metrics

# ---------- load tokenizer/model/KS from your folder ----------
# Tokenizer and classifier are both read from LOAD_DIR; the model is moved to
# DEVICE and put in eval mode immediately.
tok = AutoTokenizer.from_pretrained(LOAD_DIR)
model = AutoModelForSequenceClassification.from_pretrained(LOAD_DIR).to(DEVICE).eval()
if DEVICE == "cuda":
    # Cast weights to fp16 on GPU; predict_expected_stars converts the logits
    # back to float32 before the softmax.
    model = model.half()

# stars tensor: prefer joblib file if present, else rebuild
# KS holds one star value per class index; predict_expected_stars multiplies
# it with the class probabilities to get an expected rating.
KS = None
stars_joblib = os.path.join(LOAD_DIR, "stars_tensor.joblib")
if os.path.exists(stars_joblib):
    try:
        import joblib  # may not be in this env; that's fine, we'll fall back
        # NOTE(review): joblib.load unpickles the file, so only load
        # stars_tensor.joblib from a trusted source.
        KS = torch.tensor(joblib.load(stars_joblib), dtype=torch.float32, device=DEVICE)
        print("Loaded stars tensor from joblib.")
    except Exception as e:
        # Best-effort: any failure (missing joblib, unreadable file) falls
        # through to the id2label rebuild below.
        print(f"Could not load stars_tensor.joblib ({e}); rebuilding from id2label…")
if KS is None:
    KS = stars_tensor_from_id2label(model, DEVICE)
    print("Built stars tensor from id2label:", KS.tolist())

# ---------- prediction ----------
def predict_expected_stars(texts, batch_size=BATCH_SZ, max_length=MAX_LEN, device=DEVICE):
    """Return the expected star rating for every entry of *texts*.

    Each batch is cleaned, tokenized (padded, truncated to ``max_length``)
    and run through the module-level ``model``. The softmax over the logits
    is weighted by the module-level ``KS`` star-value tensor and summed per
    row, giving one expected rating per input.

    Args:
        texts: Sliceable sequence of inputs. Non-string items are
            stringified; ``None`` and texts that clean to empty become ".".
        batch_size: Number of texts per forward pass.
        max_length: Tokenizer truncation length.
        device: Device the encoded batch is moved to. It must be the device
            ``model`` and ``KS`` already live on.

    Returns:
        A list of Python floats in input order (empty for empty input).
    """
    # Local import keeps this block self-contained.
    from contextlib import nullcontext

    # Accept "cuda", "cuda:0" and torch.device("cuda") alike.
    on_cuda = str(device).startswith("cuda")
    # torch.cuda.amp.autocast is deprecated; torch.autocast is the
    # device-generic replacement. On CPU no autocast is wanted at all.
    amp = torch.autocast(device_type="cuda", dtype=torch.float16) if on_cuda else nullcontext()

    preds = []
    model.eval()
    n_batches = (len(texts) + batch_size - 1) // batch_size
    with torch.inference_mode(), amp:
        for start in tqdm(range(0, len(texts), batch_size), total=n_batches, desc="Infer"):
            chunk = texts[start:start + batch_size]
            # coerce to valid strings; the tokenizer must never see None or ""
            chunk = [
                clean_text(c if isinstance(c, str) else str(c) if c is not None else "") or "."
                for c in chunk
            ]
            enc = tok(chunk, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            enc = {k: v.to(device) for k, v in enc.items()}
            # Softmax in float32 even when the model itself runs in fp16.
            logits = model(**enc).logits.float()
            probs = torch.softmax(logits, dim=-1)
            expected = (probs * KS).sum(dim=1).cpu().numpy()
            preds.extend(expected.tolist())
    return preds

def product_texts_from_doc(doc, include_product_title=True):
    """Collect the cleaned, non-empty texts that describe one product.

    Args:
        doc: Product document (mapping). Reads ``product_title`` and
            ``product_review.titles``.
        include_product_title: When true, the cleaned product title is placed
            first in the result.

    Returns:
        A list of cleaned strings: the optional product title followed by the
        review titles in their stored order. If nothing usable is found the
        list is ``["."]``, so callers always have at least one text to score.
    """
    texts = []
    if include_product_title:
        title = doc.get("product_title")
        if isinstance(title, str):
            # Test the *cleaned* value: a title made only of whitespace
            # entities (e.g. "&nbsp;") cleans to "" and must be skipped.
            cleaned_title = clean_text(title)
            if cleaned_title:
                texts.append(cleaned_title)

    review = doc.get("product_review")
    titles = review.get("titles") if isinstance(review, dict) else None
    if isinstance(titles, str):
        # A bare string would otherwise be iterated character by character.
        titles = [titles]
    elif not isinstance(titles, (list, tuple)):
        titles = []

    for raw in titles:
        cleaned = clean_text(str(raw)) if raw else ""
        if cleaned:
            texts.append(cleaned)
    return texts or ["."]

def predict_product_rating(doc):
    """Return the mean expected star rating over all texts of one product.

    The texts are the product title plus the review titles of *doc*; each is
    scored with ``predict_expected_stars`` and the scores are averaged.
    """
    scores = predict_expected_stars(
        product_texts_from_doc(doc, include_product_title=True)
    )
    mean_score = np.mean(scores)
    return float(mean_score)

# ---------- run over the whole product_catalog ----------
def score_product_catalog(limit=None):
    """Score every product that has a ``rating`` and return the results.

    Reads ``db[PRODUCT_COLL]`` for documents where ``rating`` exists, predicts
    a rating for each one and pairs it with the stored rating.

    Args:
        limit: Optional maximum number of documents to read; falsy means all.

    Returns:
        A DataFrame with columns ``asin``, ``true_rating``, ``pred_rating``
        (clipped to [1, 5]), ``ratings_total``, ``n_texts`` and
        ``sample_text`` (first 140 characters of the first text). The columns
        are present even when no row was produced. Documents whose ``rating``
        cannot be converted to float are skipped.
    """
    columns = ["asin", "true_rating", "pred_rating", "ratings_total", "n_texts", "sample_text"]
    proj = {"asin": 1, "product_title": 1, "product_review.titles": 1, "rating": 1, "ratings_total": 1}
    cur = db[PRODUCT_COLL].find({"rating": {"$exists": True}}, proj, batch_size=256)
    if limit:
        cur = cur.limit(int(limit))

    rows = []
    try:
        for d in cur:
            try:
                true_rating = float(d.get("rating"))
            except (TypeError, ValueError):
                # None or a non-numeric rating: nothing to compare against.
                continue
            pred_rating = predict_product_rating(d)
            # Extract the texts once for the two bookkeeping columns.
            texts = product_texts_from_doc(d)
            rows.append({
                "asin": d.get("asin"),
                "true_rating": true_rating,
                "pred_rating": float(np.clip(pred_rating, 1.0, 5.0)),
                "ratings_total": d.get("ratings_total", None),
                "n_texts": len(texts),
                "sample_text": texts[0][:140],
            })
    finally:
        # Release the server-side cursor even if inference raised. Closing is
        # best-effort, so a driver error here must not mask the real one.
        try:
            cur.close()
        except pymongo.errors.PyMongoError:
            pass
    # Explicit columns keep downstream df["true_rating"] lookups valid when
    # the query matched nothing.
    return pd.DataFrame(rows, columns=columns)

# ---------- execute ----------
# Score the catalog; `df` is kept at module level because the Charts export
# cell further down reads it.
df = score_product_catalog(limit=None)   # set to a small number (e.g., 200) to test first
print(df.head())

# Product-level error between stored and predicted ratings.
print("\nOverall product-level metrics:")
print(eval_metrics(df["true_rating"].values, df["pred_rating"].values))

# Largest absolute errors to inspect
# Adds an `abs_err` column to df in place.
df["abs_err"] = (df["true_rating"] - df["pred_rating"]).abs()
print("\nTop 10 largest errors:")
print(df.sort_values("abs_err", ascending=False).head(10)[["asin","true_rating","pred_rating","abs_err","n_texts","sample_text"]])

# Only the identifier and the two ratings are written to the CSV.
out_path = "product_catalog_scores.csv"
df[["asin", "true_rating", "pred_rating"]].to_csv(out_path, index=False)
print(f"\nSaved {len(df)} rows to {out_path}")


Device: NVIDIA GeForce RTX 3060
Loaded stars tensor from joblib.


  amp = torch.cuda.amp.autocast(dtype=torch.float16) if device=="cuda" else torch.no_grad()


Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

Infer:   0%|          | 0/1 [00:00<?, ?it/s]

         asin  true_rating  pred_rating  ratings_total  n_texts  \
0  B0FG4MGVJP          4.6     4.534182            873        9   
1  B0947BJ67M          4.1     4.253288           3962        9   
2  B0DZDC3WW5          4.8     4.563303           2375        9   
3  B0FLXTK4HL          4.4     3.851171            418        9   
4  B0FLKNZJ1H          4.2     4.297025            250        9   

                                         sample_text  
0  HP Newly Designed 15.6'' Business Laptop(2025/...  
1  HP 14 Laptop, Intel Celeron N4020, 4 GB RAM, 6...  
2  Apple 2025 MacBook Air 13-inch Laptop with M4 ...  
3  HP 15.6" FHD Laptop Computer for Home Business...  
4  HP Student and Home Laptop with Free Microsoft...  

Overall product-level metrics:
{'MAE': 0.3308239281177521, 'RMSE': 0.39441946148872375, 'Within_0.5': 0.6923076923076923, 'Within_1.0': 1.0}

Top 10 largest errors:
          asin  true_rating  pred_rating   abs_err  n_texts  \
21  B0DYQM4BDB          4.4     3.6199

In [None]:
# Push predictions + run metrics to a separate MongoDB (for Charts)

import os
import pymongo, numpy as np
from datetime import datetime, timezone

# --- target Charts cluster (change DB/COL names if you like) ---
# CHARTS_URI can be overridden from the environment, mirroring MONGO_URI.
# NOTE(review): the fallback URI embeds a username and password in source;
# supply CHARTS_URI via the environment and rotate this credential.
CHARTS_URI = os.getenv(
    "CHARTS_URI",
    "mongodb+srv://admin:adminpassword@cluster0.42o5xip.mongodb.net/?retryWrites=true&w=majority&appName=Charts",
)
CHARTS_DB  = "assignment3_charts"         # <- choose any db name on the Charts cluster
PRED_COLL  = "product_predictions"         # per-product predictions
METR_COLL  = "model_run_metrics"           # one doc per run with summary metrics

# --- connect ---
charts_client = pymongo.MongoClient(CHARTS_URI)
charts_db = charts_client[CHARTS_DB]

# --- prepare prediction docs (minimal + a few handy derived fields) ---
df_save = df[["asin", "true_rating", "pred_rating"]].copy()
df_save["abs_err"]    = (df_save["true_rating"] - df_save["pred_rating"]).abs()
df_save["signed_err"] = (df_save["pred_rating"] - df_save["true_rating"])
# Nearest whole-star buckets for confusion-matrix style charts.
df_save["true_bin"]   = df_save["true_rating"].round().astype(int)
df_save["pred_bin"]   = df_save["pred_rating"].round().astype(int)
# Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and naive.
df_save["run_ts"]     = datetime.now(timezone.utc)
df_save["model"]      = "nlptown/bert-base-multilingual-uncased-sentiment"

# --- upsert predictions by asin ---
ops = [
    pymongo.ReplaceOne({"asin": rec["asin"]}, rec, upsert=True)
    for rec in df_save.to_dict(orient="records")
]
if ops:
    res = charts_db[PRED_COLL].bulk_write(ops, ordered=False)
    print(f"predictions upserted={res.upserted_count}, modified={res.modified_count}")
else:
    # bulk_write raises InvalidOperation on an empty request list.
    print("predictions upserted=0, modified=0")

# --- indexes (first run only; harmless if they exist) ---
charts_db[PRED_COLL].create_index([("asin", 1)], unique=True)
charts_db[PRED_COLL].create_index([("run_ts", -1)])
charts_db[PRED_COLL].create_index([("true_bin", 1), ("pred_bin", 1)])


predictions upserted=26, modified=0
metrics saved: {'MAE': 0.3308239281177521, 'RMSE': 0.39441946148872375, 'Within_0.5': 0.6923076923076923, 'Within_1.0': 1.0}


In [12]:
# Summarise this run and store one metrics document in the Charts database.
from datetime import datetime, timezone

m = eval_metrics(df_save["true_rating"].values, df_save["pred_rating"].values)

# convert to percentages (numeric) and use dot-safe keys; popping removes the
# old dotted keys in the same step
m["pct_within_0_5"] = round(m.pop("Within_0.5", 0) * 100, 2)
m["pct_within_1_0"] = round(m.pop("Within_1.0", 0) * 100, 2)

m_doc = {
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and naive.
    "_ts": datetime.now(timezone.utc),
    "model": "nlptown/bert-base-multilingual-uncased-sentiment",
    "dataset": "product_catalog",
    "count": int(len(df_save)),
    "metrics": m,
}
charts_db[METR_COLL].insert_one(m_doc)
print("metrics saved:", m_doc["metrics"])


metrics saved: {'MAE': 0.3308239281177521, 'RMSE': 0.39441946148872375, 'pct_within_0_5': 69.23, 'pct_within_1_0': 100.0}
