In [1]:

import os, re, html, math, random, time
import numpy as np, pymongo, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

In [2]:
# Drop any inherited CUDA device masking/ordering so torch sees the machine's default GPUs.
for k in ("CUDA_VISIBLE_DEVICES", "CUDA_DEVICE_ORDER"):
    os.environ.pop(k, None)

# Pick the compute device once; every later cell keys off DEVICE.
if torch.cuda.is_available():
    DEVICE = "cuda"
    print("Using device:", torch.cuda.get_device_name(0))
    # GPU-only speed knobs: TF32 matmuls and cuDNN autotuning.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
else:
    DEVICE = "cpu"
    print("Using device:", "CPU")

Using device: NVIDIA GeForce RTX 3060


In [16]:
# The connection string carries credentials, so it must come from the environment
# and never be embedded in the notebook (notebooks get shared and committed).
# NOTE(review): a password was previously hard-coded here; it should be rotated.
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI is not set. Export it before starting the kernel, e.g. "
        "MONGO_URI='mongodb+srv://<user>:<password>@<host>/?retryWrites=true&w=majority'"
    )
DB_NAME   = "Assignment3"
COLL_NAME = "static_reviews"

MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"   # outputs 1–5★
BATCH_SZ   = 256 if DEVICE=="cuda" else 32   # larger batches only when a GPU is available
MAX_LEN    = 128                             # tokenizer truncation length (tokens)

In [4]:
def clean_text(s):
    """Return `s` with HTML entities decoded and whitespace runs collapsed.

    Falsy input (None, "") yields an empty string. Leading and trailing
    whitespace is removed; every internal run of whitespace becomes one space.
    """
    if s:
        decoded = html.unescape(s)
        return re.sub(r"\s+", " ", decoded).strip()
    return ""

def _field_text(doc, key):
    """Return the cleaned text of `doc[key]`; missing/falsy values become ''."""
    value = doc.get(key) or ""
    return clean_text(value if isinstance(value, str) else str(value))


def load_mongo(limit=None, sample_every=None):
    """Read reviews from MongoDB and build texts, labels, groups and weights.

    Args:
        limit: optional cap on the number of documents read from the cursor
            (applied before `sample_every`).
        sample_every: optional stride N; when it converts to a positive int,
            only every N-th document read is kept.

    Returns:
        Tuple `(X, y, groups, weights)`:
        X       list of "title [SEP] text" strings (either part may be absent),
        y       float32 array of ratings clamped to [1, 5],
        groups  array of group ids (asin, else parent_asin, else a bucket id),
        weights float32 array derived from helpful votes and verified purchase.

    Rows are skipped when the combined text is shorter than 3 characters or
    the rating is missing, non-numeric or NaN.
    """
    q = {"rating": {"$exists": True}}
    proj = {"text": 1, "title": 1, "rating": 1, "asin": 1, "parent_asin": 1, "helpful_vote": 1, "verified_purchase": 1}

    # A stride that truncates to 0 would divide by zero below; treat it as "no sampling".
    step = int(sample_every) if sample_every else 0

    X, y, groups, weights = [], [], [], []

    # Both the client and the cursor are context managers, so the server-side
    # cursor and the connection pool are released even if iteration fails.
    with pymongo.MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000, socketTimeoutMS=120000) as client:
        cursor = client[DB_NAME][COLL_NAME].find(q, proj, batch_size=5000)
        if limit:
            cursor = cursor.limit(int(limit))

        with cursor:
            for i, d in enumerate(cursor, start=1):
                if step and i % step != 0:
                    continue

                # 1) title + text with delimiter
                parts = (_field_text(d, "title"), _field_text(d, "text"))
                combined = " [SEP] ".join(p for p in parts if p).strip()

                # 2) skip degenerate rows
                if len(combined) < 3:
                    continue

                # 3) rating clamp; NaN must be rejected explicitly because
                #    max(1.0, min(5.0, nan)) evaluates to 5.0.
                try:
                    r = float(d["rating"])
                except (KeyError, TypeError, ValueError, OverflowError):
                    continue
                if math.isnan(r):
                    continue
                r = max(1.0, min(5.0, r))

                # 4) group by asin (fallback to parent_asin or small buckets)
                asin = d.get("asin") or d.get("parent_asin")
                if not asin:
                    asin = f"_nogroup_{(len(X))//100}"

                # 5) optional weights: malformed or negative vote counts count as 0,
                #    and helpfulness is capped at 50 to avoid extremes.
                try:
                    hv = float(d.get("helpful_vote") or 0)
                except (TypeError, ValueError, OverflowError):
                    hv = 0.0
                if math.isnan(hv):
                    hv = 0.0
                hv = min(max(hv, 0.0), 50.0)
                vp = 1.2 if d.get("verified_purchase") else 1.0
                w = vp * (1.0 + hv / 50.0)

                X.append(combined)
                y.append(r)
                groups.append(str(asin))
                weights.append(w)

    return X, np.array(y, dtype=np.float32), np.array(groups), np.array(weights, dtype=np.float32)


In [5]:
import pandas as pd
import numpy as np

def eval_metrics(y_true, y_pred, detailed=False):
    """Compute regression metrics for star-rating predictions.

    Args:
        y_true: array-like of true ratings.
        y_pred: array-like of predicted ratings; values are clipped to [1, 5].
        detailed: when True, also report the mean absolute error per true
            star (true ratings rounded to the nearest integer).

    Returns:
        dict with "MAE", "RMSE", "Within_0.5", "Within_1.0", "R2" and,
        if `detailed`, "PerStarMAE". For empty input every scalar metric is
        NaN. "R2" is NaN when the true ratings have zero variance.

    Raises:
        ValueError: if the two inputs do not contain the same number of values.
    """
    # Flatten both sides: an (N, 1) prediction against an (N,) label would
    # otherwise broadcast to (N, N) and silently produce wrong metrics.
    y_true = np.asarray(y_true, dtype=np.float32).ravel()
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float32).ravel(), 1.0, 5.0)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    if y_true.size == 0:
        # Nothing to average; report NaNs without triggering numpy warnings.
        nan = float("nan")
        results = {"MAE": nan, "RMSE": nan, "Within_0.5": nan, "Within_1.0": nan, "R2": nan}
        if detailed:
            results["PerStarMAE"] = {}
        return results

    diff = y_true - y_pred
    err = np.abs(diff)
    sq = diff ** 2

    mae  = float(np.mean(err))
    rmse = float(np.sqrt(np.mean(sq)))
    within_05 = float(np.mean(err <= 0.5))
    within_10 = float(np.mean(err <= 1.0))

    # R²
    ss_res = float(np.sum(sq))
    ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
    r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    results = {
        "MAE": mae,
        "RMSE": rmse,
        "Within_0.5": within_05,
        "Within_1.0": within_10,
        "R2": r2,
    }

    if detailed:
        df = pd.DataFrame({"true": y_true, "err": err})
        results["PerStarMAE"] = (
            df.groupby(np.rint(df["true"]).astype(int))["err"].mean().to_dict()
        )

    return results


In [6]:
def stars_tensor_from_id2label(model, device):
    """Build a tensor giving the star value of each class index of `model`.

    Labels of the form "N star"/"N stars" map to N. Labels of the form
    "LABEL_k" map to k + 1. Anything else falls back to index + 1.

    Args:
        model: object exposing `config.num_labels` and, optionally,
            `config.id2label` (keys may be ints or their string form).
        device: device on which the returned tensor is created.

    Returns:
        float32 tensor of length `num_labels`; entry i is the star value of
        class i.

    Raises:
        ValueError: if two classes resolve to the same star value.
    """
    # `id2label` may be absent or None; treat both as "no labels".
    id2label = getattr(model.config, "id2label", None) or {}
    num_labels = model.config.num_labels

    stars = []
    for i in range(num_labels):
        # Accept string keys too (e.g. a mapping read straight from JSON);
        # otherwise every lookup would miss and silently fall back to i + 1.
        raw = id2label.get(i, id2label.get(str(i), f"LABEL_{i}"))
        lbl = str(raw).lower().strip()
        # match "1 star" or "5 stars"
        m = re.match(r"^\s*(\d+)\s*stars?$", lbl)
        if m:
            stars.append(int(m.group(1)))
        else:
            # fallback for "LABEL_0" → 1, etc.
            m2 = re.match(r"^\s*label[_\s-]?(\d+)\s*$", lbl)
            stars.append(int(m2.group(1)) + 1 if m2 else (i + 1))

    # sanity check: every class must resolve to a distinct star value
    if len(set(stars)) != num_labels:
        raise ValueError(f"Unexpected star mapping: {id2label} -> {stars}")

    stars_tensor = torch.tensor(stars, dtype=torch.float32, device=device)
    print("id2label mapping:", id2label)
    print("Stars tensor:", stars_tensor.tolist())
    return stars_tensor

# Tokenizer and classifier are loaded from the same checkpoint.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model = model.eval()

# FP16 weights on GPU for speed; the CPU path keeps the loaded precision.
model = model.half() if DEVICE == "cuda" else model

# Star value for each class index, used to turn probabilities into an expected rating.
KS = stars_tensor_from_id2label(model, DEVICE)


id2label mapping: {0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}
Stars tensor: [1.0, 2.0, 3.0, 4.0, 5.0]


In [None]:
def predict_expected_stars(texts, batch_size=BATCH_SZ, max_length=MAX_LEN, device=DEVICE):
    """Predict the expected star rating of each text with the global `model`.

    Each batch is tokenized with `tok`, run through `model`, and the softmax
    over class logits is weighted by the star values in `KS`.

    Args:
        texts: sequence supporting `len()` and slicing. Entries that are not
            strings, or are blank, are replaced by ".".
        batch_size: number of texts per forward pass; must be >= 1.
        max_length: tokenizer truncation length.
        device: device the encoded batch is moved to (str or torch.device).

    Returns:
        list of floats, one expected star value per input text.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    preds = []
    model.eval()  # make sure we're in eval mode

    # str() so that a torch.device argument works as well as a plain string.
    on_cuda = str(device).startswith("cuda")

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
    autocast_ctx = (
        torch.autocast(device_type="cuda", dtype=torch.float16)
        if on_cuda
        else torch.no_grad()
    )

    # Keep the star values on the same device as the batch outputs.
    stars = KS.to(device)
    n_batches = (len(texts) + batch_size - 1) // batch_size

    with torch.inference_mode(), autocast_ctx:
        for i in tqdm(range(0, len(texts), batch_size), total=n_batches, desc="Infer"):
            chunk = texts[i:i + batch_size]

            # Defensive: the tokenizer gets only non-empty strings.
            chunk = [str(t) if isinstance(t, str) and t.strip() else "." for t in chunk]

            enc = tok(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            enc = {k: v.to(device) for k, v in enc.items()}

            # Softmax in float32 for stability even when the model runs in FP16.
            logits = model(**enc).logits.float()
            probs = torch.softmax(logits, dim=-1)
            expected = (probs * stars).sum(dim=1)
            preds.extend(expected.cpu().tolist())

    return preds


In [8]:
# Load every review, then time one full inference pass over the dataset.
X, y, groups, weights = load_mongo()
print(f"Loaded {len(X)} examples")

# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during a multi-minute run (time.time() can be).
t0 = time.perf_counter()
yhat = predict_expected_stars(X)
if DEVICE == "cuda":
    # Make sure all queued GPU work has finished before stopping the clock.
    torch.cuda.synchronize()
t = time.perf_counter() - t0
print(f"Total time: {t:.1f}s  |  ~{len(X)/max(t,1e-6):.1f} reviews/sec")

print("nlptown metrics:", eval_metrics(y, yhat))

# Sample outputs: up to five random reviews with true vs. predicted stars.
print("\nExamples:")
for idx in random.sample(range(len(X)), k=min(5, len(X))):
    print(f"- true={y[idx]:.1f}  pred={yhat[idx]:.2f}  |  {X[idx][:160]}")

Loaded 100000 examples


  torch.cuda.amp.autocast(dtype=torch.float16)


Infer:   0%|          | 0/391 [00:00<?, ?it/s]

Total time: 138.3s  |  ~723.2 reviews/sec
nlptown metrics: {'MAE': 0.469960480928421, 'RMSE': 0.6844781041145325, 'Within_0.5': 0.65358, 'Within_1.0': 0.89259, 'R2': 0.7655525602169655}

Examples:
- true=5.0  pred=4.84  |  Amazing value [SEP] Fantastic sound for the money, great for everyday listening.
- true=5.0  pred=4.73  |  coolstream duo review [SEP] This adapter created a better way of listening to music from my smartphone. Made me put away the cd's. Now I enjoy services like Pan
- true=5.0  pred=4.99  |  Five Stars [SEP] Awesome Modem
- true=5.0  pred=4.59  |  Great quality! Worth the buy! [SEP] I purchased these for my son's headphones to replace the originals. Was extremely pleased with the quality of these replacem
- true=5.0  pred=4.87  |  Love these speakers [SEP] Love, love love don't let the size fool you they pack a powerful punch. suburb quality


In [12]:
import joblib, os

# Directory (relative to the working directory) that receives all artifacts.
SAVE_DIR = "nlptown_model"
os.makedirs(SAVE_DIR, exist_ok=True)

# 1. Save the Hugging Face model + tokenizer (reloadable with from_pretrained).
# NOTE(review): if the model was cast with .half() in the loading cell, the
# weights written here are FP16 — confirm that is intended.
model.save_pretrained(SAVE_DIR)
tok.save_pretrained(SAVE_DIR)

# 2. Save the stars mapping as a NumPy array so it does not need to be
#    recomputed; it is moved to the CPU first.
joblib.dump(KS.cpu().numpy(), os.path.join(SAVE_DIR, "stars_tensor.joblib"))

print(f"Saved model + tokenizer to {SAVE_DIR}")


Saved model + tokenizer to nlptown_model
