# Model.ipynb — Baseline Text Classifier (Standard Library Only)

**Goal:** Train and evaluate a simple Multinomial Naive Bayes classifier using the token-ID sequences
exported by `1_Tokenizer & Embedding.ipynb`.


In [None]:
# === Imports and config (standard library only) ===
import csv, math, json
from collections import defaultdict
from pathlib import Path

# Input files produced by the first notebook.
TRAIN_CSV = Path("dataset/train_embeddings.csv")
TEST_CSV  = Path("dataset/test_embeddings.csv")

# <PAD> token id. Must match the value used by the first notebook.
PAD_ID = 0

# Fail fast when an input file is missing. Explicit raises are used instead of
# `assert` because assertions are removed when Python runs with -O, which
# would silently disable these checks.
if not TRAIN_CSV.exists():
    raise FileNotFoundError(f"Missing {TRAIN_CSV}. Please export it from the first notebook.")
if not TEST_CSV.exists():
    raise FileNotFoundError(f"Missing {TEST_CSV}. Please export it from the first notebook.")


## 1. Load sequences from CSV

In [2]:
def load_xy_from_csv(path: Path, max_len: int = 200):
    """Load token-id sequences and labels from a CSV file.

    Each data row must hold at least `max_len + 1` columns: the first
    `max_len` columns are integer token ids and the column at index `max_len`
    is the integer label. Columns after the label are ignored.

    A header row needs no special handling: it is skipped like any other row
    that cannot be parsed as integers. A file without a header therefore
    keeps its first sample instead of losing it. Blank rows, rows that are
    too short, and rows containing non-integer values are skipped as well.

    Args:
        path: CSV file to read (decoded as UTF-8).
        max_len: number of token-id columns per row.

    Returns:
        (X, y): X is a list of token-id lists (each of length `max_len`),
        y is the list of integer labels, aligned with X.
    """
    X, y = [], []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.reader(f):
            if not row:
                continue
            try:
                ids = [int(x) for x in row[:max_len]]
                label = int(row[max_len])
            except (ValueError, IndexError):
                # ValueError: non-integer cell (this is how a header row is
                # dropped); IndexError: row has no label column.
                continue
            X.append(ids)
            y.append(label)
    return X, y

# Load both splits; 200 is the sequence length passed to the loader for each file.
X_train_seq, y_train = load_xy_from_csv(TRAIN_CSV, max_len=200)
X_test_seq, y_test = load_xy_from_csv(TEST_CSV, max_len=200)

# Sanity check: split sizes and the head of the first training row (if any).
print("Train size:", len(X_train_seq), " Test size:", len(X_test_seq))
print(
    "Example train row (first 15 ids):",
    X_train_seq[0][:15] if X_train_seq else [],
)


Train size: 1600  Test size: 400
Example train row (first 15 ids): [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


## 2. Convert sequences to sparse Bag-of-Words

In [3]:
def seq_to_counts(seq, pad_id: int = PAD_ID):
    """Build a sparse bag-of-words from a token-id sequence.

    Ids equal to `pad_id` are ignored; every other id is mapped to the number
    of times it occurs in `seq`.

    Returns:
        A defaultdict(int) mapping token_id -> occurrence count.
    """
    counts = defaultdict(int)
    for token_id in (t for t in seq if t != pad_id):
        counts[token_id] += 1
    return counts


## 3. Train Multinomial Naive Bayes (self-implemented)

In [4]:
def train_mnb(X_seqs, y, alpha: float = 1.0, pad_id: int = PAD_ID):
    """Fit a Multinomial Naive Bayes model on token-id sequences.

    Class priors are estimated as the fraction of documents per class, and
    P(token_id | class) as (count + alpha) / (class token total + alpha * V),
    where V is the number of distinct non-pad token ids seen in training.
    Both are stored as natural logarithms.

    Args:
        X_seqs: token-id sequences, paired with `y` position by position.
        y: class labels.
        alpha: additive smoothing constant.
        pad_id: token id that is excluded from all counts.

    Returns:
        A dict with keys "log_prior" (class -> log P(class)),
        "log_likelihood" (class -> {token_id -> log P(token_id | class)}),
        "vocab" (set of observed token ids), "alpha" and "pad_id".
    """
    docs_per_class = defaultdict(int)
    counts_by_class = defaultdict(lambda: defaultdict(int))
    vocab = set()

    # Accumulate per-class document and token counts.
    for seq, label in zip(X_seqs, y):
        docs_per_class[label] += 1
        class_counts = counts_by_class[label]
        for tid, n in seq_to_counts(seq, pad_id=pad_id).items():
            class_counts[tid] += n
            vocab.add(tid)

    # log P(c)
    n_docs = sum(docs_per_class.values())
    log_prior = {}
    for c, n_c in docs_per_class.items():
        log_prior[c] = math.log(n_c / n_docs)

    # log P(tid | c), smoothed over the observed vocabulary
    vocab_size = len(vocab)
    log_likelihood = {}
    for c in docs_per_class:
        class_counts = counts_by_class[c]
        denom = sum(class_counts.values()) + alpha * vocab_size
        log_likelihood[c] = {
            tid: math.log((class_counts.get(tid, 0) + alpha) / denom)
            for tid in vocab
        }

    return {
        "log_prior": log_prior,
        "log_likelihood": log_likelihood,
        "vocab": vocab,
        "alpha": alpha,
        "pad_id": pad_id,
    }

# Fit the baseline on the training split with alpha=1.0 smoothing.
model = train_mnb(X_train_seq, y_train, alpha=1.0, pad_id=PAD_ID)
print(
    "Model trained. Classes:",
    list(model["log_prior"]),
    "Vocab size:",
    len(model["vocab"]),
)


Model trained. Classes: [0, 1] Vocab size: 18602


## 4. Predict and evaluate

In [5]:
def predict_one_mnb(model, seq):
    """Return the highest-scoring class for one token-id sequence.

    The score of a class is its log prior plus, for each token of the
    sequence that is in the model vocabulary, count * log P(token | class).
    Pad tokens and tokens outside the vocabulary do not contribute. On a tie
    the class that comes first in `model["log_prior"]` wins; None is returned
    when the model has no classes.
    """
    # Keep only the tokens the model knows; the filter is class-independent.
    known_counts = [
        (tid, cnt)
        for tid, cnt in seq_to_counts(seq, pad_id=model["pad_id"]).items()
        if tid in model["vocab"]
    ]

    winner, winner_score = None, None
    for label, prior in model["log_prior"].items():
        total = prior
        for tid, cnt in known_counts:
            total += cnt * model["log_likelihood"][label].get(tid, 0.0)
        if winner_score is None or total > winner_score:
            winner, winner_score = label, total
    return winner

def accuracy(y_true, y_pred):
    """Return the fraction of labels in `y_true` matched by `y_pred`.

    Labels are compared position by position (extra items in the longer
    input are ignored). The denominator is len(y_true), floored at 1 so that
    empty input yields 0.0 instead of a division by zero.
    """
    n_correct = 0
    for truth, guess in zip(y_true, y_pred):
        if truth == guess:
            n_correct += 1
    return n_correct / max(1, len(y_true))

def f1_binary(y_true, y_pred, positive=1):
    """Return the F1 score of the `positive` class for paired label lists.

    Precision and recall use a denominator floored at 1, and the result is
    0.0 when both are zero, so inputs without any positive never divide by
    zero.
    """
    tp = fp = fn = 0
    for truth, guess in zip(y_true, y_pred):
        truth_is_pos = truth == positive
        guess_is_pos = guess == positive
        if truth_is_pos and guess_is_pos:
            tp += 1
        elif guess_is_pos:
            fp += 1
        elif truth_is_pos:
            fn += 1

    precision = tp / max(1, tp + fp)
    recall = tp / max(1, tp + fn)
    if (precision + recall) == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

# Evaluate on the test set: predict a label for every test sequence, then
# report accuracy and the F1 score of class 1, both rounded to 4 decimals.
y_pred = [predict_one_mnb(model, seq) for seq in X_test_seq]
print("Accuracy:", round(accuracy(y_test, y_pred), 4))
print("F1 (positive=1):", round(f1_binary(y_test, y_pred, positive=1), 4))


Accuracy: 0.7875
F1 (positive=1): 0.7769


## 5. Prediction helper: predict from raw token-id sequences

In [None]:
def predict_from_ids(seq_ids, max_len: int = 200):
    """
    Predict the label for a raw token-id sequence of arbitrary length.

    Steps:
      1. The sequence is truncated to its first `max_len` ids.
      2. If it is shorter than `max_len`, it is right-padded with the pad id
         stored in the trained model (`model["pad_id"]`). Using the model's
         own pad id guarantees the padding is exactly the id that
         `predict_one_mnb` ignores.
      3. The fixed-length sequence is passed to `predict_one_mnb` together
         with the module-level `model`.

    Args:
        seq_ids: token IDs of the sentence or text. Any iterable of ints is
            accepted (list, tuple, ...); the input is not modified.
        max_len (int): desired fixed sequence length (default 200).

    Returns:
        int: predicted label (0 = negative, 1 = positive).
    """
    pad_id = model["pad_id"]

    # Copy into a list so tuples and other iterables work, then truncate.
    seq = list(seq_ids)[:max_len]

    # Right-pad up to max_len; the multiplier is zero when nothing is missing.
    seq.extend([pad_id] * (max_len - len(seq)))

    # Send the processed sequence to the trained Naive Bayes model.
    return predict_one_mnb(model, seq)

# Demo of predict_from_ids: classify the first test sequence.
# NOTE(review): X_test_seq[0] raises IndexError if the test split is empty.
print("predict_from_ids demo (0/1):", predict_from_ids(X_test_seq[0]))


predict_from_ids demo (0/1): 1


## 6. Interactive single-sentence prediction

In [9]:
# Type any raw sentence and get the predicted label.
# Requirements:
#  - The model in this notebook must already be trained (run previous cells).
#  - A tokenizer vocabulary must exist at artifacts/tokenizer_word/tokenizer.json
#    (export it once from the tokenizer notebook).

import json, re
from pathlib import Path

# Default location of the tokenizer vocabulary JSON read by _load_tokenizer_vocab.
TOKENIZER_JSON = Path("artifacts/tokenizer_word/tokenizer.json")

def _basic_normalize(text: str) -> str:
    """Lowercase + keep letters/digits/apostrophes + collapse spaces."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def _load_tokenizer_vocab(json_path=TOKENIZER_JSON):
    """Load the tokenizer vocabulary from a JSON file.

    The JSON object is read for two keys: `id2word` (default: empty list),
    whose position-to-token order defines the token ids, and `type`
    (default: "word"). If a token appears more than once in `id2word`, its
    last position is the id stored in `word2id`.

    Args:
        json_path: path to the tokenizer JSON (a `Path` or a plain string).

    Returns:
        dict with keys "type", "id2word" and "word2id" (token -> id).

    Raises:
        FileNotFoundError: if `json_path` does not exist.
    """
    json_path = Path(json_path)  # also accept a plain string path
    if not json_path.exists():
        raise FileNotFoundError(
            f"Missing {json_path}. Please export tokenizer_word/tokenizer.json in the tokenizer notebook."
        )
    # Decode explicitly as UTF-8: without an encoding, read_text() uses the
    # platform default, which can differ between machines.
    meta = json.loads(json_path.read_text(encoding="utf-8"))
    id2word = meta.get("id2word", [])
    word2id = {w: i for i, w in enumerate(id2word)}
    ttype = meta.get("type", "word")
    return {"type": ttype, "id2word": id2word, "word2id": word2id}

def _encode_with_vocab(text: str, vocab: dict):
    """Encode raw text into token ids using a loaded vocabulary.

    The text is normalized first. It is then split into single characters
    when `vocab["type"]` is "char", and on whitespace otherwise. Tokens
    missing from `vocab["word2id"]` are mapped to id 1 (UNK).
    """
    normalized = _basic_normalize(text)
    if vocab["type"] == "char":
        tokens = list(normalized)
    else:
        tokens = normalized.split()
    return [vocab["word2id"].get(tok, 1) for tok in tokens]  # 1 = UNK

def predict_one_text(text: str):
    """Classify a raw sentence: load vocabulary, encode, then predict.

    The tokenizer vocabulary is loaded on every call. The encoded ids are
    handed to `predict_from_ids`, which truncates or pads them and runs the
    trained MNB model.
    """
    token_ids = _encode_with_vocab(text, _load_tokenizer_vocab())
    return predict_from_ids(token_ids)

# Read sentences until a blank line or end of input, echoing each prediction.
print("Type a sentence to classify (blank line to exit):")
while True:
    try:
        s = input("> ").strip()
    except EOFError:
        # Input stream closed: stop prompting.
        break
    if not s:
        break
    label = predict_one_text(s)
    print(f"Input: {s}")
    if label == 1:
        print("Prediction:", "positive")
    else:
        print("Prediction:", "negative")

Type a sentence to classify (blank line to exit):
Input: i love this movieee
Prediction: positive
Input: this movie so boring
Prediction: negative
