# LLM 8B Scoring vs Haiku baseline

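This notebook scores product giftability with a local LLM (Qwen2.5-7B-Instruct, loaded in 4-bit) by comparing the log-probabilities the model assigns to the `GIFT` and `NOT_GIFT` labels, and compares the resulting scores with the `attr_is_giftable` baseline from Haiku.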

### Cell 1 — install

In [None]:
!pip -q install -U transformers accelerate bitsandbytes sentencepiece pandas tqdm pyarrow matplotlib seaborn

### Cell 2 — load model (4-bit)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model id; a 7B-class instruct model (the notebook title rounds this up to "8B").
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit NF4 quantization with double quantization; matmuls are computed in float16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",  # let accelerate decide where the weights are placed
)
# Switch to inference mode. The trailing semicolon keeps the (very long) module
# repr returned by eval() from being dumped as the cell output.
model.eval();

### Cell 3 — scoring logic (logprobs)

In [None]:
import torch.nn.functional as F

# System message for the classifier chat; build_prompt() puts it first in the message list.
# NOTE(review): score_label() sums log-probs over every token a label encodes to, so the
# scoring itself does not depend on each label being a single token despite the
# "ONLY one token" wording here — confirm how the tokenizer splits the labels.
SYSTEM = "You are a strict classifier. Answer with ONLY one token: GIFT or NOT_GIFT."

def build_prompt(title, category="", store="", price_rub=None):
    """Render one product as a chat-formatted GIFT / NOT_GIFT classification prompt.

    Args:
        title: Product title, interpolated into the prompt as-is.
        category: Product category (optional).
        store: Store name (optional).
        price_rub: Price shown on the ``price_rub`` line. ``None`` or NaN
            (what a pandas row yields for a missing cell) is rendered as an
            empty string instead of the literal text ``nan``.

    Returns:
        The prompt string produced by ``tok.apply_chat_template`` for a
        system + user message pair, with the generation prompt appended and
        no tokenization applied.
    """
    import math  # local import so this cell does not depend on another cell's imports

    # math.isnan raises TypeError for non-numeric values (e.g. a price passed
    # as a string); such values are kept and rendered verbatim, as before.
    try:
        price_missing = price_rub is None or math.isnan(price_rub)
    except TypeError:
        price_missing = False
    price_txt = "" if price_missing else f"{price_rub}"

    user = f"""Decide if this product is a good gift item for most people.
If it is mostly a utilitarian supply/chemical/spare part/consumable -> NOT_GIFT.
If it is a presentable gift item (decor, gadgets, jewelry, toys, hobby items) -> GIFT.

Product:
- title: {title}
- category: {category}
- store: {store}
- price_rub: {price_txt}
Answer:"""
    msgs = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user},
    ]
    return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

@torch.no_grad()
def score_label(prompt: str, label: str) -> float:
    """Return the summed log-probability of ``label`` as a continuation of ``prompt``.

    The prompt and ``prompt + label`` are tokenized with the module-level
    ``tok``; the model is run once over the full sequence and the log-probs of
    the label tokens (each conditioned on everything before it) are added up.

    Args:
        prompt: Context text (here: the chat-formatted prompt).
        label: Continuation to score, e.g. ``" GIFT"``.

    Returns:
        Sum of per-token natural-log probabilities of the label tokens, as a
        Python float. ``0.0`` when there is no label token to score.
    """
    prompt_ids = tok(prompt, return_tensors="pt")["input_ids"]
    input_ids = tok(prompt + label, return_tensors="pt")["input_ids"]
    prompt_len = prompt_ids.shape[1]

    # Slicing the label out of the joint tokenization is only valid when the
    # joint tokenization starts with the prompt's own tokens. If a token merge
    # crosses the prompt/label boundary, fall back to tokenizing the label on
    # its own and appending it to the prompt ids.
    prefix_matches = input_ids.shape[1] >= prompt_len and torch.equal(
        input_ids[:, :prompt_len], prompt_ids
    )
    if not prefix_matches:
        label_only = tok(label, add_special_tokens=False)["input_ids"]
        label_tensor = torch.tensor([label_only], dtype=prompt_ids.dtype)
        input_ids = torch.cat([prompt_ids, label_tensor], dim=1)

    # The token at position t is predicted by the logits at position t - 1, so
    # a token at position 0 has no context and cannot be scored.
    first_target = max(prompt_len, 1)
    if input_ids.shape[1] <= first_target:
        return 0.0

    input_ids = input_ids.to(model.device)
    logits = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
    ).logits

    # Upcast before log-softmax: half-precision log-probs over the whole
    # vocabulary are too coarse to compare two labels reliably.
    step_logits = logits[0, first_target - 1 : -1, :].float()
    logps = F.log_softmax(step_logits, dim=-1)
    targets = input_ids[0, first_target:].to(logps.device)
    return logps.gather(1, targets.unsqueeze(1)).sum().item()

@torch.no_grad()
def giftability_llm(title, category="", store="", price_rub=None):
    """Estimate the probability that the model labels a product as GIFT.

    Builds the classification prompt for the product, scores the two candidate
    answers (" GIFT" and " NOT_GIFT", both with a leading space) with
    score_label(), and normalizes the two log-scores with a softmax.

    Returns:
        The softmax weight of the GIFT label, a float in [0, 1].
    """
    chat_prompt = build_prompt(title, category, store, price_rub)

    logp_gift = score_label(chat_prompt, " GIFT")
    logp_not_gift = score_label(chat_prompt, " NOT_GIFT")

    # Index 0 -> NOT_GIFT, index 1 -> GIFT.
    label_probs = torch.softmax(torch.tensor([logp_not_gift, logp_gift]), dim=0)
    return float(label_probs[1].item())

### Cell 4 — Load Data (Parquet)

In [None]:
import pandas as pd
import numpy as np
import re

# Parquet file holding the Haiku-enriched product sample.
# NOTE(review): the /kaggle/input prefix suggests this only resolves inside a Kaggle
# session with the dataset attached — confirm before running elsewhere.
PATH = "/kaggle/input/gifty-takprodam/haiku_100_enriched.parquet"
df = pd.read_parquet(PATH)
print("Rows loaded:", len(df))
# Last expression of the cell: displays the first three rows as the cell output.
df.head(3)

In [None]:
# Normalize and clean
def to_float_safe(x):
    """Best-effort conversion of a price-like cell to ``float``.

    Missing values (anything ``pd.isna`` reports as missing) become NaN.
    ``int``/``float`` inputs are converted directly. Everything else is
    stringified, spaces are removed, commas become decimal points, and every
    character other than digits and ``.`` is stripped before parsing — so
    currency symbols disappear, and so does a leading minus sign.

    Returns:
        The parsed float, or ``np.nan`` when nothing parseable remains
        (empty string, or leftover text such as ``"1.2.3"``).
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return float(x)

    s = str(x).replace(" ", "").replace(",", ".")
    s = re.sub(r"[^0-9.]", "", s)
    if not s:
        return np.nan
    try:
        return float(s)
    except ValueError:
        # Only a malformed number is expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of becoming NaN.
        return np.nan

# Build a numeric "price" column from the raw "Цена товара" column (Russian for
# "product price"), but only when the frame does not already have a "price" column.
if "price" not in df.columns and "Цена товара" in df.columns:
    df["price"] = df["Цена товара"].apply(to_float_safe)

# Drop rows missing a title or a category, then renumber the index from 0.
# This rebinds `df`, so later cells see only the cleaned rows.
df = df.dropna(subset=["title", "category"]).reset_index(drop=True)
print("Clean rows:", len(df))

### Cell 5 — scoring (sample 200)

In [None]:
from tqdm import tqdm

# Score a fixed-seed random subset of at most 200 rows so the run stays short and repeatable.
df_sample = df.sample(min(200, len(df)), random_state=42).copy()


def _blank_if_missing(value, default=""):
    """Return `default` for None/NaN/NA cells so they never reach the prompt as the text 'nan'."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return default
    return value


scores = []
for _, r in tqdm(df_sample.iterrows(), total=len(df_sample)):
    # r.get() only falls back to its default when the column is absent; a
    # present-but-empty cell comes back as NaN, hence the extra normalization.
    p = giftability_llm(
        title=_blank_if_missing(r.get("title", "")),
        category=_blank_if_missing(r.get("category", "")),
        store=_blank_if_missing(r.get("store_name", "")),
        price_rub=_blank_if_missing(r.get("price", None), default=None),
    )
    scores.append(p)

df_sample["llm_gift_score"] = scores
# Cell output: first ten scored rows next to the Haiku baseline column.
df_sample[["title", "category", "attr_is_giftable", "llm_gift_score"]].head(10)

### Cell 6 — Comparative Analytics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Correlation
# corr() on the two score columns gives a 2x2 matrix; [0, 1] is the off-diagonal entry.
score_cols = ["attr_is_giftable", "llm_gift_score"]
corr = df_sample[score_cols].corr().iloc[0, 1]
print(f"Correlation between Haiku baseline and Qwen2.5: {corr:.4f}")

# 2. Distribution Plot
# Both densities are drawn on one explicit Axes so they can be compared directly.
fig, ax = plt.subplots(figsize=(10, 5))
for column, legend_label in (
    ("attr_is_giftable", "Haiku baseline"),
    ("llm_gift_score", "Qwen2.5 (local)"),
):
    sns.kdeplot(df_sample[column], label=legend_label, fill=True, ax=ax)
ax.set_title("Giftability Score Distribution Comparison")
ax.legend()
plt.show()

# 3. Disagreement analysis (Difference)
# Positive score_diff: the local LLM scored the item higher than the baseline column.
df_sample["score_diff"] = df_sample["llm_gift_score"] - df_sample["attr_is_giftable"]
df_sample["abs_diff"] = df_sample["score_diff"].abs()

shown_cols = ["title", "category", "attr_is_giftable", "llm_gift_score"]
disagreement_views = (
    ("\n--- Top 10 Disagreements (LLM thinks GIFT, Haiku thinks NOT) ---", False),
    ("\n--- Top 10 Disagreements (Haiku thinks GIFT, LLM thinks NOT) ---", True),
)
for heading, sort_ascending in disagreement_views:
    print(heading)
    display(df_sample.sort_values("score_diff", ascending=sort_ascending)[shown_cols].head(10))