# LLM 8B Scoring (Probability)

This notebook scores products with a local LLM (Qwen2.5-7B-Instruct) loaded in 4-bit quantization. For each product it outputs a probability-like score between 0 and 1 that the item is a suitable gift.

### Cell 1 — install

In [None]:
!pip -q install -U transformers accelerate bitsandbytes sentencepiece pandas tqdm

### Cell 2 — load model (4-bit)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face checkpoint used for scoring ("8B-ish"; works great for RU/EN text).
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit NF4 weights with double quantization; compute dtype is fp16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# device_map="auto" lets the loader decide where the quantized weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb,
)
model.eval()

### Cell 3 — prompt + scoring of the two labels (no generation)

In [None]:
import torch.nn.functional as F

# System message for the classifier; build_prompt sends it as the "system" turn of the chat.
# NOTE(review): the labels are scored as strings (" GIFT", " NOT_GIFT") that may span several
# tokenizer tokens, so "one token" is an instruction to the model, not a tokenizer guarantee.
SYSTEM = "You are a strict classifier. Answer with ONLY one token: GIFT or NOT_GIFT."

def _blank_if_missing(value):
    """Return ``""`` for ``None``/NaN, otherwise the value formatted as text."""
    import math

    if value is None:
        return ""
    try:
        if math.isnan(value):
            return ""
    except (TypeError, ValueError):
        # Not a real number (e.g. a str) -> cannot be NaN.
        pass
    return f"{value}"


def build_prompt(title, category="", store="", price_rub=None):
    """Render the chat-formatted classification prompt for one product.

    Missing values (``None`` or a float NaN, which is what pandas yields for an
    empty cell) are rendered as empty strings, so the model never sees the
    literal text "nan" or "None" as a category, store or price.

    Args:
        title: Product title.
        category: Product category text (optional).
        store: Store name (optional).
        price_rub: Price; formatted with ``f"{...}"`` when present.

    Returns:
        The string produced by ``tok.apply_chat_template`` for the system and
        user messages, with the generation prompt appended and no tokenization.
    """
    title = _blank_if_missing(title)
    category = _blank_if_missing(category)
    store = _blank_if_missing(store)
    price_txt = _blank_if_missing(price_rub)
    user = f"""Decide if this product is a good gift item for most people.
If it is mostly a utilitarian supply/chemical/spare part/consumable -> NOT_GIFT.
If it is a presentable gift item (decor, gadgets, jewelry, toys, hobby items) -> GIFT.

Product:
- title: {title}
- category: {category}
- store: {store}
- price_rub: {price_txt}
Answer:"""
    # Qwen chat template
    msgs = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user},
    ]
    return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

@torch.no_grad()
def score_label(prompt: str, label: str) -> float:
    """Return the summed log-probability of ``label`` as a continuation of ``prompt``.

    ``prompt + label`` is tokenized as one string; the label tokens are the ids
    that follow the first ``len(tokenized prompt)`` positions. One forward pass
    gives the logits, and the token at position ``i`` is scored with the logits
    at position ``i - 1``.

    Args:
        prompt: Context text (already chat-formatted).
        label: Continuation whose log-probability is wanted.

    Returns:
        Sum of per-token log-probabilities (natural log) as a Python float;
        ``0.0`` when the label contributes no tokens.

    Raises:
        ValueError: If the prompt tokenizes to zero tokens, since there is then
            no position whose logits predict the first label token.
    """
    full = prompt + label
    enc_full = tok(full, return_tensors="pt").to(model.device)
    enc_prompt = tok(prompt, return_tensors="pt").to(model.device)

    prompt_len = enc_prompt["input_ids"].shape[1]
    label_ids = enc_full["input_ids"][0, prompt_len:]
    n_label = label_ids.shape[0]
    if n_label == 0:
        # Empty continuation: log-probability of "nothing" is 0; skip the forward pass.
        return 0.0
    if prompt_len == 0:
        raise ValueError("prompt must tokenize to at least one token")

    logits = model(**enc_full).logits  # [B, T, V]

    # Only the positions that predict label tokens are needed; upcast to float32
    # so the normalisation over the vocabulary is not done in half precision.
    start = prompt_len - 1
    step_logits = logits[0, start:start + n_label, :].float()
    log_probs = F.log_softmax(step_logits, dim=-1)

    # Pick the log-prob of each actual label token in one gather (no per-token sync).
    index = label_ids.to(log_probs.device).unsqueeze(1)
    picked = log_probs.gather(1, index).squeeze(1)
    return float(picked.sum().item())

@torch.no_grad()
def giftability_llm(title, category="", store="", price_rub=None):
    """Score one product as a gift, returning a value in [0, 1].

    The prompt is built once and both candidate answers are scored with
    ``score_label``; the two log-scores are then normalised against each other
    with a softmax, so the result is relative to these two labels only.
    """
    product_prompt = build_prompt(title, category, store, price_rub)

    logp_gift = score_label(product_prompt, " GIFT")
    logp_not_gift = score_label(product_prompt, " NOT_GIFT")

    # Index 0 = NOT_GIFT, index 1 = GIFT.
    pair = torch.tensor([logp_not_gift, logp_gift])
    gift_share = torch.softmax(pair, dim=0)[1]
    return float(gift_share.item())

### Cell 4 — Load Data

Reading from CSV with `;` separator.

In [None]:
import pandas as pd

# Update path to your csv file
# The file is read with ';' as the field separator; column headers are kept as exported
# and are renamed to internal names by the COLMAP step in the next cell.
df = pd.read_csv('products.csv', sep=';')
# Preview the first three rows.
df.head(3)

In [None]:
import numpy as np
import re

# --- 3) Normalize columns (Russian -> internal) ---
# Header as it appears in the export -> short internal column name.
COLMAP = {
    "Название товара": "title",
    "Артикул": "sku",
    "Цена товара": "price",
    "Категория товара": "category",
    "Изображение": "image_url",
    "Название магазина": "store_name",
    "Название маркетплейса": "marketplace_name",
    "Рейтинг": "rating",
    "Партнерская ссылка": "partner_url",
    "Модель оплаты": "pay_model",
    "Комиссия": "commission",
    "Потенциальная комиссия": "potential_commission",
    "Цена клика": "cpc",
    "Юридические данные продавца": "legal_text",
}

# Apply only the mappings whose source header is present in this file.
df = df.rename(
    columns=dict(pair for pair in COLMAP.items() if pair[0] in df.columns)
)

# Stop early if a column that later steps rely on is still absent.
required = ["title", "sku", "price", "category", "image_url", "partner_url"]
missing = [name for name in required if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing required columns after rename: {missing}")

def to_float_safe(x):
    """Best-effort conversion of a cell value to ``float``; NaN when not parseable.

    Handles numbers written as text with currency symbols, spaces (including
    non-breaking ones) and either ``,`` or ``.`` as the decimal mark. When both
    marks occur, the right-most one is taken as the decimal mark and the other
    as a thousands separator ("1,234.56" and "1.234,56" both give 1234.56).
    Any sign is discarded, because every non-digit character is stripped.

    Args:
        x: Scalar cell value (number, string, ``None``/NaN).

    Returns:
        ``float`` value, or ``np.nan`` for missing or unparseable input.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)

    # Keep only digits and the two possible separator characters.
    s = re.sub(r"[^0-9.,]", "", str(x))
    if "," in s and "." in s:
        # Mixed separators: drop the one used for thousands grouping.
        thousands_mark = "." if s.rfind(",") > s.rfind(".") else ","
        s = s.replace(thousands_mark, "")
    s = s.replace(",", ".")

    try:
        return float(s) if s else np.nan
    except ValueError:
        # e.g. "." or "1.2.3" -> not a number
        return np.nan

# Convert price (and rating, when that column exists) to floats via to_float_safe.
df["price"] = df["price"].apply(to_float_safe)
if "rating" in df.columns:
    df["rating"] = df["rating"].apply(to_float_safe)

# Keep essentials
# Rows missing any of these fields are dropped; "price" is not in the subset,
# so rows with a NaN price are kept.
df = df.dropna(subset=["title", "sku", "category", "image_url", "partner_url"]).reset_index(drop=True)
print("Clean rows:", len(df))
# Preview the first three cleaned rows.
df.head(3)

### Cell 5 — run on a sample (e.g. 200 rows)

In [None]:
from tqdm import tqdm

# Fixed-seed sample of at most 200 rows; .copy() so the new column does not touch df.
df_sample = df.sample(min(200, len(df)), random_state=0).copy()

# One LLM score per sampled row, in row order.
scores = [
    giftability_llm(
        title=row.get("title", ""),
        category=row.get("category", ""),
        store=row.get("store_name", ""),
        price_rub=row.get("price", None),  # 'price' is the normalized column
    )
    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample))
]

df_sample["llm_gift_score"] = scores

df_sample[["title", "category", "llm_gift_score"]].head(10)