# LLM 8B Scoring with Reasoning (CoT)

This notebook scores products for giftability using a local LLM (Qwen2.5-7B-Instruct, loaded in 4-bit). For each product it first generates a brief reasoning, then computes the probability that the product is a gift, conditioned on that reasoning.

### Cell 1 — install

In [None]:
!pip -q install -U transformers accelerate bitsandbytes sentencepiece pandas tqdm pyarrow matplotlib seaborn

### Cell 2 — load model (4-bit)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"  # 7B parameters ("8B-ish" class)

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",
)
# Inference only. The trailing ";" keeps the cell from dumping the full
# module tree (the return value of .eval()) as its output.
model.eval();

### Cell 3 — scoring + reasoning logic

In [None]:
import torch.nn.functional as F

# System message for every scoring prompt (sent as the "system" role in build_prompt).
# The "Answer:" marker requested here is the delimiter that
# giftability_with_reasoning splits on to separate reasoning from verdict.
SYSTEM = """You are a strict giftability classifier.
First, provide a very brief reasoning in Russian or English (max 2 sentences).
Then conclude with 'Answer: GIFT' or 'Answer: NOT_GIFT'."""

def _text_or_blank(value):
    """Return the text form of ``value``, or ``""`` when it is missing (None / NaN / NA)."""
    if value is None:
        return ""
    try:
        # NaN is the only value that compares unequal to itself.
        if value != value:
            return ""
    except (TypeError, ValueError):
        # Sentinels such as pandas.NA refuse to be coerced to a boolean.
        return ""
    return f"{value}"


def build_prompt(title, category="", store="", price_rub=None):
    """Build the chat-formatted classification prompt for one product.

    Args:
        title: Product title.
        category: Product category (optional).
        store: Store name (optional).
        price_rub: Price in rubles (optional).

    Any field that is ``None`` or NaN (pandas' missing-value marker) is
    rendered as an empty string, so the model never sees a literal "nan"
    or "None" in the prompt.

    Returns:
        The prompt string produced by the tokenizer's chat template, ending
        with the generation prompt for the assistant turn.
    """
    title_txt = _text_or_blank(title)
    category_txt = _text_or_blank(category)
    store_txt = _text_or_blank(store)
    price_txt = _text_or_blank(price_rub)
    user = f"""Decide if this product is a good gift item for most people.
If it is mostly a utilitarian supply/chemical/spare part/consumable -> NOT_GIFT.
If it is a presentable gift item (decor, gadgets, jewelry, toys, hobby items) -> GIFT.

Product:
- title: {title_txt}
- category: {category_txt}
- store: {store_txt}
- price_rub: {price_txt}
Reasoning:"""
    msgs = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user},
    ]
    return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

@torch.no_grad()
def score_label_after_reasoning(prompt_with_reasoning: str, label: str) -> float:
    """Return the log-probability of ``label`` as a continuation of the prompt.

    The prompt and the label are tokenized separately and their ids are
    concatenated, so the label tokens are guaranteed to sit exactly after the
    prompt tokens (tokenizing the joined string can merge tokens across the
    boundary and shift the positions being scored).

    Args:
        prompt_with_reasoning: Full context that precedes the label.
        label: Continuation to score; include the leading space if needed,
            e.g. " GIFT".

    Returns:
        Sum of the per-token log-probabilities of the label tokens
        (0.0 when the label produces no tokens).

    Raises:
        ValueError: If the prompt produces no tokens, since there is then no
            position from which to predict the first label token.
    """
    prompt_ids = tok(prompt_with_reasoning, return_tensors="pt")["input_ids"]
    # No special tokens for the label: it is a mid-sequence continuation.
    label_ids = tok(label, return_tensors="pt", add_special_tokens=False)["input_ids"]

    prompt_len = prompt_ids.shape[1]
    n_label = label_ids.shape[1]
    if n_label == 0:
        return 0.0
    if prompt_len == 0:
        raise ValueError("prompt_with_reasoning must contain at least one token")

    input_ids = torch.cat([prompt_ids, label_ids], dim=1).to(model.device)
    logits = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
    ).logits

    # Logits at position t predict token t + 1, so the first label token is
    # predicted from the last prompt position.
    start = prompt_len - 1
    # Upcast before log-softmax: half-precision logits lose resolution when
    # normalized over a large vocabulary.
    step_logits = logits[0, start:start + n_label, :].float()
    log_probs = F.log_softmax(step_logits, dim=-1)
    targets = input_ids[0, prompt_len:].unsqueeze(-1)
    return float(log_probs.gather(-1, targets).sum().item())

@torch.no_grad()
def giftability_with_reasoning(title, category="", store="", price_rub=None, max_new_tokens=60):
    """Generate a short reasoning for a product, then score P(GIFT) given it.

    Steps:
      1. Greedily generate up to ``max_new_tokens`` tokens from the prompt.
      2. Keep only the text before the first "Answer:" as the reasoning.
      3. Score " GIFT" and " NOT_GIFT" as continuations of
         prompt + reasoning + "\\nAnswer:" and normalize the two scores.

    Args:
        title: Product title.
        category: Product category (optional).
        store: Store name (optional).
        price_rub: Price in rubles (optional).
        max_new_tokens: Generation budget for the reasoning (default 60).

    Returns:
        ``(p_gift, reasoning)`` where ``p_gift`` is the probability of GIFT
        rounded to two decimals and ``reasoning`` is the stripped generated
        reasoning text.
    """
    initial_prompt = build_prompt(title, category, store, price_rub)

    # 1. Generate the reasoning (greedy decoding, so the result is deterministic).
    inputs = tok(initial_prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tok.eos_token_id,
    )

    # generate() returns prompt + continuation; decode only the new tokens.
    prompt_len = inputs["input_ids"].shape[1]
    generated_only = tok.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

    # Everything before the first "Answer:" is the reasoning; if the marker is
    # absent (e.g. generation was cut off), the whole output is the reasoning.
    reasoning = generated_only.partition("Answer:")[0].strip()

    # 2. Score both labels conditioned on prompt + reasoning + "Answer:".
    prompt_for_scoring = initial_prompt + reasoning + "\nAnswer:"

    s_gift = score_label_after_reasoning(prompt_for_scoring, " GIFT")
    s_not = score_label_after_reasoning(prompt_for_scoring, " NOT_GIFT")

    # Softmax over the two log-scores gives P(GIFT) within the two-label space.
    p_gift = float(torch.softmax(torch.tensor([s_not, s_gift]), dim=0)[1].item())

    # Two-decimal rounding already maps anything below 0.005 to 0.0.
    p_gift = round(p_gift, 2)

    return p_gift, reasoning

### Cell 4 — Load Data (Parquet)

In [None]:
import pandas as pd
import numpy as np
import re

# Input parquet file (Kaggle dataset mount point).
PATH = "/kaggle/input/gifty-takprodam/haiku_100_enriched.parquet"

df = pd.read_parquet(PATH)
print(f"Rows loaded: {len(df)}")

# Peek at the first few rows as the cell output.
df.head(3)

In [None]:
# Normalize and clean
def to_float_safe(x):
    """Parse a price-like value into a float, returning NaN when impossible.

    Numbers are converted directly. Anything else is stringified, spaces are
    removed, a decimal comma becomes a dot, and every character other than
    digits and dots is stripped before conversion.

    Args:
        x: Scalar value (number, string, or a missing value).

    Returns:
        The parsed float, or ``np.nan`` for missing or unparseable input.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return float(x)
    s = str(x).replace(" ", "").replace(",", ".")
    s = re.sub(r"[^0-9.]", "", s)
    if not s:
        return np.nan
    try:
        return float(s)
    except ValueError:
        # e.g. "1.2.3" or "." -- only a parse failure is treated as missing;
        # unrelated errors (KeyboardInterrupt etc.) must propagate.
        return np.nan

# Derive a numeric "price" column from the raw price column, but only when
# the frame does not already carry one.
RAW_PRICE_COL = "Цена товара"
price_missing = "price" not in df.columns
if price_missing and RAW_PRICE_COL in df.columns:
    df["price"] = df[RAW_PRICE_COL].apply(to_float_safe)

# Rows without a title or category cannot be turned into a prompt.
required_cols = ["title", "category"]
df = df.dropna(subset=required_cols).reset_index(drop=True)
print(f"Clean rows: {len(df)}")

### Cell 5 — scoring + reasoning (sample 50 - slow due to generation)

In [None]:
from tqdm import tqdm

# Generation is slow, smaller sample by default
# Each row triggers a generation pass, so only a small fixed-seed sample is scored.
sample_size = min(50, len(df))
df_sample = df.sample(sample_size, random_state=42).copy()

# One (score, reasoning) pair per sampled row, in row order.
results = [
    giftability_with_reasoning(
        title=row.get("title", ""),
        category=row.get("category", ""),
        store=row.get("store_name", ""),
        price_rub=row.get("price", None),
    )
    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample))
]
scores = [score for score, _ in results]
reasonings = [text for _, text in results]

df_sample["llm_gift_score"] = scores
df_sample["llm_reasoning"] = reasonings

# Show the full reasoning text instead of a truncated column.
pd.set_option('display.max_colwidth', None)
preview_cols = ["title", "attr_is_giftable", "llm_gift_score", "llm_reasoning"]
df_sample[preview_cols].head(10)

### Cell 6 — Comparative Analytics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Correlation
# 1. Correlation between the baseline column and the LLM score
score_cols = ["attr_is_giftable", "llm_gift_score"]
corr_matrix = df_sample[score_cols].corr()
corr = corr_matrix.iloc[0, 1]
print(f"Correlation between Haiku baseline and Qwen2.5 (with reasoning): {corr:.4f}")

# 2. Overlaid density plots of both score distributions
fig, ax = plt.subplots(figsize=(10, 5))
for column, legend_label in (
    ("attr_is_giftable", "Haiku baseline"),
    ("llm_gift_score", "Qwen2.5 + Reasoning"),
):
    sns.kdeplot(df_sample[column], label=legend_label, fill=True, ax=ax)
ax.set_title("Giftability Score Distribution Comparison")
ax.legend()
plt.show()

# 3. Largest disagreements in each direction (positive diff = LLM scored higher)
df_sample["score_diff"] = df_sample["llm_gift_score"] - df_sample["attr_is_giftable"]
report_cols = ["title", "attr_is_giftable", "llm_gift_score", "llm_reasoning"]
for direction, sort_ascending in (("LLM > Haiku", False), ("Haiku > LLM", True)):
    print(f"\n--- Top 5 Disagreements ({direction}) ---")
    ranked = df_sample.sort_values("score_diff", ascending=sort_ascending)
    display(ranked[report_cols].head(5))