# Gifty Vision Worker: Multimodal LLM Scoring

This worker uses **Qwen2-VL-7B-Instruct** to evaluate product giftability using both text and images. 

### Capabilities:
1. **Visual Analysis**: Sees the product's design, quality, and packaging.
2. **Text Analysis**: Understands title, category, and merchant.
3. **Logprob Scoring**: Converts the model's log-probabilities for the `GIFT` and `NOT_GIFT` answers into a probability score based on multimodal context.

### Requirements:
1. Set up your `INTERNAL_API_TOKEN` in Kaggle Secrets (Add-ons -> Secrets).

In [None]:
!pip -q install -U transformers accelerate bitsandbytes qwen_vl_utils requests Pillow tqdm

In [None]:
import os
import logging
import sys
from kaggle_secrets import UserSecretsClient

# --- Backend endpoint and verbosity switch ---
API_BASE_URL = "https://api.giftyai.ru"
DEBUG = True

# --- Identity of the model; these values are reported with every score ---
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
MODEL_VERSION = "v2.0-vision"
MODEL_TAG = "qwen2-vl-7b-fp16"

# --- Internal API credential ---
# Kaggle Secrets is tried first. Any failure there falls back to the
# INTERNAL_API_TOKEN environment variable, and finally to a placeholder string.
try:
    user_secrets = UserSecretsClient()
    INTERNAL_TOKEN = user_secrets.get_secret("INTERNAL_API_TOKEN")
except Exception:
    print("Warning: Could not fetch INTERNAL_API_TOKEN from Kaggle Secrets. Falling back to environment.")
    INTERNAL_TOKEN = os.getenv("INTERNAL_API_TOKEN", "default_internal_token")

# --- Logging: DEBUG level when the DEBUG switch is on, INFO otherwise ---
if DEBUG:
    log_level = logging.DEBUG
else:
    log_level = logging.INFO

# Log records are written to stdout so they show up in the cell output.
logging.basicConfig(
    level=log_level,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("GiftyVisionWorker")

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

logger.info(f"Loading Vision Model {MODEL_ID}...")

# The original note for this cell assumes a 2x T4 (32GB) runtime.
# Both the dtype and the weight placement are left to the library ("auto").
# NOTE(review): MODEL_TAG advertises fp16, but torch_dtype="auto" does not pin
# float16 here - confirm which dtype the checkpoint actually resolves to.
load_options = {"torch_dtype": "auto", "device_map": "auto"}
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)

# The processor is loaded from the same checkpoint id as the model.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Inference only: switch the module to evaluation mode.
model.eval()
logger.info("Multimodal model loaded successfully.")

In [None]:
import torch.nn.functional as F
from qwen_vl_utils import process_vision_info
from PIL import Image
import requests
from io import BytesIO

def get_image(url):
    """Download an image and return it as an RGB ``PIL.Image``.

    This is a best-effort helper: it never raises.

    Args:
        url: Address of the image. A falsy value (``None``, ``""``) is
            treated as "no image".

    Returns:
        The decoded image converted to RGB, or ``None`` when no URL was given
        or when the download / decoding failed (a warning is logged).
    """
    if not url:
        return None
    try:
        response = requests.get(url, timeout=10)
        # Report HTTP errors as such instead of handing an error page to PIL,
        # which would only produce a misleading "cannot identify image" message.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as e:
        logger.warning(f"Failed to load image from {url}: {e}")
        return None

def build_vision_prompt(item):
    """Build the single-turn chat message list used to judge one product.

    Args:
        item: Mapping describing the product. The keys read here are
            ``image_url``, ``title``, ``category`` and ``merchant``; missing
            text fields are rendered as ``None`` in the prompt.

    Returns:
        A list holding one ``user`` message. Its content starts with an image
        entry (only when ``image_url`` is truthy) followed by the text query.
    """
    # The instruction text is assembled line by line; the trailing spaces on
    # two of the lines are part of the prompt and are kept deliberately.
    text_query = (
        "Decide if this product is a good gift item for most people.\n"
        "Analyze the image and text. \n"
        "Utilitarian/chemical/spare parts -> NOT_GIFT. \n"
        "Decor/gadgets/jewelry/toys -> GIFT.\n"
        "\n"
        "Product:\n"
        f"- title: {item.get('title')}\n"
        f"- category: {item.get('category')}\n"
        f"- merchant: {item.get('merchant')}\n"
        "\n"
        "Briefly explain your reasoning (2 sentences max), "
        "then conclude with Answer: GIFT or Answer: NOT_GIFT."
    )

    parts = []
    image_ref = item.get('image_url')
    if image_ref:
        # The image entry goes first so it precedes the text in the message.
        parts.append({"type": "image", "image": image_ref})
    parts.append({"type": "text", "text": text_query})

    return [{"role": "user", "content": parts}]

@torch.no_grad()
def score_label_multimodal(full_prompt_text, image_inputs, label: str):
    """Return the summed log-probability of ``label`` as a continuation of the prompt.

    Args:
        full_prompt_text: Prompt text that the label is appended to.
        image_inputs: Image inputs forwarded unchanged to the processor.
        label: Candidate continuation to score (e.g. ``" GIFT"``).

    Returns:
        A Python float: the sum of the per-token log-probabilities of the
        label tokens.
    """
    def _encode(text):
        # Same processor call for both encodings so the image inputs are
        # handled identically in each.
        return processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="pt"
        ).to(model.device)

    scored_inputs = _encode(full_prompt_text + label)

    # The length of the prompt-only encoding marks where the label tokens begin.
    # NOTE(review): this assumes the prompt tokens are an exact prefix of the
    # prompt+label tokens - confirm for the labels in use.
    label_start = _encode(full_prompt_text).input_ids.shape[1]

    logits = model(**scored_inputs).logits

    total_logprob = 0.0
    label_token_ids = scored_inputs.input_ids[0, label_start:].tolist()
    for offset, token_id in enumerate(label_token_ids):
        # The distribution for a label token is read one position before it.
        step_logprobs = F.log_softmax(logits[0, label_start - 1 + offset, :], dim=-1)
        total_logprob += step_logprobs[token_id].item()
    return total_logprob

def process_one_vision(item):
    """Score one product with the vision-language model.

    The model first writes a short free-text rationale. That rationale is then
    appended to the prompt, and the ``" GIFT"`` and ``" NOT_GIFT"``
    continuations are scored and normalised into a probability.

    Args:
        item: Mapping describing the product. ``title`` and ``gift_id`` are
            read here for logging; the prompt fields are read by
            ``build_vision_prompt``.

    Returns:
        A tuple ``(p_final, reasoning)``: the GIFT probability rounded to two
        decimals, and the generated rationale text.
    """
    title = item.get('title')
    logger.debug(f"Processing VLM item: {title} (ID: {item.get('gift_id')})")

    messages = build_vision_prompt(item)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    # 1. Generate Reasoning
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    # Drop the echoed prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    gen_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Keep only what precedes the verdict marker. When the marker is absent,
    # split() returns the whole text, so no separate branch is needed.
    reasoning = gen_text.split("Answer:")[0].strip()
    logger.debug(f"VLM Reasoning: {reasoning}")

    # 2. Score Probabilities
    scoring_prompt = text + reasoning + "\nAnswer:"

    s_gift = score_label_multimodal(scoring_prompt, image_inputs, " GIFT")
    s_not = score_label_multimodal(scoring_prompt, image_inputs, " NOT_GIFT")

    # Softmax over the two label scores; index 1 is the GIFT probability.
    p = float(torch.softmax(torch.tensor([s_not, s_gift]), dim=0)[1].item())
    # Rounding to two decimals already maps anything below 0.005 to 0.0, so no
    # additional clamp of tiny values is required.
    p_final = round(p, 2)

    # The title may be missing or not a string; str() keeps this log line from
    # raising after the expensive scoring work has already been done.
    logger.info(f"Result: {p_final} | {str(title)[:30]}...")
    return p_final, reasoning

In [None]:
import time

# Every call to the internal API carries the shared token header.
headers = {"X-Internal-Token": INTERNAL_TOKEN}

logger.info("!!! VISION WORKER STARTING (api.giftyai.ru) !!!")

# Poll for tasks, score them one by one, submit the batch - forever.
# Any unexpected error in an iteration is logged and followed by a 30 s pause.
while True:
    try:
        # 1. Get tasks
        fetch_resp = requests.get(
            f"{API_BASE_URL}/internal/scoring/tasks?limit=10",
            headers=headers,
            timeout=30,
        )
        if fetch_resp.status_code != 200:
            logger.error(f"Error fetching tasks: {fetch_resp.status_code} | {fetch_resp.text}")
            time.sleep(30)
            continue

        batch = fetch_resp.json()
        if not batch:
            logger.info("No tasks found. Sleeping 5 min.")
            time.sleep(300)
            continue

        logger.info(f"[Batch] Processing {len(batch)} items with vision")
        scored = []
        for task in batch:
            # One failing item must not discard the rest of the batch.
            try:
                score, rationale = process_one_vision(task)
                record = {
                    "gift_id": task['gift_id'],
                    "llm_gift_score": score,
                    "llm_gift_reasoning": rationale,
                    "llm_scoring_model": MODEL_TAG,
                    "llm_scoring_version": MODEL_VERSION
                }
            except Exception as item_err:
                logger.error(f"Item {task.get('gift_id')} failed: {item_err}")
            else:
                scored.append(record)

        # 2. Submit results (skipped when every item in the batch failed)
        if not scored:
            continue

        submit_resp = requests.post(
            f"{API_BASE_URL}/internal/scoring/submit",
            json={"results": scored},
            headers=headers,
            timeout=30,
        )
        if submit_resp.status_code != 200:
            logger.error(f"Submission failed: {submit_resp.status_code}")
        else:
            logger.info(f"Successfully updated {submit_resp.json().get('updated')} items.")

    except Exception as loop_err:
        logger.error(f"Main loop error: {loop_err}", exc_info=True)
        time.sleep(30)