In [24]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import requests
from json.decoder import JSONDecodeError
import json

In [25]:
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    use_fast=True
)

# load weights via safetensors format
vision_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    use_safetensors=True
).vision_model


vision_model.eval()
if torch.cuda.is_available():
    vision_model.cuda()

def extract_image_features(image_path):
    """Return a single [hidden_size] vector for the input image."""
    img = Image.open(image_path).convert("RGB")
    inputs = processor(images=img, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = vision_model(**inputs)
    # take the first token (CLS) embedding
    return outputs.last_hidden_state[:, 0, :].tolist()[0]

In [26]:
img_path = "data/images/nm1055413_rm704041984_1977-4-2_2014.jpg"
feats = extract_image_features(img_path)
print("Feature vector length:", len(feats))

Feature vector length: 768


In [27]:
def llama_generate(prompt: str,
                   model: str = "llama3.2",
                   temperature: float = 0.8,
                   max_length: int = 40) -> str:

    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "temperature": temperature,
        "max_length": max_length,
        "stream":  True
    }
    res = requests.post(url, json=payload, stream=True)
    res.raise_for_status()

    full_text = []
    for line in res.iter_lines(decode_unicode=True):
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        # each JSON chunk has a "response" key
        chunk = data.get("response", "")
        if chunk:
            full_text.append(chunk)
        # stop once the model signals it's done
        if data.get("done", False):
            break

    return "".join(full_text).strip()


In [28]:
print(llama_generate("Hello, my name is", temperature=0.5, max_length=10))


I'm happy to chat with you. It seems like we just started our conversation and I haven't gotten a chance to know your name yet! Would you like to share it with me?
