In [1]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer, BitsAndBytesConfig
from PIL import Image
import requests
import time
from io import BytesIO
from threading import Thread

In [2]:
# Checkpoint identifier handed to `from_pretrained` for both the model and the
# processor in the loading cell below.
model_id = "google/medgemma-1.5-4b-it"


In [3]:
# 4-bit quantization settings for loading the model. The options are spelled
# out as a mapping and unpacked into the config so each one carries its note.
quantization_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": True,
        # Fast internal computations.
        "bnb_4bit_compute_dtype": torch.bfloat16,
        # Format optimised for precision.
        "bnb_4bit_quant_type": "nf4",
        # Compresses the quantization parameters even further.
        "bnb_4bit_use_double_quant": True,
    }
)

In [4]:

# Load the model and its processor, placing everything on the CPU explicitly.
device = "cpu"

print("Cargando modelo en RAM... (Esto puede tardar un poco)")
# `trust_remote_code` is deliberately left at its default (False): enabling it
# lets Python code shipped in the Hub repository run on this machine.
# NOTE(review): assumes this checkpoint uses an architecture that ships with
# the installed transformers release — confirm before re-enabling the flag.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map={"": device},    # the empty key maps the whole model to `device`
    low_cpu_mem_usage=True,     # optimises RAM usage while loading
)
processor = AutoProcessor.from_pretrained(model_id)


Cargando modelo en RAM... (Esto puede tardar un poco)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:

# Download the sample chest X-ray and decode it into an RGB PIL image (`image`).
url = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Chest_Xray_PA_3-8-2010.png"

# Send a browser-like User-Agent header so the server does not reject the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    # `stream=True` is not used: the whole body is read right away via
    # `.content`, so deferring the download would gain nothing.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # turn HTTP error statuses (404, 403, ...) into exceptions

    # Decode the image straight from the downloaded bytes.
    image = Image.open(BytesIO(response.content)).convert("RGB")
    print("Imagen cargada con éxito.")

except requests.exceptions.RequestException as e:
    print(f"Error al descargar la imagen: {e}")
    # Re-raise: later cells need `image`; continuing would either fail with a
    # NameError or silently reuse an image left over from a previous run.
    raise
except Exception as e:
    print(f"Error al identificar la imagen: {e}")
    raise


Imagen cargada con éxito.


In [None]:

# Build a single-turn multimodal prompt, run generation in a background thread
# and print the text as it is streamed, then report the elapsed time.
messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Analyze this X-ray."}]}
]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
generation_kwargs = dict(
    **inputs,
    streamer=streamer,
    max_new_tokens=300,
    do_sample=False,
)

# Exceptions raised inside the worker thread are collected here so they can be
# re-raised in the main thread once streaming has finished.
generation_errors = []


def _generate_in_background():
    """Run `model.generate`; on failure, record the error and close the stream."""
    try:
        model.generate(**generation_kwargs)
    except Exception as exc:
        generation_errors.append(exc)
        # Without this the consumer loop below would wait forever for a stop
        # signal that a failed `generate` call never sends.
        streamer.end()


print("Generando respuesta en CPU...")
# Start the clock before the worker starts so the whole generation is timed.
start_time = time.time()

thread = Thread(target=_generate_in_background)
thread.start()

# Print the text as it arrives, keeping the pieces for later use.
print("\n--- Respuesta de MedGemma (Generando en tiempo real) ---\n")
streamed_chunks = []
for new_text in streamer:
    streamed_chunks.append(new_text)
    print(new_text, end="", flush=True)

thread.join()
end_time = time.time()

if generation_errors:
    raise generation_errors[0]

# Full generated answer. The text was already printed while streaming, so it is
# not printed a second time; the earlier summary print referenced `response`,
# which at this point is the HTTP response object from the download cell.
generated_text = "".join(streamed_chunks)

print("\n\n--- Generación completada ---")
print(f"\nTiempo tardado: {end_time - start_time:.2f} segundos.")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Generando respuesta en CPU...

--- Respuesta de MedGemma (Generando en tiempo real) ---



KeyboardInterrupt: 