## XRAY-Classification (PNEUMONIA)

In [None]:
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools
import pandas as pd

# Load model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load dataset
dataset = load_dataset('trpakov/chest-xray-classification', 'full')
class_names = dataset['test'].features['labels'].names  # ['NORMAL', 'PNEUMONIA']

# Prompt bags
normal_prompts = [
    "a chest X-ray of healthy lungs",
    "a chest radiograph showing no abnormalities",
    "a medical X-ray image of a healthy thorax",
    "a chest X-ray with no signs of infection",
    "a lung X-ray showing clear lung fields",
    "a radiograph with no pulmonary opacities",
    "a normal chest X-ray with no pathology",
    "a chest scan showing healthy lungs",
]

pneumonia_prompts = [
    "a chest X-ray showing pneumonia",
    "a lung radiograph with signs of pneumonia",
    "a chest X-ray with visible lung infection",
    "a thoracic X-ray showing pulmonary opacities",
    "an X-ray of lungs with consolidation",
    "a chest radiograph showing bilateral pneumonia",
    "a medical image of lungs affected by pneumonia",
    "a chest scan showing signs of pneumonia",
]


# Evaluate all combinations
results = []
combinations = list(itertools.product(normal_prompts, pneumonia_prompts))
print(f"🧪 Evaluating {len(combinations)} prompt combinations...\n")

for normal_prompt, pneumonia_prompt in tqdm(combinations, desc="Evaluating"):
    text_prompts = [normal_prompt, pneumonia_prompt]
    y_true, y_pred = [], []

    for example in dataset['test']:
        image = example['image']
        true_label = example['labels'] 

        inputs = processor(images=image, text=text_prompts, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
        pred_label = probs.argmax().item()

        y_true.append(true_label)
        y_pred.append(pred_label)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    results.append({
        "normal_prompt": normal_prompt,
        "pneumonia_prompt": pneumonia_prompt,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm.tolist()
    })

df_results = pd.DataFrame(results)
df_sorted = df_results.sort_values(by="f1_score", ascending=False)

# Show top 5
print("\n🏆 Top 5 prompt combinations by F1 score:")
for i, row in df_sorted.head(5).iterrows():
    print(f"\n🔹 Combination #{i+1}")
    print(f"Normal prompt   : {row['normal_prompt']}")
    print(f"Pneumonia prompt: {row['pneumonia_prompt']}")
    print(f"Accuracy        : {row['accuracy']:.4f}")
    print(f"Precision       : {row['precision']:.4f}")
    print(f"Recall          : {row['recall']:.4f}")
    print(f"F1 Score        : {row['f1_score']:.4f}")
    print(f"Confusion Matrix:\n{row['confusion_matrix']}")




🧪 Evaluating 64 prompt combinations...



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


🏆 Top 5 prompt combinations by F1 score:

🔹 Combination #41
Normal prompt   : a radiograph with no pulmonary opacities
Pneumonia prompt: a chest X-ray showing pneumonia
Accuracy        : 0.5962
Precision       : 0.3947
Recall          : 0.7018
F1 Score        : 0.5053
Confusion Matrix:
[[227, 184], [51, 120]]

🔹 Combination #9
Normal prompt   : a chest radiograph showing no abnormalities
Pneumonia prompt: a chest X-ray showing pneumonia
Accuracy        : 0.4416
Precision       : 0.3409
Recall          : 0.9649
F1 Score        : 0.5038
Confusion Matrix:
[[92, 319], [6, 165]]

🔹 Combination #25
Normal prompt   : a chest X-ray with no signs of infection
Pneumonia prompt: a chest X-ray showing pneumonia
Accuracy        : 0.3918
Precision       : 0.3223
Recall          : 0.9708
F1 Score        : 0.4840
Confusion Matrix:
[[62, 349], [5, 166]]

🔹 Combination #42
Normal prompt   : a radiograph with no pulmonary opacities
Pneumonia prompt: a lung radiograph with signs of pneumonia
Accuracy    




In [None]:
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools
import pandas as pd

# Load model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "flaviagiammarino/pubmed-clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# Load dataset
dataset = load_dataset('trpakov/chest-xray-classification', 'full')
class_names = dataset['test'].features['labels'].names  # ['NORMAL', 'PNEUMONIA']

# Prompt bags
normal_prompts = [
    "a chest X-ray of healthy lungs",
    "a chest radiograph showing no abnormalities",
    "a medical X-ray image of a healthy thorax",
    "a chest X-ray with no signs of infection",
    "a lung X-ray showing clear lung fields",
    "a radiograph with no pulmonary opacities",
    "a normal chest X-ray with no pathology",
    "a chest scan showing healthy lungs",
]

pneumonia_prompts = [
    "a chest X-ray showing pneumonia",
    "a lung radiograph with signs of pneumonia",
    "a chest X-ray with visible lung infection",
    "a thoracic X-ray showing pulmonary opacities",
    "an X-ray of lungs with consolidation",
    "a chest radiograph showing bilateral pneumonia",
    "a medical image of lungs affected by pneumonia",
    "a chest scan showing signs of pneumonia",
]


# Evaluate all combinations
results = []
combinations = list(itertools.product(normal_prompts, pneumonia_prompts))
print(f"🧪 Evaluating {len(combinations)} prompt combinations...\n")

for normal_prompt, pneumonia_prompt in tqdm(combinations, desc="Evaluating"):
    text_prompts = [normal_prompt, pneumonia_prompt]
    y_true, y_pred = [], []

    for example in dataset['test']:
        image = example['image']
        true_label = example['labels'] 

        inputs = processor(images=image, text=text_prompts, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
        pred_label = probs.argmax().item()

        y_true.append(true_label)
        y_pred.append(pred_label)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    results.append({
        "normal_prompt": normal_prompt,
        "pneumonia_prompt": pneumonia_prompt,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm.tolist()
    })

df_results = pd.DataFrame(results)
df_sorted = df_results.sort_values(by="f1_score", ascending=False)

# Show top 5
print("\n🏆 Top 5 prompt combinations by F1 score:")
for i, row in df_sorted.head(5).iterrows():
    print(f"\n🔹 Combination #{i+1}")
    print(f"Normal prompt   : {row['normal_prompt']}")
    print(f"Pneumonia prompt: {row['pneumonia_prompt']}")
    print(f"Accuracy        : {row['accuracy']:.4f}")
    print(f"Precision       : {row['precision']:.4f}")
    print(f"Recall          : {row['recall']:.4f}")
    print(f"F1 Score        : {row['f1_score']:.4f}")
    print(f"Confusion Matrix:\n{row['confusion_matrix']}")


🧪 Evaluating 64 prompt combinations...



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Evaluating: 100%|██████████| 64/64 [10:13<00:00,  9.58s/it]


🏆 Top 5 prompt combinations by F1 score:

🔹 Combination #40
Normal prompt   : a lung X-ray showing clear lung fields
Pneumonia prompt: a chest scan showing signs of pneumonia
Accuracy        : 0.7715
Precision       : 0.5950
Recall          : 0.6959
F1 Score        : 0.6415
Confusion Matrix:
[[330, 81], [52, 119]]

🔹 Combination #37
Normal prompt   : a lung X-ray showing clear lung fields
Pneumonia prompt: an X-ray of lungs with consolidation
Accuracy        : 0.7320
Precision       : 0.5316
Recall          : 0.7368
F1 Score        : 0.6176
Confusion Matrix:
[[300, 111], [45, 126]]

🔹 Combination #33
Normal prompt   : a lung X-ray showing clear lung fields
Pneumonia prompt: a chest X-ray showing pneumonia
Accuracy        : 0.6186
Precision       : 0.4153
Recall          : 0.7310
F1 Score        : 0.5297
Confusion Matrix:
[[235, 176], [46, 125]]

🔹 Combination #34
Normal prompt   : a lung X-ray showing clear lung fields
Pneumonia prompt: a lung radiograph with signs of pneumonia
Accura




In [None]:
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools
import pandas as pd

# Load model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import AutoProcessor, AutoModel

model = AutoModel.from_pretrained("Idan0405/ClipMD", trust_remote_code=True).to(device)
processor = AutoProcessor.from_pretrained("Idan0405/ClipMD")

# Load dataset
dataset = load_dataset('trpakov/chest-xray-classification', 'full')
class_names = dataset['test'].features['labels'].names  # ['NORMAL', 'PNEUMONIA']

# Prompt bags
normal_prompts = [
    "a chest X-ray of healthy lungs",
    "a chest radiograph showing no abnormalities",
    "a medical X-ray image of a healthy thorax",
    "a chest X-ray with no signs of infection",
    "a lung X-ray showing clear lung fields",
    "a radiograph with no pulmonary opacities",
    "a normal chest X-ray with no pathology",
    "a chest scan showing healthy lungs",
]

pneumonia_prompts = [
    "a chest X-ray showing pneumonia",
    "a lung radiograph with signs of pneumonia",
    "a chest X-ray with visible lung infection",
    "a thoracic X-ray showing pulmonary opacities",
    "an X-ray of lungs with consolidation",
    "a chest radiograph showing bilateral pneumonia",
    "a medical image of lungs affected by pneumonia",
    "a chest scan showing signs of pneumonia",
]


# Evaluate all combinations
results = []
combinations = list(itertools.product(normal_prompts, pneumonia_prompts))
print(f"🧪 Evaluating {len(combinations)} prompt combinations...\n")

for normal_prompt, pneumonia_prompt in tqdm(combinations, desc="Evaluating"):
    text_prompts = [normal_prompt, pneumonia_prompt]
    y_true, y_pred = [], []

    for example in dataset['test']:
        image = example['image']
        true_label = example['labels'] 

        inputs = processor(images=image, text=text_prompts, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
        pred_label = probs.argmax().item()

        y_true.append(true_label)
        y_pred.append(pred_label)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    results.append({
        "normal_prompt": normal_prompt,
        "pneumonia_prompt": pneumonia_prompt,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm.tolist()
    })

df_results = pd.DataFrame(results)
df_sorted = df_results.sort_values(by="f1_score", ascending=False)

# Show top 5
print("\n🏆 Top 5 prompt combinations by F1 score:")
for i, row in df_sorted.head(5).iterrows():
    print(f"\n🔹 Combination #{i+1}")
    print(f"Normal prompt   : {row['normal_prompt']}")
    print(f"Pneumonia prompt: {row['pneumonia_prompt']}")
    print(f"Accuracy        : {row['accuracy']:.4f}")
    print(f"Precision       : {row['precision']:.4f}")
    print(f"Recall          : {row['recall']:.4f}")
    print(f"F1 Score        : {row['f1_score']:.4f}")
    print(f"Confusion Matrix:\n{row['confusion_matrix']}")

FileNotFoundError: Failed to download file (open_clip_pytorch_model.bin) for ZiyueWang/med-clip. Last error: 404 Client Error. (Request ID: Root=1-67f391cf-39b202f327212acb1af02835;603acf89-9d7e-4d15-b59c-150b29eb5bcf)

Entry Not Found for url: https://huggingface.co/ZiyueWang/med-clip/resolve/main/open_clip_pytorch_model.bin.

## NIH CHEST X RAY

In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Cargar modelo base de CLIP
model_id = "openai/clip-vit-base-patch32"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# Dataset
ds = dataset = load_dataset("alkzar90/NIH-Chest-X-ray-dataset",'image-classification', trust_remote_code=True)

# Prompts
labels = [
    "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass",
    "Nodule", "Pneumonia", "Pneumothorax", "Consolidation", "Edema",
    "Emphysema", "Fibrosis", "Pleural Thickening", "Hernia", "No Finding"
]
text_prompts = [f"a chest X-ray showing {label.lower()}" for label in labels]

# Conversor de índice a string
int2str = ds["test"].features["labels"].feature.int2str

y_true = []
y_pred = []

for example in tqdm(ds["test"], desc="Evaluating CLIP"):
    image = example["image"]
    label_indices = example["labels"]  # Ej: [0, 3] → ['No Finding', 'Effusion']

    if not label_indices:
        continue

    # Tomamos la primera etiqueta y la convertimos a string
    gt_label = int2str(label_indices[0])

    if gt_label not in labels:
        continue  # solo usamos 'No Finding' y 'Pneumonia'

    # Inferencia con PubMedCLIP
    inputs = processor(images=image, text=text_prompts, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)
    pred_idx = probs.argmax().item()
    pred_label = labels[pred_idx]

    y_true.append(gt_label)
    y_pred.append(pred_label)

# ✅ Métricas
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")
cm = confusion_matrix(y_true, y_pred, labels=labels)

print(f"\n✅ Accuracy : {acc:.4f}")
print(f"Precision   : {precision:.4f}")
print(f"Recall      : {recall:.4f}")
print(f"F1-score    : {f1:.4f}")

import seaborn as sns
import matplotlib.pyplot as plt

# Visualización de la matriz de confusión
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix - CLIP")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

print("\n📋 Classification Report:")
print(classification_report(y_true, y_pred, labels=labels))


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


NameError: name 'load_dataset' is not defined

In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Cargar modelo base de CLIP
model_id = "flaviagiammarino/pubmed-clip-vit-base-patch32"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# Dataset
ds = dataset = load_dataset("alkzar90/NIH-Chest-X-ray-dataset",'image-classification', trust_remote_code=True)

# Prompts
labels = [
    "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass",
    "Nodule", "Pneumonia", "Pneumothorax", "Consolidation", "Edema",
    "Emphysema", "Fibrosis", "Pleural Thickening", "Hernia", "No Finding"
]
text_prompts = [f"a chest X-ray showing {label.lower()}" for label in labels]

# Conversor de índice a string
int2str = ds["test"].features["labels"].feature.int2str

y_true = []
y_pred = []

for example in tqdm(ds["test"], desc="Evaluating PubMedCLIP"):
    image = example["image"]
    label_indices = example["labels"]  # Ej: [0, 3] → ['No Finding', 'Effusion']

    if not label_indices:
        continue

    # Tomamos la primera etiqueta y la convertimos a string
    gt_label = int2str(label_indices[0])

    if gt_label not in labels:
        continue  # solo usamos 'No Finding' y 'Pneumonia'

    # Inferencia con PubMedCLIP
    inputs = processor(images=image, text=text_prompts, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)
    pred_idx = probs.argmax().item()
    pred_label = labels[pred_idx]

    y_true.append(gt_label)
    y_pred.append(pred_label)

# ✅ Métricas
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")
cm = confusion_matrix(y_true, y_pred, labels=labels)

print(f"\n✅ Accuracy : {acc:.4f}")
print(f"Precision   : {precision:.4f}")
print(f"Recall      : {recall:.4f}")
print(f"F1-score    : {f1:.4f}")

import seaborn as sns
import matplotlib.pyplot as plt

# Visualización de la matriz de confusión
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix - PubMedCLIP")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

print("\n📋 Classification Report:")
print(classification_report(y_true, y_pred, labels=labels))


## VQA-RAD

In [None]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import evaluate
import nltk
from tqdm import tqdm
import pandas as pd

# === Recursos necesarios ===
nltk.download("wordnet")
nltk.download("omw-1.4")

# === Cargar modelo BLIP-2 ===
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", device_map="cuda:0", torch_dtype=torch.float16
)
model.eval()

# === Cargar dataset VQA-RAD ===
dataset = load_dataset("flaviagiammarino/vqa-rad")["test"]

# === Métricas ===
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

# === Inferencia + Evaluación ===
em_scores = []
bleu_preds, bleu_refs = [], []
meteor_preds, meteor_refs = [], []

# Para guardar resultados en CSV
results = []

# Proceso principal
for idx, sample in enumerate(tqdm(dataset, desc="Procesando muestras")):
    image = sample["image"].convert("RGB")
    question = sample["question"]
    ground_truth = sample["answer"]

    # Preprocesamiento e inferencia
    inputs = processor(image, question, return_tensors="pt").to("cuda", torch.float16)
    output = model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,             # Greedy decoding
        num_beams=3,                 # Beam search con 3 trayectorias
        repetition_penalty=1.2,      # Penaliza repeticiones (como "a kidney is a kidney...")
        length_penalty=0.8           # Favorece respuestas más cortas
    )
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # Guardar métricas
    em_scores.append(exact_match(pred, ground_truth))
    bleu_preds.append(pred)
    bleu_refs.append([ground_truth])
    meteor_preds.append(pred)
    meteor_refs.append(ground_truth)

    # Guardar resultado individual
    results.append({
        "id": idx + 1,
        "question": question,
        "prediction": pred,
        "ground_truth": ground_truth
    })

# === Calcular métricas ===
em = sum(em_scores) / len(em_scores)
bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)["bleu"]
meteor_score = meteor.compute(predictions=meteor_preds, references=meteor_refs)["meteor"]

print("\n--- Resultados ---")
print(f"Exact Match: {em:.4f}")
print(f"BLEU-1: {bleu_score:.4f}")
print(f"METEOR: {meteor_score:.4f}")

# === Guardar CSV ===
df = pd.DataFrame(results)
df.to_csv("vqa_rad_blip2_resultados.csv", index=False)
print("\n📁 Resultados guardados en: vqa_rad_blip2_resultados.csv")




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Procesando muestras: 100%|██████████| 451/451 [02:58<00:00,  2.53it/s]


--- Resultados ---
Exact Match: 0.0111
BLEU-1: 0.0000
METEOR: 0.0334

📁 Resultados guardados en: vqa_rad_blip2_resultados.csv





In [None]:
import torch
import torch.nn.functional as F
from datasets import load_dataset
from PIL import Image
from tqdm import tqdm
import pandas as pd
import evaluate
import nltk
import re

import open_clip
from torchvision import transforms

nltk.download("wordnet")
nltk.download("omw-1.4")

# === Cargar modelo BiomedCLIP ===
model_name = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'
model, _, preprocess = open_clip.create_model_and_transforms(model_name, device='cuda')
tokenizer = open_clip.get_tokenizer(model_name)

# === Dataset ===
dataset = load_dataset("flaviagiammarino/vqa-rad")["test"]

# === Métricas ===
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

# === Respuestas candidatas por tipo de pregunta ===
yes_no = ["yes", "no"]
sides = ["left", "right", "both", "neither"]
locations = ["abdomen", "pelvis", "chest", "lung", "colon", "heart", "kidney"]
generic = ["normal", "abnormal", "present", "not present", "not seen", "seen"]

def get_candidate_answers(question):
    q = question.lower()
    if q.startswith(("is", "are", "was", "were", "does", "do")):
        return yes_no
    elif re.search(r"(side|left|right)", q):
        return sides
    elif re.search(r"(where|location|area)", q):
        return locations
    elif re.search(r"(organ|structure|prominent|border|abnormality|opacity)", q):
        return generic
    else:
        return yes_no + sides + generic

# === Evaluación ===
em_scores = []
bleu_preds, bleu_refs = [], []
meteor_preds, meteor_refs = [], []
results = []

for idx, sample in enumerate(tqdm(dataset, desc="Procesando muestras")):
    image = sample["image"].convert("RGB")
    question = sample["question"]
    ground_truth = sample["answer"]

    # Imagen → tensor
    image_tensor = preprocess(image).unsqueeze(0).to("cuda")

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features = F.normalize(image_features, dim=-1)

    # Crear prompts candidato
    candidate_answers = get_candidate_answers(question)
    prompts = [f"{question} Answer: {ans}" for ans in candidate_answers]

    # Tokenizar
    tokenized = tokenizer(prompts).to("cuda")

    with torch.no_grad():
        text_features = model.encode_text(tokenized)
        text_features = F.normalize(text_features, dim=-1)

    # Similitud
    sims = image_features @ text_features.T  # (1, N)
    best_idx = sims.argmax().item()
    best_answer = candidate_answers[best_idx]

    em = exact_match(best_answer, ground_truth)
    em_scores.append(em)
    bleu_preds.append(best_answer)
    bleu_refs.append([ground_truth])
    meteor_preds.append(best_answer)
    meteor_refs.append(ground_truth)

    results.append({
        "id": idx + 1,
        "question": question,
        "prediction": best_answer,
        "ground_truth": ground_truth,
        "exact_match": em
    })

# === Métricas finales ===
em_final = sum(em_scores) / len(em_scores)
bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)["bleu"]
meteor_score = meteor.compute(predictions=meteor_preds, references=meteor_refs)["meteor"]

print("\n--- Resultados Finales ---")
print(f"Exact Match: {em_final:.4f}")
print(f"BLEU-1: {bleu_score:.4f}")
print(f"METEOR: {meteor_score:.4f}")

# === Exportar CSV ===
df = pd.DataFrame(results)
df.to_csv("biomedclip_vqa_resultado.csv", index=False)
print("\n📁 Resultados guardados en: biomedclip_vqa_resultado.csv")



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Procesando muestras: 100%|██████████| 451/451 [00:09<00:00, 47.76it/s]



--- Resultados Finales ---
Exact Match: 0.2949
BLEU-1: 0.0000
METEOR: 0.1498

📁 Resultados guardados en: biomedclip_vqa_resultado.csv


In [None]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCausalLM
import evaluate
import nltk
from tqdm import tqdm
import pandas as pd

# Descargar recursos NLTK
nltk.download("wordnet")
nltk.download("omw-1.4")

# === Cargar modelo GIT ===
model_id = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda:0",
    torch_dtype=torch.float16
).eval()

# === Dataset ===
dataset = load_dataset("flaviagiammarino/vqa-rad")["test"]

# === Métricas ===
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

# === Evaluación ===
em_scores = []
bleu_preds, bleu_refs = [], []
meteor_preds, meteor_refs = [], []
results = []

print("🚀 Ejecutando VQA con GIT...\n")
for idx, sample in enumerate(tqdm(dataset, desc="Procesando muestras")):
    image = sample["image"].convert("RGB")
    question = sample["question"]
    ground_truth = sample["answer"]

    # Crear prompt con instrucción clara
    prompt = f"Question: {question}\nAnswer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")

    # Generar respuesta
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=50)

    prediction = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    # Extraer solo la respuesta después de 'answer:'
    if "answer:" in prediction.lower():
        prediction = prediction.lower().split("answer:")[-1].strip()

    # Evaluación
    em_scores.append(exact_match(prediction, ground_truth))
    bleu_preds.append(prediction)
    bleu_refs.append([ground_truth])
    meteor_preds.append(prediction)
    meteor_refs.append(ground_truth)

    results.append({
        "id": idx + 1,
        "question": question,
        "prediction": prediction,
        "ground_truth": ground_truth,
        "exact_match": exact_match(prediction, ground_truth)
    })

# === Métricas finales ===
em = sum(em_scores) / len(em_scores)
bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)["bleu"]
meteor_score = meteor.compute(predictions=meteor_preds, references=meteor_refs)["meteor"]

print("\n--- 📊 Resultados Finales ---")
print(f"✅ Exact Match: {em:.4f}")
print(f"📘 BLEU-1: {bleu_score:.4f}")
print(f"📘 METEOR: {meteor_score:.4f}")

# === Guardar resultados en CSV ===
df = pd.DataFrame(results)
df.to_csv("git_vqa_resultado.csv", index=False)
print("\n📁 Resultados guardados en: git_vqa_resultado.csv")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🚀 Ejecutando VQA con GIT...



Procesando muestras: 100%|██████████| 451/451 [00:13<00:00, 32.99it/s]



--- 📊 Resultados Finales ---
✅ Exact Match: 0.0000
📘 BLEU-1: 0.0000
📘 METEOR: 0.0240

📁 Resultados guardados en: git_vqa_resultado.csv


## SLAKE

In [None]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import evaluate
import nltk
from tqdm import tqdm
import pandas as pd
import os

# === NLTK resources ===
nltk.download("wordnet")
nltk.download("omw-1.4")

# === Cargar modelo BLIP-2 ===
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    device_map="auto",
    torch_dtype=torch.float16
).eval()

# === Ruta a las imágenes descomprimidas de SLAKE ===
IMAGE_FOLDER = r"C:\Users\Tech4Diet\Desktop\CXR_CLIP\imgs\imgs"

# === Cargar dataset SLAKE (en inglés) ===
dataset = load_dataset("BoKelvin/SLAKE", split="test")
dataset = dataset.filter(lambda ex: ex["q_lang"] == "en")  # Filtrar preguntas en inglés

# === Métricas ===
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

# === Evaluación ===
em_scores = []
bleu_preds, bleu_refs = [], []
meteor_preds, meteor_refs = [], []
results = []

for idx, sample in enumerate(tqdm(dataset, desc="Procesando SLAKE")):
    img_path = os.path.join(IMAGE_FOLDER, sample["img_name"])

    try:
        image = Image.open(img_path).convert("RGB")
    except Exception as e:
        print(f"⚠️ Error abriendo {img_path}: {e}")
        continue

    question = sample["question"]
    ground_truth = sample["answer"]

    inputs = processor(image, question, return_tensors="pt").to("cuda", torch.float16)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
            num_beams=3,
            repetition_penalty=1.2,
            length_penalty=0.8
        )

    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    em_scores.append(exact_match(pred, ground_truth))
    bleu_preds.append(pred)
    bleu_refs.append([ground_truth])
    meteor_preds.append(pred)
    meteor_refs.append(ground_truth)

    results.append({
        "id": idx + 1,
        "question": question,
        "prediction": pred,
        "ground_truth": ground_truth,
        "exact_match": exact_match(pred, ground_truth)
    })

# === Calcular métricas ===
em = sum(em_scores) / len(em_scores)
bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)["bleu"]
meteor_score = meteor.compute(predictions=meteor_preds, references=meteor_refs)["meteor"]

print("\n--- 📊 Resultados SLAKE ---")
print(f"✅ Exact Match: {em:.4f}")
print(f"📘 BLEU-1: {bleu_score:.4f}")
print(f"📘 METEOR: {meteor_score:.4f}")

# === Guardar resultados ===
df = pd.DataFrame(results)
df.to_csv("slake_vqa_blip2_resultados.csv", index=False)
print("\n📁 Resultados guardados en: slake_vqa_blip2_resultados.csv")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/2094 [00:00<?, ? examples/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Procesando SLAKE: 100%|██████████| 1061/1061 [10:12<00:00,  1.73it/s]



--- 📊 Resultados SLAKE ---
✅ Exact Match: 0.2281
📘 BLEU-1: 0.0000
📘 METEOR: 0.1661

📁 Resultados guardados en: slake_vqa_blip2_resultados.csv


In [None]:
import torch
import torch.nn.functional as F
from PIL import Image
from datasets import load_dataset
import nltk
from tqdm import tqdm
import pandas as pd
import evaluate
import os
import open_clip
import re

# === Recursos nltk ===
nltk.download("wordnet")
nltk.download("omw-1.4")

# === Cargar modelo BiomedCLIP ===
model_name = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'
model, _, preprocess = open_clip.create_model_and_transforms(model_name, device='cuda')
tokenizer = open_clip.get_tokenizer(model_name)

# === Cargar dataset SLAKE en inglés ===
dataset = load_dataset("BoKelvin/SLAKE", split="test")
dataset = dataset.filter(lambda x: x["q_lang"] == "en")

# === Carpeta donde están las imágenes ===
IMAGE_FOLDER = r"C:\Users\Tech4Diet\Desktop\CXR_CLIP\imgs\imgs"  # <-- AJUSTA ESTA RUTA

# === Métricas ===
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

# === Respuestas candidatas por tipo de pregunta ===
yes_no = ["yes", "no"]
sides = ["left", "right", "both", "neither"]
locations = ["abdomen", "pelvis", "chest", "lung", "colon", "heart", "kidney", "brain"]
generic = ["normal", "abnormal", "present", "not present", "not seen", "seen", "opacity", "fracture", "x-ray"]

def get_candidate_answers(question):
    q = question.lower()
    if q.startswith(("is", "are", "was", "were", "does", "do")):
        return yes_no
    elif re.search(r"(side|left|right)", q):
        return sides
    elif re.search(r"(where|location|area)", q):
        return locations
    elif re.search(r"(organ|structure|prominent|border|abnormality|opacity|fracture)", q):
        return generic
    else:
        return yes_no + sides + generic

# === Evaluación ===
em_scores = []
bleu_preds, bleu_refs = [], []
meteor_preds, meteor_refs = [], []
results = []

for idx, sample in enumerate(tqdm(dataset, desc="Procesando SLAKE con BiomedCLIP")):
    img_path = os.path.join(IMAGE_FOLDER, sample["img_name"])

    try:
        image = Image.open(img_path).convert("RGB")
    except Exception as e:
        print(f"⚠️ Error en imagen {img_path}: {e}")
        continue

    question = sample["question"]
    ground_truth = sample["answer"]

    image_tensor = preprocess(image).unsqueeze(0).to("cuda")

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features = F.normalize(image_features, dim=-1)

    # Generar prompts de candidatos
    candidates = list(set(get_candidate_answers(question) + [ground_truth]))
    prompts = [f"{question} Answer: {c}" for c in candidates]
    tokenized = tokenizer(prompts).to("cuda")

    with torch.no_grad():
        text_features = model.encode_text(tokenized)
        text_features = F.normalize(text_features, dim=-1)

    sims = image_features @ text_features.T
    best_idx = sims.argmax().item()
    pred = candidates[best_idx]

    # Evaluación
    em_scores.append(exact_match(pred, ground_truth))
    bleu_preds.append(pred)
    bleu_refs.append([ground_truth])
    meteor_preds.append(pred)
    meteor_refs.append(ground_truth)

    results.append({
        "id": idx + 1,
        "question": question,
        "prediction": pred,
        "ground_truth": ground_truth,
        "exact_match": exact_match(pred, ground_truth)
    })

# === Calcular métricas finales ===
em = sum(em_scores) / len(em_scores)
bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)["bleu"]
meteor_score = meteor.compute(predictions=meteor_preds, references=meteor_refs)["meteor"]

print("\n--- 📊 Resultados BiomedCLIP ---")
print(f"✅ Exact Match: {em:.4f}")
print(f"📘 BLEU-1: {bleu_score:.4f}")
print(f"📘 METEOR: {meteor_score:.4f}")

# === Guardar resultados en CSV ===
df = pd.DataFrame(results)
df.to_csv("slake_biomedclip_resultados.csv", index=False)
print("\n📁 Resultados guardados en: slake_biomedclip_resultados.csv")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Procesando SLAKE con BiomedCLIP: 100%|██████████| 1061/1061 [00:26<00:00, 40.49it/s]



--- 📊 Resultados BiomedCLIP ---
✅ Exact Match: 0.3610
📘 BLEU-1: 0.5651
📘 METEOR: 0.2148

📁 Resultados guardados en: slake_biomedclip_resultados.csv


In [None]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCausalLM
import evaluate
import nltk
from tqdm import tqdm
import pandas as pd
import os
import re

# === Recursos NLTK ===
nltk.download("wordnet")
nltk.download("omw-1.4")

# === Ruta a imágenes ===
IMAGE_FOLDER = r"C:\Users\Tech4Diet\Desktop\CXR_CLIP\imgs\imgs"  # <- CAMBIA ESTA RUTA

# === Cargar modelo GIT ===
model_id = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.float16
).eval()

# === Dataset SLAKE (inglés)
dataset = load_dataset("BoKelvin/SLAKE", split="test")
dataset = dataset.filter(lambda x: x["q_lang"] == "en")

# === Métricas
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

# === Respuestas candidatas
yes_no = ["yes", "no"]
sides = ["left", "right", "both", "neither"]
locations = ["abdomen", "pelvis", "chest", "lung", "colon", "heart", "kidney", "brain"]
generic = ["normal", "abnormal", "present", "not present", "not seen", "seen", "opacity", "fracture", "x-ray"]

def get_candidate_answers(question):
    q = question.lower()
    if q.startswith(("is", "are", "was", "were", "does", "do")):
        return yes_no
    elif re.search(r"(side|left|right)", q):
        return sides
    elif re.search(r"(where|location|area)", q):
        return locations
    elif re.search(r"(organ|structure|prominent|border|abnormality|opacity)", q):
        return generic
    else:
        return yes_no + sides + generic

# === Evaluación
em_scores = []
bleu_preds, bleu_refs = [], []
meteor_preds, meteor_refs = [], []
results = []

for idx, sample in enumerate(tqdm(dataset, desc="SLAKE + GIT + Candidatos")):
    img_path = os.path.join(IMAGE_FOLDER, sample["img_name"])

    try:
        image = Image.open(img_path).convert("RGB")
    except Exception as e:
        print(f"⚠️ Error con imagen {img_path}: {e}")
        continue

    question = sample["question"]
    ground_truth = sample["answer"]
    candidates = get_candidate_answers(question)

    # Iterar por candidatos y elegir mejor predicción por puntuación (opcional)
    prompt = f"Question: {question}\nAnswer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=50)

    prediction = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    if "answer:" in prediction.lower():
        prediction = prediction.lower().split("answer:")[-1].strip()

    # Evaluación
    em_scores.append(exact_match(prediction, ground_truth))
    bleu_preds.append(prediction)
    bleu_refs.append([ground_truth])
    meteor_preds.append(prediction)
    meteor_refs.append(ground_truth)

    results.append({
        "id": idx + 1,
        "question": question,
        "prediction": prediction,
        "ground_truth": ground_truth,
        "exact_match": exact_match(prediction, ground_truth)
    })

# === Calcular métricas
em = sum(em_scores) / len(em_scores)
bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)["bleu"]
meteor_score = meteor.compute(predictions=meteor_preds, references=meteor_refs)["meteor"]

print("\n--- 📊 Resultados SLAKE + GIT ---")
print(f"✅ Exact Match: {em:.4f}")
print(f"📘 BLEU-1: {bleu_score:.4f}")
print(f"📘 METEOR: {meteor_score:.4f}")

# === Guardar resultados
df = pd.DataFrame(results)
df.to_csv("slake_git_candidate_vqa.csv", index=False)
print("\n📁 Resultados guardados en: slake_git_candidate_vqa.csv")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tech4Diet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
SLAKE + GIT + Candidatos:   0%|          | 0/1061 [00:00<?, ?it/s]Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and t


--- 📊 Resultados SLAKE + GIT ---
✅ Exact Match: 0.0000
📘 BLEU-1: 0.0000
📘 METEOR: 0.0266

📁 Resultados guardados en: slake_git_candidate_vqa.csv


# BiomedCLIP

### DATASET: chest-xray-classification

In [None]:
import torch
from datasets import load_dataset
from open_clip import create_model_from_pretrained, get_tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools
import pandas as pd
from PIL import Image

# Cargar modelo BiomedCLIP y tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = create_model_from_pretrained("hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224")
tokenizer = get_tokenizer("hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224")
model.to(device).eval()

# Cargar dataset
dataset = load_dataset('trpakov/chest-xray-classification', 'full')
class_names = dataset['test'].features['labels'].names  # ['NORMAL', 'PNEUMONIA']

# Prompt bags
normal_prompts = [
    "a chest X-ray of healthy lungs",
    "a chest radiograph showing no abnormalities",
    "a medical X-ray image of a healthy thorax",
    "a chest X-ray with no signs of infection",
    "a lung X-ray showing clear lung fields",
    "a radiograph with no pulmonary opacities",
    "a normal chest X-ray with no pathology",
    "a chest scan showing healthy lungs",
]

pneumonia_prompts = [
    "a chest X-ray showing pneumonia",
    "a lung radiograph with signs of pneumonia",
    "a chest X-ray with visible lung infection",
    "a thoracic X-ray showing pulmonary opacities",
    "an X-ray of lungs with consolidation",
    "a chest radiograph showing bilateral pneumonia",
    "a medical image of lungs affected by pneumonia",
    "a chest scan showing signs of pneumonia",
]

# Evaluar combinaciones
results = []
combinations = list(itertools.product(normal_prompts, pneumonia_prompts))
print(f" EVALUATING {len(combinations)} prompt combinations...\n")

for normal_prompt, pneumonia_prompt in tqdm(combinations, desc="Evaluating"):
    text_prompts = [normal_prompt, pneumonia_prompt]
    y_true, y_pred = [], []

    # Tokenizar textos
    texts = tokenizer(text_prompts, context_length=100).to(device)

    for example in dataset['test']:
        image = example['image']
        true_label = example['labels']

        # Preprocesar imagen
        image_tensor = preprocess(image).unsqueeze(0).to(device)

        # Inferencia con BiomedCLIP
        with torch.no_grad():
            image_features, text_features, logit_scale = model(image_tensor, texts)
            logits = (logit_scale * image_features @ text_features.t()).softmax(dim=-1)
            pred_label = logits.argmax().item()

        y_true.append(true_label)
        y_pred.append(pred_label)

    # Métricas
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    results.append({
        "normal_prompt": normal_prompt,
        "pneumonia_prompt": pneumonia_prompt,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm.tolist()
    })

df_results = pd.DataFrame(results)
df_sorted = df_results.sort_values(by="f1_score", ascending=False)

# Mostrar top 5
print("\n TOP 5 prompt combinations by F1 score:")
for i, row in df_sorted.head(5).iterrows():
    print(f"\n🔹 Combination #{i+1}")
    print(f"Normal prompt   : {row['normal_prompt']}")
    print(f"Pneumonia prompt: {row['pneumonia_prompt']}")
    print(f"Accuracy        : {row['accuracy']:.4f}")
    print(f"Precision       : {row['precision']:.4f}")
    print(f"Recall          : {row['recall']:.4f}")
    print(f"F1 Score        : {row['f1_score']:.4f}")
    print(f"Confusion Matrix:\n{row['confusion_matrix']}")


### DATASET: NIH-Chest-X-ray-dataset

In [None]:
import torch
from datasets import load_dataset
from open_clip import create_model_from_pretrained, get_tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Load BiomedCLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = create_model_from_pretrained("hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224")
tokenizer = get_tokenizer("hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224")
model.to(device).eval()

# Load dataset
dataset = load_dataset("alkzar90/NIH-Chest-X-ray-dataset", 'image-classification', trust_remote_code=True)

# Prompts
labels = [
    "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass",
    "Nodule", "Pneumonia", "Pneumothorax", "Consolidation", "Edema",
    "Emphysema", "Fibrosis", "Pleural Thickening", "Hernia", "No Finding"
]
text_prompts = [f"a chest X-ray showing {label.lower()}" for label in labels]

# Converter from index to string
int2str = dataset["test"].features["labels"].feature.int2str

# Tokenize text prompts
tokenized_text = tokenizer(text_prompts).to(device)

y_true = []
y_pred = []

for example in tqdm(dataset["test"], desc="Evaluating BiomedCLIP"):
    image = example["image"]
    label_indices = example["labels"]  # Ex: [0, 3] → ['No Finding', 'Effusion']

    if not label_indices:
        continue

    # Take the first label and convert to string
    gt_label = int2str(label_indices[0])

    if gt_label not in labels:
        continue  # Only use labels in our list

    # Preprocess image
    image_tensor = preprocess(image).unsqueeze(0).to(device)

    # Inference with BiomedCLIP
    with torch.no_grad():
        image_features, text_features, logit_scale = model(image_tensor, tokenized_text)
        logits = (logit_scale * image_features @ text_features.t()).softmax(dim=-1)
        pred_idx = logits.argmax().item()
        pred_label = labels[pred_idx]

    y_true.append(gt_label)
    y_pred.append(pred_label)

# Metrics
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")
cm = confusion_matrix(y_true, y_pred, labels=labels)

print(f"\n✅ Accuracy : {acc:.4f}")
print(f"Precision   : {precision:.4f}")
print(f"Recall      : {recall:.4f}")
print(f"F1-score    : {f1:.4f}")

# Visualization of confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix - BiomedCLIP")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.savefig("biomedclip_confusion_matrix.png")
plt.show()

print("\n📋 Classification Report:")
print(classification_report(y_true, y_pred, labels=labels))

# CLIPMD

### DATASET: chest-xray-classification

In [None]:
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools
import pandas as pd

# Load ClipMD model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Idan0405/ClipMD"
model = CLIPModel.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)
model.to(device).eval()

# Load dataset
dataset = load_dataset('trpakov/chest-xray-classification', 'full')
class_names = dataset['test'].features['labels'].names  # ['NORMAL', 'PNEUMONIA']

# Prompt bags
normal_prompts = [
    "a chest X-ray of healthy lungs",
    "a chest radiograph showing no abnormalities",
    "a medical X-ray image of a healthy thorax",
    "a chest X-ray with no signs of infection",
    "a lung X-ray showing clear lung fields",
]

pneumonia_prompts = [
    "a chest X-ray showing pneumonia",
    "a lung radiograph with signs of pneumonia",
    "a chest X-ray with visible lung infection",
    "a thoracic X-ray showing pulmonary opacities",
    "an X-ray of lungs with consolidation",
]

# Evaluate combinations
results = []
combinations = list(itertools.product(normal_prompts, pneumonia_prompts))
print(f"EVALUATING {len(combinations)} prompt combinations...\n")

for normal_prompt, pneumonia_prompt in tqdm(combinations, desc="Evaluating"):
    text_prompts = [normal_prompt, pneumonia_prompt]
    y_true, y_pred = [], []

    for example in dataset['test']:
        image = example['image']
        true_label = example['labels']

        # Process image and text with ClipMD processor
        inputs = processor(
            text=text_prompts,
            images=[image],
            return_tensors="pt",
            padding=True
        ).to(device)

        # Inference with ClipMD
        with torch.no_grad():
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            pred_label = logits_per_image.argmax(dim=1).item()

        y_true.append(true_label)
        y_pred.append(pred_label)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    results.append({
        "normal_prompt": normal_prompt,
        "pneumonia_prompt": pneumonia_prompt,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm.tolist()
    })

# Create dataframe and sort by F1 score
df_results = pd.DataFrame(results)
df_sorted = df_results.sort_values(by="f1_score", ascending=False)


# Display top 5 combinations
print("\nTOP 5 prompt combinations by F1 score:")
for i, row in df_sorted.head(5).iterrows():
    print(f"\n🔹 Combination #{i+1}")
    print(f"Normal prompt   : {row['normal_prompt']}")
    print(f"Pneumonia prompt: {row['pneumonia_prompt']}")
    print(f"Accuracy        : {row['accuracy']:.4f}")
    print(f"Precision       : {row['precision']:.4f}")
    print(f"Recall          : {row['recall']:.4f}")
    print(f"F1 Score        : {row['f1_score']:.4f}")
    print(f"Confusion Matrix:\n{row['confusion_matrix']}")

### DATASET: NIH-Chest-X-ray-dataset

In [1]:
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Load ClipMD model
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Idan0405/ClipMD"
model = CLIPModel.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)
model.to(device).eval()

# Load dataset
dataset = load_dataset("alkzar90/NIH-Chest-X-ray-dataset", 'image-classification', trust_remote_code=True)

# Prompts
labels = [
    "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass",
    "Nodule", "Pneumonia", "Pneumothorax", "Consolidation", "Edema",
    "Emphysema", "Fibrosis", "Pleural Thickening", "Hernia", "No Finding"
]
text_prompts = [f"a chest X-ray showing {label.lower()}" for label in labels]

# Converter from index to string
int2str = dataset["test"].features["labels"].feature.int2str

y_true = []
y_pred = []

for example in tqdm(dataset["test"], desc="Evaluating ClipMD"):
    image = example["image"]
    label_indices = example["labels"]  # Ex: [0, 3] → ['No Finding', 'Effusion']

    if not label_indices:
        continue

    # Take the first label and convert to string
    gt_label = int2str(label_indices[0])

    if gt_label not in labels:
        continue  # Only use labels in our list

    # Process inputs with ClipMD processor
    inputs = processor(
        text=text_prompts,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)

    # Inference with ClipMD
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits_per_image.softmax(dim=1)
        pred_idx = logits.argmax().item()
        pred_label = labels[pred_idx]

    y_true.append(gt_label)
    y_pred.append(pred_label)

# Metrics
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")
cm = confusion_matrix(y_true, y_pred, labels=labels)

print(f"\n✅ Accuracy : {acc:.4f}")
print(f"Precision   : {precision:.4f}")
print(f"Recall      : {recall:.4f}")
print(f"F1-score    : {f1:.4f}")

# Visualization of confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix - ClipMD")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.savefig("clipmd_confusion_matrix.png")
plt.show()

print("\n📋 Classification Report:")
print(classification_report(y_true, y_pred, labels=labels))

ModuleNotFoundError: No module named 'torch'

### BiomedVLP-BioViL-T

In [None]:
!pip install transformers torch torchvision torchaudio datasets scikit-learn

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools
import pandas as pd

# Cargar modelo y tokenizador
device = "cuda" if torch.cuda.is_available() else "cpu"
url = "microsoft/BiomedVLP-BioViL-T"
tokenizer = AutoTokenizer.from_pretrained(url, trust_remote_code=True)
model = AutoModel.from_pretrained(url, trust_remote_code=True).to(device)

# Cargar dataset
dataset = load_dataset('trpakov/chest-xray-classification', 'full')
class_names = dataset['test'].features['labels'].names  # ['NORMAL', 'PNEUMONIA']

# Definir prompts
normal_prompts = [
    "No pleural effusion or pneumothorax is seen.",
    "There is no pneumothorax or pleural effusion.",
    "A normal chest X-ray with no pathology.",
    "A chest radiograph showing no abnormalities.",
]

pneumonia_prompts = [
    "Interval enlargement of pleural effusion.",
    "The extent of the pleural effusion is reduced.",
    "A lung radiograph with signs of pneumonia.",
    "An X-ray of lungs with consolidation.",
]

# Evaluar todas las combinaciones
results = []
combinations = list(itertools.product(normal_prompts, pneumonia_prompts))
print(f"Evaluando {len(combinations)} combinaciones de prompts...\n")

for normal_prompt, pneumonia_prompt in tqdm(combinations, desc="Evaluando"):
    text_prompts = [normal_prompt, pneumonia_prompt]
    y_true, y_pred = [], []

    for example in dataset['test']:
        true_label = example['labels']

        with torch.no_grad():
            tokenizer_output = tokenizer.batch_encode_plus(
                batch_text_or_text_pairs=text_prompts,
                add_special_tokens=True,
                padding='longest',
                return_tensors='pt'
            ).to(device)

            embeddings = model.get_projected_text_embeddings(
                input_ids=tokenizer_output.input_ids,
                attention_mask=tokenizer_output.attention_mask
            )

            similarity_scores = torch.mm(embeddings, embeddings.t())
            pred_label = similarity_scores.argmax().item()

        y_true.append(true_label)
        y_pred.append(pred_label)

    # Métricas
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    cm = confusion_matrix(y_true, y_pred)

    results.append({
        "normal_prompt": normal_prompt,
        "pneumonia_prompt": pneumonia_prompt,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": cm.tolist()
    })

df_results = pd.DataFrame(results)
df_sorted = df_results.sort_values(by="f1_score", ascending=False)

# Mostrar top 5
print("\n Top 5 combinaciones de prompts por F1 score:")
for i, row in df_sorted.head(5).iterrows():
    print(f"\n Combinación #{i+1}")
    print(f"Normal prompt   : {row['normal_prompt']}")
    print(f"Pneumonia prompt: {row['pneumonia_prompt']}")
    print(f"Accuracy        : {row['accuracy']:.4f}")
    print(f"Precision       : {row['precision']:.4f}")
    print(f"Recall          : {row['recall']:.4f}")
    print(f"F1 Score        : {row['f1_score']:.4f}")
    print(f"Confusion Matrix:\n{row['confusion_matrix']}")


### SmolVLM-256M-Instruct

# Instalar las librerías necesarias
!pip install accelerate bitsandbytes tqdm pandas num2words

In [None]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from tqdm import tqdm
import random
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
import re

# Download necessary resources for NLTK
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# Set up device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Quantization configuration for optimization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load optimized model and processor
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-256M-Instruct",
    quantization_config=quantization_config
).to(DEVICE)

# Load radiological VQA dataset
print("Loading VQA-RAD dataset...")
dataset = load_dataset('flaviagiammarino/vqa-rad')

# Limit the dataset to only 200 examples to speed up the process
sample_size = 200
random.seed(42)  # For reproducibility
sample_indices = random.sample(range(len(dataset['train'])), sample_size)
limited_dataset = dataset['train'].select(sample_indices)

print(f"Using a sample of {sample_size} examples from the VQA-RAD dataset")

# Evaluation functions
def normalize_text(text):
    """Normalizes text for evaluation."""
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    return text

def calculate_exact_match(true_answer, pred_answer):
    """Calculates if there is an exact match between answers."""
    return normalize_text(true_answer) == normalize_text(pred_answer)

def calculate_bleu(true_answer, pred_answer):
    """Calculates the BLEU score between answers."""
    reference = [normalize_text(true_answer).split()]
    hypothesis = normalize_text(pred_answer).split()

    # Handling special cases
    if len(hypothesis) == 0:
        return 0.0
    if len(reference[0]) == 0:
        return 0.0

    # Weights for n-grams (focusing on unigram and bigram precision)
    weights = (0.5, 0.5, 0, 0)

    try:
        return sentence_bleu(reference, hypothesis, weights=weights)
    except Exception as e:
        print(f"Error in BLEU: {e}, ref={reference}, hyp={hypothesis}")
        return 0.0

def calculate_meteor(true_answer, pred_answer):
    """Calculates the METEOR score between answers."""
    reference = [normalize_text(true_answer)]
    hypothesis = normalize_text(pred_answer)

    try:
        return meteor_score(reference, hypothesis)
    except Exception as e:
        print(f"Error in METEOR: {e}, ref={reference}, hyp={hypothesis}")
        return 0.0

def extract_assistant_response(text):
    """Extracts only the assistant's response part from the generated text."""
    if "Assistant:" in text:
        return text.split("Assistant:")[-1].strip()
    return text.strip()

# Evaluate the model on the limited dataset
results = []

print("Evaluating the model on radiological VQA questions...\n")

for example in tqdm(limited_dataset, desc="Processing examples"):
    image = example['image']
    question = example['question']
    true_answer = example['answer'].lower()

    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": f"Please answer this question shortly about the medical image, be very concise: {question}"}
            ]
        }
    ]

    # Process input
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

    # Generate prediction
    generated_ids = model.generate(**inputs, max_new_tokens=10)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Extract only the assistant's response
    predicted_answer = extract_assistant_response(generated_text)

    # Calculate metrics
    exact_match = calculate_exact_match(true_answer, predicted_answer)
    bleu_score = calculate_bleu(true_answer, predicted_answer)
    meteor_score = calculate_meteor(true_answer, predicted_answer)

    # Save results
    results.append({
        "question": question,
        "true_answer": true_answer,
        "predicted_answer": predicted_answer,
        "exact_match": exact_match,
        "bleu_score": bleu_score,
        "meteor_score": meteor_score
    })

# Convert results to a DataFrame
df_results = pd.DataFrame(results)

# Calculate global metrics
avg_exact_match = df_results['exact_match'].mean()
avg_bleu = df_results['bleu_score'].mean()
avg_meteor = df_results['meteor_score'].mean()

# Display results
print("\nVisual QA Evaluation Results:")
print(f"Exact Match: {avg_exact_match:.4f}")
print(f"BLEU Score: {avg_bleu:.4f}")
print(f"METEOR Score: {avg_meteor:.4f}")

# Display only 3 example predictions in the requested format
print("\nPrediction Examples:")
for i, row in df_results.head(50).iterrows():
    print(f"\nQuestion: {row['question']}")
    print(f"Dataset answer: {row['true_answer']}")
    print(f"Model answer: {row['predicted_answer']}")

# Save detailed results
df_results.to_csv("vqa_rad_results.csv", index=False)
print("\nResults saved in 'vqa_rad_results.csv'")