In [1]:
%pip install transformers torch datasets pandas

Collecting transformers
  Using cached transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Using cached torch-2.5.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.2.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Using cached reques

In [8]:
from transformers import (
	AutoImageProcessor,
	ResNetForImageClassification,
	GPT2LMHeadModel,
	AutoTokenizer,
	DataCollatorWithPadding,
)
import torch
from datasets import Dataset
#from PIL import Image
from torch.utils.data import DataLoader, Subset
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import requests
from io import BytesIO


# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
def group_images(data,df_image):
	"""
	Group images by UID
	"""
	grouped_imgs = []
	j = 0
	for i in range(len(data)):
		if not grouped_imgs:
			grouped_imgs.append({'imgs': data[i]['imgs'], 'captions': data[i]['captions']})
		else:
			if df_image.iloc[i]['uid'] == df_image.iloc[i - 1]['uid']:
				grouped_imgs[j]['imgs']+=','+data[i]['imgs']
			else:
				grouped_imgs.append({'imgs': data[i]['imgs'], 'captions': data[i]['captions']})
				j += 1
	return grouped_imgs

In [None]:
# Load the CSV files into pandas DataFrames
df_image = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')
df_report = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv')
# Create a DataFrame for images and captions


data = []
for i in range(len(df_image)):
	uid = df_image.iloc[i]['uid']
	image = df_image.iloc[i]['filename']
	index = df_report.loc[df_report['uid'] == uid]
	
	if not index.empty:    
		index = index.index[0]
		caption = df_report.iloc[index]['findings']
		if isinstance(caption, float):  # Skip rows with missing captions
			continue
		data.append({'imgs': image, 'captions': caption})
print(data[0]['imgs'])

data = group_images(data,df_image)

# Update image paths
loc = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized/'
for i in range(len(data)):
	for j in range(len(data[i]['imgs'])):
		data[i]['imgs'][j] = loc + data[i]['imgs'][j]

# Convert to a DataFrame
df = pd.DataFrame(data)

# Convert pandas DataFrame to a Dataset object
#dataset = Dataset.from_pandas(df)

df_image :
column uid of type <class 'str'>
column filename of type <class 'str'>
column projection of type <class 'str'>
df_report :
column uid of type <class 'str'>
column MeSH of type <class 'str'>
column Problems of type <class 'str'>
column image of type <class 'str'>
column indication of type <class 'str'>
column comparison of type <class 'str'>
column findings of type <class 'str'>
column impression of type <class 'str'>
1_IM-0001-4001.dcm.png


# Loading pretrained models

In [35]:
# Load ResNet-50 for feature extraction (frozen)
resnet_model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50").to(device)
resnet_model.eval()  # We won't train the ResNet, just use it for feature extraction

# Load GPT-2 for language generation
gpt2_model_name = "gpt2"  # or "distilgpt2" for a lighter version
tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name)
# GPT-2 doesn't have a pad token by default, let's assign one:
tokenizer.pad_token = tokenizer.eos_token



# Preprocessing data

In [36]:
# Processor for ResNet images
image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
def preprocess_images_and_captions(example):
	black_image = Image.new('RGB', (224, 224), color = 'black')
	#open the images in example['imgs'] and concatenate them
	
	total_width = 224*5
	total_height = 224
	image = Image.new('RGB', (total_width, total_height))
	x_offset = 0
	#split example['imgs'] at each comma and put the images in the images list
	images = example['imgs'].split(',')
	for i in example['imgs']:
		img = Image.open(i)
		img = img.resize((224, 224))
		image.paste(img, (x_offset, 0))
		x_offset += 224
	for i in range(5 - len(example['imgs'])):
		image.paste(black_image, (x_offset, 0))
		x_offset += 224
	

	# Process the image
	image = image.convert("L")  # Convert to grayscale
	image = Image.merge("RGB", [image, image, image])  # Convert grayscale to RGB
	image_inputs = image_processor(image, return_tensors="pt")
	pixel_values = image_inputs["pixel_values"].squeeze(0)  # Shape [3, 224, 224]

	# Tokenize the caption
	text_inputs = tokenizer(
		example["captions"],
		truncation=True,
		max_length=32,  # Adjust as needed
		return_tensors="pt"
	)

	return {
		"pixel_values": pixel_values.tolist(),  # Convert tensor to list
		"input_ids": text_inputs["input_ids"].squeeze(0).tolist(),  # Convert tensor to list
		"attention_mask": text_inputs["attention_mask"].squeeze(0).tolist(),  # Convert tensor to list
	}

In [37]:
split = dataset.train_test_split(test_size=0.1, seed=42)  # 60% train, 40% test

training_dataset = split['train']
testing_dataset = split['test']

training_dataset = training_dataset.map(preprocess_images_and_captions)
testing_dataset = testing_dataset.map(preprocess_images_and_captions)

Map:   0%|          | 0/5822 [00:00<?, ? examples/s]

Map:   0%|          | 0/647 [00:00<?, ? examples/s]

In [38]:
text_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

def combined_collate_fn(batch):
	# Convert lists back to tensors
	pixel_values_list = [torch.tensor(item["pixel_values"]) for item in batch]
	input_ids_list = [torch.tensor(item["input_ids"]) for item in batch]
	attention_mask_list = [torch.tensor(item["attention_mask"]) for item in batch]

	# Debugging step
	for i, pv in enumerate(pixel_values_list):
		print(f"Sample {i}: type={type(pv)}, shape={pv.shape}")

	# Stack pixel values into a single tensor
	pixel_values = torch.stack(pixel_values_list, dim=0)  # [batch_size, 3, 224, 224]

	# Use Hugging Face DataCollatorWithPadding for tokenized text
	text_batch = {
		"input_ids": input_ids_list,
		"attention_mask": attention_mask_list,
	}
	text_batch = text_collator(text_batch)

	# Add pixel values to the text batch
	text_batch["pixel_values"] = pixel_values
	return text_batch

dataloader = DataLoader(
	training_dataset,
	batch_size=10,         # only 2 samples in this example
	shuffle=True,
	collate_fn=combined_collate_fn,
	drop_last=True,      # can be True if you have many samples
)

In [39]:
class FeatureToCaption(nn.Module):
	"""
	We:
	  - Extract features from ResNet (outside this class, in the training loop, frozen)
	  - Project them to GPT-2 hidden dim
	  - Sum them with the GPT-2 token embeddings
	"""
	def __init__(self, feature_dim=2048, hidden_dim=768, gpt2_name="gpt2"):
		super().__init__()
		self.linear = nn.Linear(feature_dim, hidden_dim)
		self.llm = GPT2LMHeadModel.from_pretrained(gpt2_name)
		# Because GPT-2 doesn't define pad_token by default
		self.llm.config.pad_token_id = tokenizer.eos_token_id

	def forward(self, resnet_features, input_ids, attention_mask):
		"""
		resnet_features: [batch_size, feature_dim]
		input_ids:       [batch_size, seq_len]
		attention_mask:  [batch_size, seq_len]
		"""
		# 1) Project the ResNet features to GPT-2 hidden size
		#    shape: [batch_size, hidden_dim]
		projected = self.linear(resnet_features)

		# 2) Expand them along seq_len dimension
		#    shape: [batch_size, 1, hidden_dim] -> [batch_size, seq_len, hidden_dim]
		batch_size, seq_len = input_ids.shape
		projected = projected.unsqueeze(1).expand(batch_size, seq_len, -1)

		# 3) GPT-2 token embeddings
		#    shape: [batch_size, seq_len, hidden_dim]
		token_embeds = self.llm.transformer.wte(input_ids)

		# 4) Sum them (the simplest approach)
		inputs_embeds = token_embeds + projected

		# 5) Forward pass through GPT-2
		outputs = self.llm(
			inputs_embeds=inputs_embeds,
			attention_mask=attention_mask,
			labels=input_ids,  # for CrossEntropyLoss
		)
		return outputs


In [40]:
# Instantiate our feature-to-caption model
model = FeatureToCaption(gpt2_name=gpt2_model_name).to(device)

################################################################################
# Training Loop (Minimal Example)
################################################################################
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
epochs = 3

for epoch in range(1, epochs + 1):
	model.train()
	total_loss = 0.0

	for batch in dataloader:
		# batch has "pixel_values", "input_ids", "attention_mask"
		pixel_values = batch["pixel_values"].to(device)      # [batch_size, 3, 224, 224]
		input_ids = batch["input_ids"].to(device)            # [batch_size, seq_len]
		attention_mask = batch["attention_mask"].to(device)  # [batch_size, seq_len]

		# -------------------- Freeze ResNet & Extract Features -------------------
		with torch.no_grad():
			# 1) Embeddings
			emb_out = resnet_model.resnet.embedder(pixel_values)
			# 2) Encoder
			enc_out = resnet_model.resnet.encoder(emb_out)
			# 3) Pool & Flatten -> shape: [batch_size, 2048]
			pooled_features = resnet_model.resnet.pooler(enc_out.last_hidden_state).flatten(1)
		# -------------------------------------------------------------------------

		# Forward pass
		outputs = model(pooled_features, input_ids, attention_mask)
		loss = outputs.loss

		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		total_loss += loss.item()

	avg_loss = total_loss / len(dataloader)
	print(f"Epoch {epoch}/{epochs}, Loss: {avg_loss:.4f}")

Sample 0: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 1: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 2: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 3: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 4: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 5: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 6: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 7: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 8: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 9: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 0: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 1: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 2: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample 3: type=<class 'torch.Tensor'>, shape=torch.Size([3, 224, 224])
Sample

In [51]:
#torch.save(model, "/kaggle/working/v2.pt")

# ---- Loading ----
#model = torch.load("/kaggle/working/v1.pt")
#model.eval()


v1.pt


## Testing the model

In [42]:
model.eval()

for sample in training_dataset:
	with torch.no_grad():
		# Convert pixel_values back to tensor
		pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0).to(device)  # Add batch dimension
		#print(pixel_values.shape)  # Debugging
	
		# Convert input_ids and attention_mask to tensors
		input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(device)
		attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device)
	
		# ResNet Features
		emb_out = resnet_model.resnet.embedder(pixel_values)
		enc_out = resnet_model.resnet.encoder(emb_out)
		pooled_features = resnet_model.resnet.pooler(enc_out.last_hidden_state).flatten(1)
	
		# Generate a caption
		outputs = model.llm.generate(
            inputs_embeds=(model.linear(pooled_features).unsqueeze(1) + model.llm.transformer.wte(input_ids)),
            attention_mask=attention_mask,
            max_length=100,
            num_beams=2,
            temperature = 1.2,
            top_k = 50,
            top_p = 0.9
            do_sample = True
            repetition_penalty = True
            no_repeat_ngram_size = 3
        )
	
	
	generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	print("Generated caption:", generated_text)
	print("Original caption:", sample["captions"])
	
	
	generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	print("Generated caption:", generated_text)
	print("Original caption:", sample["captions"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated caption: 
Original caption: The heart is normal in size. The mediastinum is unremarkable. The lungs are clear.
Generated caption: 
Original caption: The heart is normal in size. The mediastinum is unremarkable. The lungs are clear.
Generated caption: 
Original caption: Lungs are clear. Heart is normal size. Trachea is midline. No pneumothorax. No large pleural effusion.
Generated caption: 
Original caption: Lungs are clear. Heart is normal size. Trachea is midline. No pneumothorax. No large pleural effusion.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated caption: iosis of the thoracic spine. Cardiomediastinal silhouette is within normal limits. No focal consolidation. No pneumothorax or pleural effusion.
Original caption: The lung volumes are low. However as compared to the prior study, there are increased perihilar opacities bilaterally. Stable dextroscoliosis of the lower thoracic spine with limited evaluation of the spinal XXXX. The tracheostomy tube is in unchanged position.
Generated caption: iosis of the thoracic spine. Cardiomediastinal silhouette is within normal limits. No focal consolidation. No pneumothorax or pleural effusion.
Original caption: The lung volumes are low. However as compared to the prior study, there are increased perihilar opacities bilaterally. Stable dextroscoliosis of the lower thoracic spine with limited evaluation of the spinal XXXX. The tracheostomy tube is in unchanged position.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated caption: orax or large pleural effusion.
Original caption: The cardiac and mediastinal contours are within normal limits. Lungs are well-inflated and clear. There is no focal consolidation, pneumothorax or effusion. No acute bony abnormalities are seen.
Generated caption: orax or large pleural effusion.
Original caption: The cardiac and mediastinal contours are within normal limits. Lungs are well-inflated and clear. There is no focal consolidation, pneumothorax or effusion. No acute bony abnormalities are seen.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated caption: ting. The hilar contour is within normal limits.
Original caption: The heart size is normal. Lungs are clear. There is no pleural line to suggest pneumothorax or costophrenic XXXX blunting to suggest large pleural effusion. Bony structures are within normal limits.
Generated caption: ting. The hilar contour is within normal limits.
Original caption: The heart size is normal. Lungs are clear. There is no pleural line to suggest pneumothorax or costophrenic XXXX blunting to suggest large pleural effusion. Bony structures are within normal limits.


KeyboardInterrupt: 

# COMPUTE EVALUATION METRICS

## blue score

In [None]:
!pip install sacrebleu


from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import sacrebleu
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

model.eval()

smooth_fn = SmoothingFunction().method1  # A smoothing function to avoid zero scores on short sentences

for sample in training_dataset:
	with torch.no_grad():
		# Convert pixel_values back to tensor
		pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0).to(device)
		
		# Convert input_ids and attention_mask to tensors
		input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(device)
		attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device)
	
		# ResNet Features
		emb_out = resnet_model.resnet.embedder(pixel_values)
		enc_out = resnet_model.resnet.encoder(emb_out)
		pooled_features = resnet_model.resnet.pooler(enc_out.last_hidden_state).flatten(1)
	
		# Generate a caption
		outputs = model.llm.generate(
            inputs_embeds=(model.linear(pooled_features).unsqueeze(1) + model.llm.transformer.wte(input_ids)),
            attention_mask=attention_mask,
            max_length=100,
            num_beams=2,
            temperature = 1.2,
            top_k = 50,
            top_p = 0.9
            do_sample = True
            repetition_penalty = True
            no_repeat_ngram_size = 3
        )
	
	generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	
	# Print for debugging
	print("Generated caption:", generated_text)
	print("Original caption:", sample["captions"])
	
	# ---- Compute BLEU for this single example ----
	# Assume 'sample["captions"]' is a string (single reference). 
	# If you have multiple references, see below for how to handle them.
	reference_str = sample["captions"]   # e.g. "A dog playing with a ball."
	hypothesis_str = generated_text      # e.g. "A dog is playing with a toy."

	# Tokenize
	reference_tokens = word_tokenize(reference_str.lower())   # or use a custom tokenizer
	hypothesis_tokens = word_tokenize(hypothesis_str.lower())
	
	# NLTK’s sentence_bleu expects a list of reference lists.
	# If you have exactly one reference per sample, do: [[reference_tokens]]
	score = sentence_bleu([reference_tokens], 
						  hypothesis_tokens, 
						  smoothing_function=smooth_fn)
	
	print("BLEU (sentence-level):", score)
	print("------------------------------------")


In [64]:
#!pip install git+https://github.com/salaniz/pycocoevalcap.git
evalRefs = []
evalHyps = []

model.eval()
i=0
for idx, sample in enumerate(training_dataset):
	i+=1
	if i == 100:
		break
	with torch.no_grad():
		# Convert pixel_values back to tensor
		pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0).to(device)
		
		# Convert input_ids and attention_mask to tensors
		input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(device)
		attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device)
	
		# ResNet Features
		emb_out = resnet_model.resnet.embedder(pixel_values)
		enc_out = resnet_model.resnet.encoder(emb_out)
		pooled_features = resnet_model.resnet.pooler(enc_out.last_hidden_state).flatten(1)
	
		# Generate a caption
		outputs = model.llm.generate(
			inputs_embeds=(model.linear(pooled_features).unsqueeze(1) + model.llm.transformer.wte(input_ids)),
			attention_mask=attention_mask,
			max_length=100,
			#num_beams=2,
		)
	generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	
	# Convert references into the required format
	# If sample["captions"] is a single string:
	if isinstance(sample["captions"], str):
		gt_captions = [sample["captions"]]
	else:
		gt_captions = sample["captions"]
	
	# Append references
	# pycocoevalcap expects something like:
	# {"image_id": <id>, "caption": "some reference caption"}
	for ref in gt_captions:
		evalRefs.append({
			"image_id": idx,
			"caption": ref
		})

	# Append hypothesis
	evalHyps.append({
		"image_id": idx,
		"caption": generated_text
	})

print(f"Number of references: {len(evalRefs)}")
print(f"Number of hypotheses: {len(evalHyps)}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Number of references: 99
Number of hypotheses: 99


In [65]:
from collections import defaultdict

from pycocoevalcap.cider.cider import Cider
gts = defaultdict(list)
res = {}

for ref in evalRefs:
	gts[ref["image_id"]].append(ref["caption"])

for hyp in evalHyps:
	res[hyp["image_id"]] = [hyp["caption"]]  # Wrap in a list to match the expected format

# Initialize CIDEr scorer
scorer = Cider()

# Compute CIDEr score
score, detailed_scores = scorer.compute_score(gts, res)

print("CIDEr Score:", score)  # Average CIDEr score
print("Detailed Scores:", detailed_scores)  # Per-sample scores

scorer = Cider()
score, detailed_scores = scorer.compute_score(evalRefs, evalHyps)
print("CIDEr Score:", score)


CIDEr Score: 0.023720091829294243
Detailed Scores: [0.00000000e+00 0.00000000e+00 6.32192661e-05 5.23238530e-05
 8.05818032e-05 0.00000000e+00 3.88520157e-05 0.00000000e+00
 5.93645003e-02 4.96127766e-01 5.01823233e-05 1.20623117e-06
 0.00000000e+00 7.99413526e-03 5.07286322e-09 0.00000000e+00
 0.00000000e+00 9.64600757e-09 1.51171395e-03 7.77154223e-06
 1.72081239e-03 1.39495444e-02 6.47087590e-05 1.16434181e-03
 1.17052897e-08 8.75847534e-04 0.00000000e+00 3.88520157e-05
 6.34380824e-04 1.50955701e-07 0.00000000e+00 0.00000000e+00
 5.95692137e-05 1.08866593e-10 1.69502092e-02 0.00000000e+00
 5.93645003e-02 0.00000000e+00 8.31819418e-06 3.27386336e-01
 2.57288028e-05 3.31297196e-15 2.50880636e-05 0.00000000e+00
 0.00000000e+00 2.49080133e-02 0.00000000e+00 1.04516142e-06
 4.51113166e-04 0.00000000e+00 1.74610936e-02 2.33294117e-02
 7.90068188e-05 9.54342584e-07 2.65577004e-21 0.00000000e+00
 2.31736432e-14 3.87880943e-03 7.76381966e-06 1.45739257e-04
 1.38748457e-03 4.32461606e-01 4.3

AttributeError: 'list' object has no attribute 'keys'