# **Training a vit-gpt2-image-captioning Model using Hugging Face Transformers**

### **1. Setting Up the Environment**

In [None]:
# Install necessary libraries
!pip install transformers datasets torch torchvision pandas

# Mount Google Drive to access your files
from google.colab import drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1

### **2. Loading the Data**

In [None]:
import os
import pandas as pd
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split

# Set the path to your images in Google Drive
image_dir = '/content/drive/MyDrive/Alzahraa Hospital Chest  pictures'  # Update with the correct folder name

# Fail here, with a readable message, rather than on the first image read during training.
assert os.path.isdir(image_dir), f"Image folder not found (is Drive mounted?): {image_dir}"

# Load the Excel file containing the image metadata and reports
file_path = '/content/reportt.xlsx'  # Update this with the correct path
data = pd.read_excel(file_path)

# The dataset class reads an identifier column and a report column, so an empty
# or single-column sheet cannot be used for training.
assert len(data) > 0, f"No rows were read from {file_path}"
assert data.shape[1] >= 2, f"Expected at least two columns in {file_path}, got {data.shape[1]}"

# Show the first few rows to verify the data is loaded correctly
# (a bare last expression gives the rich table rendering instead of wrapped text).
data.head()

                                          Patient ID Accession Number  \
0  1.3.51.0.7.13031837681.17781.37192.46305.60232...         22I12576   
1  1.3.51.0.7.11215356896.1180.21838.43541.61959....           23E493   
2  1.3.51.0.7.3534171348.37436.12098.39027.22592....           23E594   
3  1.3.51.0.7.1320323117.31878.7242.37241.35655.1...           23E595   
4  1.3.51.0.7.14256674585.45540.35147.40048.59820...           23E615   

  Study Date gender                                            Reports  \
0 2023-01-25      m  canule trachéale en place.\ninfiltrats interst...   
1 2023-01-20      f  Observation:\nprotheses humerales bilateral.\n...   
2 2023-01-21      m  assymetrie des surfaces pulmonaires.\ninfiltra...   
3 2023-01-21      m  introduction de ST pour detress .\nsurfaces sy...   
4 2023-01-22      m  assymetrie des surfaces pulmonaires.\naorte de...   

                                        Main Reports  \
0  canule trachéale en place. infiltrats intersti...   
1  O

### **3. Defining the Custom Dataset Class**

In [None]:
# Define a custom PyTorch Dataset class to handle image and report data
class ChestXrayDataset(Dataset):
    """Pairs each image on disk with the report text stored in a DataFrame.

    Args:
        data: DataFrame with one row per sample. It must contain a column whose
            value is the image file name without extension, and a column with
            the report text.
        image_dir: Directory where the image files are stored.
        transform: Optional callable applied to the loaded PIL image.
        id_column: Column holding the image identifier. An ``int`` is treated
            as a position, a ``str`` as a column label. Defaults to ``1``
            (the second column, i.e. the Accession Number in this sheet).
        report_column: Column holding the report text, same int/str convention.
            Defaults to ``-1`` (the last column).
        image_ext: File extension appended to the identifier. Defaults to ``'.jpg'``.

    Each item is an ``(image, report)`` tuple.
    """

    def __init__(self, data, image_dir, transform=None,
                 id_column=1, report_column=-1, image_ext='.jpg'):
        self.data = data  # DataFrame containing image metadata and reports
        self.image_dir = image_dir  # Directory where images are stored
        self.transform = transform  # Optional transformations for images
        self.id_column = id_column  # Where to find the image identifier
        self.report_column = report_column  # Where to find the report text
        self.image_ext = image_ext  # Extension of the image files

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.data)

    def _cell(self, idx, column):
        """Return the value at row ``idx`` of ``column`` (position or label)."""
        if isinstance(column, str):
            # Translate a label into a position so a single lookup path is used.
            column = self.data.columns.get_loc(column)
        return self.data.iloc[idx, column]

    def _image_path(self, idx):
        """Build the full path of the image file that belongs to row ``idx``."""
        file_name = str(self._cell(idx, self.id_column)) + self.image_ext
        return os.path.join(self.image_dir, file_name)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return it with its report."""
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the RGB image.
        with Image.open(self._image_path(idx)) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)  # Apply transformations if any

        # The report is returned as stored; a missing cell is passed through unchanged.
        report = self._cell(idx, self.report_column)

        return image, report  # Return image and report as a tuple

### **4. Image Preprocessing and Dataset Preparation**

In [None]:
import torch  # only needed here for the seeded generator used by random_split

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224
    transforms.ToTensor()  # Convert images to PyTorch tensors (pixel values scaled to [0, 1])
])

# Initialize the dataset with the defined transformations
dataset = ChestXrayDataset(data=data, image_dir=image_dir, transform=transform)

# Define the train and validation split (80% training, 20% validation)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A fixed seed makes the split identical on every run, so validation numbers
# from different runs are measured on the same held-out samples.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Initialize DataLoaders for both the training and validation datasets
# (handy for manual iteration; the trainer further down is given the datasets directly).
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)


### **5. Loading the Pre-trained Model and Tokenizer**

In [None]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

# Pre-trained ViT encoder + GPT-2 decoder published as nlpconnect/vit-gpt2-image-captioning
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Keep the image encoder fixed: none of its parameters will receive gradients
model.encoder.requires_grad_(False)

# Matching tokenizer for the report text
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Matching image feature extractor; rescaling is switched off because the
# dataset transform already converts images to tensors with ToTensor()
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning",
    do_rescale=False,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



### **6. Defining Training Arguments and Collate Function**

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hyper-parameters and bookkeeping options handed to Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    # --- where artefacts are written ---
    output_dir="./results",  # checkpoints and training results
    logging_dir="./logs",  # log files
    # --- batch sizing ---
    per_device_train_batch_size=2,  # samples per device in one forward pass (training)
    per_device_eval_batch_size=2,  # samples per device in one forward pass (evaluation)
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer update on each device
    # --- schedule ---
    num_train_epochs=3,  # passes over the training set
    logging_steps=10,  # report the loss every 10 steps
    save_steps=10,  # write a checkpoint every 10 steps
    save_total_limit=2,  # keep at most two checkpoints on disk
    evaluation_strategy="steps",  # evaluate on a step schedule rather than per epoch
    # --- miscellaneous ---
    fp16=True,  # mixed precision training
    remove_unused_columns=False,  # pass samples to the collate function untouched
)

# Define a collate function to process batches of data
def collate_fn(batch):
    """Turn a list of ``(image, report)`` samples into model inputs.

    Args:
        batch: Sequence of ``(image, report)`` tuples as produced by the dataset.

    Returns:
        dict with
        ``pixel_values`` - image tensor produced by ``feature_extractor``;
        ``labels`` - token ids of the reports, with every padding position set
        to ``-100`` so it is ignored by the loss.
    """
    images, reports = zip(*batch)  # Unpack the batch into images and reports
    pixel_values = feature_extractor(images, return_tensors="pt").pixel_values

    # Ensure all reports are strings and terminate each one with the EOS token.
    # The tokenizer is not relied on to add it, and without an EOS target the
    # decoder never learns where a report ends.
    reports = [str(report) + tokenizer.eos_token for report in reports]

    # Pad only up to the longest report in the batch instead of the tokenizer's
    # maximum length; the extra padding carried no information.
    inputs = tokenizer(reports, padding="longest", truncation=True, return_tensors="pt")

    # The pad token of this tokenizer is the same id as EOS, so padding cannot be
    # recognised by token id. Use the attention mask instead: positions it marks
    # as padding become -100 (ignored by the loss), while the real EOS appended
    # above stays a training target. Previously padding was scored like real
    # text and dominated the loss.
    labels = inputs.input_ids.masked_fill(inputs.attention_mask == 0, -100)

    return {"pixel_values": pixel_values, "labels": labels}  # Return pixel values and labels



### **7. Initializing and Training the Model**

In [None]:
# Wire the model, its training configuration, both data splits and the
# batching function into a Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=collate_fn,  # builds pixel_values / labels for every batch
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop; the returned summary is shown as the cell output
trainer.train()


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss
10,2.1397,0.152108
20,0.1555,0.111942
30,0.1159,0.092169
40,0.0965,0.081771
50,0.0911,0.074713
60,0.0806,0.070203
70,0.0697,0.06659


Step,Training Loss,Validation Loss
10,2.1397,0.152108
20,0.1555,0.111942
30,0.1159,0.092169
40,0.0965,0.081771
50,0.0911,0.074713
60,0.0806,0.070203
70,0.0697,0.06659
80,0.0716,0.063174
90,0.0693,0.060711
100,0.0652,0.059461


TrainOutput(global_step=363, training_loss=0.12031747737370903, metrics={'train_runtime': 4849.9428, 'train_samples_per_second': 0.602, 'train_steps_per_second': 0.075, 'total_flos': 5.240671398128517e+17, 'train_loss': 0.12031747737370903, 'epoch': 2.9815195071868583})

### **8. Evaluating and Saving the Model**

In [None]:
# Evaluate the model on the validation dataset
results = trainer.evaluate()
print(results)  # Print the evaluation results

# Save everything needed for inference to Google Drive, not just the weights:
# the tokenizer and the feature extractor are stored next to the model so the
# folder can be loaded on its own later.
save_dir = "/content/drive/MyDrive/saved_modelll"
tokenizer.save_pretrained(save_dir)
feature_extractor.save_pretrained(save_dir)
# Saved last on purpose: this call returns nothing, so the cell shows no stray
# list of file names as its output.
model.save_pretrained(save_dir)


{'eval_loss': 0.047488681972026825, 'eval_runtime': 54.2679, 'eval_samples_per_second': 4.496, 'eval_steps_per_second': 2.248, 'epoch': 2.9815195071868583}


### **9. Loading and Testing the Trained Model**

In [None]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor
from PIL import Image
import torch

# Text tokenizer and image feature extractor come from the original Hub checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning",
    do_rescale=False,
)
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Fine-tuned weights are read back from Google Drive and the model is put
# straight into evaluation mode
model = VisionEncoderDecoderModel.from_pretrained("/content/drive/MyDrive/saved_modelll").eval()

# Define the image preprocessing function
def preprocess_image(image_path):
    """Load an image file and convert it to the model's ``pixel_values`` tensor.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The ``pixel_values`` tensor produced by ``feature_extractor`` for this
        single image.
    """
    # Release the file handle once the pixels are copied into the RGB image.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # `feature_extractor` was loaded with do_rescale=False, which is only right
    # for inputs already scaled to [0, 1] (as in training, where ToTensor() runs
    # first). Here the input is a raw PIL image with 0-255 pixel values, so
    # rescaling is re-enabled for this call; otherwise the model receives
    # values far outside the range it was trained on.
    return feature_extractor(images=image, do_rescale=True, return_tensors="pt").pixel_values

# Define the function to generate a caption for an image
def generate_caption(image_path, max_length=50, num_beams=4):
    """Generate a text report for one image with the loaded model.

    Args:
        image_path: Path of the image file to caption.
        max_length: Upper bound on the generated sequence length in tokens.
            Defaults to 50; raise it if reports come out cut off mid-sentence.
        num_beams: Beam width used for beam search. Defaults to 4.

    Returns:
        The decoded caption as a string, with special tokens removed.
    """
    # Pick the device once and use it for both the model and the input.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    pixel_values = preprocess_image(image_path).to(device)

    with torch.no_grad():  # Disable gradient computation for inference
        output_ids = model.generate(
            pixel_values,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )

    # Only the first (and only) sequence in the batch is decoded.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Try the fine-tuned model on one image from the Drive folder
image_path = '/content/drive/MyDrive/Alzahraa Hospital Chest  pictures/22E7326.jpg'  # Update with your test image path

# Run the captioning pipeline on that file
caption = generate_caption(image_path=image_path)

# Show the text the model produced
print("Generated Caption:", caption)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Caption: The heart is normal in size and shape. Lungs are clear showing no nodular or infiltrative changes. Hila and mediastinum are normal. The pleural space is free, rib cage is intact, rib cage is intact,


# SAME REPORT IS GENERATED FOR ANY IMAGE