In [1]:
from transformers import (
    AutoFeatureExtractor, 
    AutoTokenizer, 
    VisionEncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer, 
    default_data_collator,
)

from torch.utils.data import Dataset

import pandas as pd
from sklearn.model_selection import train_test_split

from pathlib import Path
from PIL import Image

2024-03-01 06:33:55.849951: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-01 06:33:55.850006: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-01 06:33:55.851443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [2]:
df2 = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')
df1 = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv')

In [3]:
images_captions_df = pd.DataFrame({'imgs': [],
                                    'captions': []})
for i in range(len(df2)):
    uid = df2.iloc[i]['uid']
    image = df2.iloc[i]['filename']
    index = df1.loc[df1['uid'] ==uid]
    
    if not index.empty:    
        index = index.index[0]
        caption = df1.iloc[index]['findings']
        if type(caption) == float:
         
            continue 
        images_captions_df = pd.concat([images_captions_df, pd.DataFrame([{'imgs': image, 'captions': caption}])], ignore_index=True)
images_captions_df.head()

Unnamed: 0,imgs,captions
0,1_IM-0001-4001.dcm.png,The cardiac silhouette and mediastinum size ar...
1,1_IM-0001-3001.dcm.png,The cardiac silhouette and mediastinum size ar...
2,2_IM-0652-1001.dcm.png,Borderline cardiomegaly. Midline sternotomy XX...
3,2_IM-0652-2001.dcm.png,Borderline cardiomegaly. Midline sternotomy XX...
4,4_IM-2050-1001.dcm.png,There are diffuse bilateral interstitial and a...


In [6]:
encoder_checkpoint = "google/vit-base-patch16-224-in21k"
decoder_checkpoint = "ahmedabdo/facebook-bart-base-finetuned"

feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

In [4]:
p = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized/'
images_captions_df['imgs'] = p+ images_captions_df['imgs']
images_captions_df.head()


Unnamed: 0,imgs,captions
0,/kaggle/input/chest-xrays-indiana-university/i...,The cardiac silhouette and mediastinum size ar...
1,/kaggle/input/chest-xrays-indiana-university/i...,The cardiac silhouette and mediastinum size ar...
2,/kaggle/input/chest-xrays-indiana-university/i...,Borderline cardiomegaly. Midline sternotomy XX...
3,/kaggle/input/chest-xrays-indiana-university/i...,Borderline cardiomegaly. Midline sternotomy XX...
4,/kaggle/input/chest-xrays-indiana-university/i...,There are diffuse bilateral interstitial and a...


In [7]:
# maximum length for the captions
max_length = 1024
sample = images_captions_df.iloc[99]

# sample image
image = Image.open(sample['imgs']).convert('RGB')
# sample caption
caption = sample['captions']

# apply feature extractor on the sample image
inputs = feature_extractor(images=image, return_tensors='pt')
# apply tokenizer
outputs = tokenizer(
            caption, 
            max_length=max_length, 
            
            padding='max_length',
            return_tensors='pt',
        )
print(len(outputs[0]))

1024


In [None]:
print(f"Inputs:\n{inputs}\nOutputs:\n{outputs}")

In [8]:
class LoadDataset(Dataset):
    def __init__(self, df):
        self.images = images_captions_df['imgs'].values
        self.captions = images_captions_df['captions'].values

    
    def __getitem__(self, idx):
        # everything to return is stored inside this dict
        inputs = dict()

        # load the image and apply feature_extractor
        image_path = str(self.images[idx])
        image = Image.open(image_path).convert("RGB")
        image = feature_extractor(images=image, return_tensors='pt')

        # load the caption and apply tokenizer
        caption = self.captions[idx]
        labels = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )['input_ids'][0]
        
        # store the inputs and labels in the dict we created
        inputs['pixel_values'] = image['pixel_values'].squeeze()   
        inputs['labels'] = labels
        return inputs
    
    def __len__(self):
        return len(self.images)

In [9]:
train_,test_df =train_test_split(images_captions_df, test_size=0.10, shuffle=True, random_state=42)

In [10]:
train_df,val_df =train_test_split(train_, test_size=0.10, shuffle=True, random_state=42)


In [11]:
print(len(train_df))
print(len(val_df))
print(len(test_df))


5239
583
647


In [12]:
train_ds = LoadDataset(train_df)
test_ds = LoadDataset(test_df)
val_ds = LoadDataset(val_df)

In [13]:
test_df.head()

Unnamed: 0,imgs,captions
2094,/kaggle/input/chest-xrays-indiana-university/i...,PA and lateral views the chest were obtained. ...
3658,/kaggle/input/chest-xrays-indiana-university/i...,Mild hypoventilation with bronchovascular crow...
4852,/kaggle/input/chest-xrays-indiana-university/i...,"The lungs are clear bilaterally. Specifically,..."
351,/kaggle/input/chest-xrays-indiana-university/i...,The heart is normal in size. The mediastinum i...
2166,/kaggle/input/chest-xrays-indiana-university/i...,No there is an dextroscoliosis of the thoracic...


In [None]:
next(iter(test_ds))
# next(iter(val_ds))

In [17]:
model = VisionEncoderDecoderModel.from_pretrained("ahmedabdo/facebook-bart-base-finetuned").to('cuda')
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# model.config.vocab_size = model.config.decoder.vocab_size
model.config.num_beams = 4

In [None]:
batch = next(iter(train_ds))
model(pixel_values=batch['pixel_values'].unsqueeze(0), labels=batch['labels'].unsqueeze(0))

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="image-caption-generator", 
    evaluation_strategy="epoch", 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,   
    learning_rate=5e-5,
    weight_decay=0.01,                  
    num_train_epochs=2,          
    save_strategy='epoch',               
    report_to='wandb', 
     logging_dir="./logs",
    logging_steps=10,
  
)

trainer = Seq2SeqTrainer(
    model=model, 
    tokenizer=feature_extractor, 
    data_collator=default_data_collator,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=training_args,
    
)

In [None]:
trainer.train()

In [None]:
index = 150
img =  Image.open(images_captions_df['imgs'][index]).convert("RGB")
features = feature_extractor(img, return_tensors="pt").pixel_values.to("cuda")
caption = tokenizer.decode(model.generate(features,max_length = 2048)[0],skip_special_tokens=True)
print("predicted caption =====>",caption)
print("actual caption =====> ", images_captions_df['captions'][index])


In [15]:
# model.push_to_hub("ahmedabdo/facebook-bart-base-finetuned")
# tokenizer.push_to_hub("ahmedabdo/facebook-bart-base-finetuned")


Non-default generation parameters: {'num_beams': 4}


model.safetensors:   0%|          | 0.00/730M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ahmedabdo/facebook-bart-base-finetuned/commit/96c14af7374a3d133bfdd2cbe14ef81df0109352', commit_message='Upload model', commit_description='', oid='96c14af7374a3d133bfdd2cbe14ef81df0109352', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
import tqdm 
predicted_captions = [] 
for i in tqdm.tqdm( val_df['imgs']):
    img =  Image.open(i).convert("RGB")
    features = feature_extractor(img, return_tensors="pt").pixel_values.to("cuda")
    caption = tokenizer.decode(model.generate(features,max_length = 1024)[0],skip_special_tokens=True)
    predicted_captions.append(caption)
print(len(predicted_captions))


100%|██████████| 583/583 [04:47<00:00,  2.03it/s]

583





In [21]:
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Assuming you have a list of predicted captions and a list of ground truth captions
generated_captions = predicted_captions
ground_truth_captions = val_df['captions'].values
# Convert the caption lists into the format expected by nltk
ground_truth_captions = [[caption.split() for caption in captions] for captions in ground_truth_captions]
generated_captions = [caption.split() for caption in generated_captions]


# Define the smoothing function to use
smoothie = SmoothingFunction().method4

# Compute the BLEU score with smoothing
weights = (0.25, 0.25, 0.25, 0.25)  # equal weights for 1-4 gram BLEU scores
score = corpus_bleu(ground_truth_captions, predicted_captions,weights =weights)
print(f'The BELU Score Is: {score}')

The BELU Score Is: 0.6084233073777816
