# Only needed for Colab

In [None]:
# Unpack the dataset archive stored under /content/drive/MyDrive.
!unzip /content/drive/MyDrive/roco-dataset.zip

In [None]:
# Install transformers from the main branch of its GitHub repository, plus bert-score.
!pip install git+https://github.com/huggingface/transformers.git@main
!pip install bert-score

In [None]:
# Load the training caption CSV with the default index and display it.
import pandas as pd
df = pd.read_csv("/content/all_data/train/radiologytraindata.csv")
df

In [None]:
# Re-read the same CSV indexed by its "name" column and keep only the rows
# whose index label appears in existing_train_img_names.
# NOTE(review): existing_train_img_names is only assigned in a cell further
# down, so that cell has to be executed before this one.
train_df = (
    pd.read_csv(
        "/content/all_data/train/radiologytraindata.csv",
        index_col="name",
    )
).filter(items=existing_train_img_names, axis=0)

In [None]:
# Index labels of the filtered frame as a plain list, and how many there are.
train_img_names = train_df.index.values.tolist()
len(train_img_names)

65420

In [None]:
train_img_names

In [None]:
train_df

In [None]:
# When `sample` is set, replace train_df with train_limit randomly drawn rows.
# NOTE(review): the rows are drawn from `df` (the unfiltered frame with the
# default index), not from the filtered train_df above - confirm this is intended.
sample = True
train_limit = 200
if sample:
    train_df = df.sample(train_limit)

In [None]:
df

In [None]:
# Collect the name of every file found anywhere under the training
# radiology directory.
import os
existing_train_img_names = []
for root, dirs, files in os.walk('/content/all_data/train/radiology', topdown=False):
    for name in files:
        existing_train_img_names.append(name)

In [None]:
existing_train_img_names

In [None]:
len(train_df)

200

# Main

In [None]:
from torch.utils.data import Dataset, DataLoader
import glob
import os
from PIL import Image
import pandas as pd
import torch
import time
import json
from bert_score import score
from transformers import AutoProcessor, BlipForConditionalGeneration
from torchtext.data.metrics import bleu_score
from torchmetrics.text.rouge import ROUGEScore

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Reference point for the total running time written out at the end of the run.
start_time = time.time()

# Dataset locations.
path = '/content/all_data/'
train_path = f"{path}train"
test_path = f"{path}test"
train_img_path = f"{train_path}/radiology/images"
test_img_path = f"{test_path}/radiology/images"

# Run identification and training schedule.
version = 1
epochs = 5
each_n_epoch = 2
im_size = (224, 224)
dataset_batchsize = 4
should_save_weights = True

# Limits applied when only a random sample of the data is loaded.
sample = True
train_limit = 3000
test_limit = 500

# Output folders for this run version.
model_path = f"{path}runs/{version}"
os.makedirs(f"{model_path}/results", exist_ok=True)
os.makedirs(f"{model_path}/weights", exist_ok=True)

# Names of all files found anywhere below the training image folder.
existing_train_img_names = [
    file_name
    for _root, _dirs, file_names in os.walk(train_img_path, topdown=False)
    for file_name in file_names
]

# Names of all files found anywhere below the test image folder.
existing_test_img_names = [
    file_name
    for _root, _dirs, file_names in os.walk(test_img_path, topdown=False)
    for file_name in file_names
]

"""read csv files"""
# Caption table for training, indexed by image name and restricted to the
# images that were actually found on disk.
train_df = (
    pd.read_csv(
        train_path + "/radiologytraindata.csv",
        index_col="name",
    )
).filter(items=existing_train_img_names, axis=0)

if sample:
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so never request more than are available.
    train_df = train_df.sample(
        min(train_limit, len(train_df))
    ).sort_values(by=["name"])

print(f"- read {len(train_df)} train images")

# Same procedure for the test split.
test_df = (
    pd.read_csv(
        test_path + "/radiologytestdata.csv",
        index_col="name",
    )
).filter(items=existing_test_img_names, axis=0)

if sample:
    test_df = test_df.sample(
        min(test_limit, len(test_df))
    ).sort_values(by=["name"])

print(f"- read {len(test_df)} test images")

# Image names in the same order as the rows of the corresponding frame; the
# dataset class looks captions up by position, so this order must be kept.
train_img_names = train_df.index.values.tolist()
test_img_names = test_df.index.values.tolist()

class ImageCaptioningDataset(Dataset):
    """Dataset that yields one encoded (image, caption) pair per image name.

    Args:
        folder_path: Directory that contains the image files.
        image_list: Image file names; item ``idx`` uses ``image_list[idx]``.
        dataset_df: DataFrame with a ``caption`` column. Captions are read by
            position (``iloc[idx]``), so its rows must be in the same order as
            ``image_list``.
        processor: Callable invoked with ``images=``, ``text=``, ``padding=``,
            ``truncation=``, ``max_length=`` and ``return_tensors=`` that
            returns a mapping of tensors with a leading batch dimension.
        max_length: Number of tokens every caption is padded or truncated to.
    """

    def __init__(self, folder_path, image_list, dataset_df, processor, max_length=512):
        self.folder_path = folder_path
        self.image_list = image_list
        self.processor = processor
        self.dataset_df = dataset_df
        self.max_length = max_length

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_list)

    def __getitem__(self, idx):
        """Return the processor output for item ``idx`` without batch dimension."""
        img_name = self.image_list[idx]
        img_path = self.folder_path + "/" + img_name
        # Open inside a context manager so the underlying file is closed once
        # the pixel data has been copied by convert(); otherwise one file
        # handle per sample stays open until garbage collection.
        with Image.open(img_path) as raw_img:
            # im_size is the module-level target resolution.
            img = raw_img.convert('RGB').resize(im_size)
        caption = self.dataset_df.iloc[idx]["caption"]
        # Let the tokenizer truncate: slicing the padded ids afterwards would
        # cut off the closing special token of over-long captions.
        encoding = self.processor(
            images=img,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Remove only the batch dimension; a bare squeeze() would also drop
        # any other dimension that happens to have size 1.
        return {k: v.squeeze(0) for k, v in encoding.items()}

"""Load processors and models"""
# Pretrained BLIP captioning processor and model.
processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# Training dataset and a shuffling dataloader over it.
train_dataset = ImageCaptioningDataset(
    folder_path=train_img_path,
    image_list=train_img_names,
    dataset_df=train_df,
    processor=processor,
)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=dataset_batchsize,
)

"""Caculating Metrics - BLEU-4, ROUGE, BERTScore"""
def calculate_bleu(preds, refs):
    """Compute the corpus BLEU score of predicted captions.

    Args:
        preds: Predicted captions, one string per sample.
        refs: Reference captions, one string per sample, aligned with
            ``preds``.

    Returns:
        Dict with the single key ``'bleu-4'`` holding the score.
    """
    # Whitespace tokenisation; every prediction has exactly one reference,
    # wrapped in a list because bleu_score expects a list of references.
    preds_split = [pred.split() for pred in preds]
    refs_split = [[ref.split()] for ref in refs]
    # Named bleu rather than score so the imported bert_score.score is not
    # shadowed inside this function.
    bleu = bleu_score(preds_split, refs_split)
    return {'bleu-4': bleu}

def calculate_rouge(preds, refs):
    """Return the ROUGEScore result for predictions against references.

    Each reference string is wrapped in a one-element list before it is
    handed to the metric.
    """
    wrapped_refs = [[reference] for reference in refs]
    return ROUGEScore()(preds, wrapped_refs)

def calculate_bertscore(preds, refs):
    """Average BERTScore precision, recall and F1 over all predictions.

    Returns:
        Dict with the keys ``'bert-P'``, ``'bert-R'`` and ``'bert-F1'``, each a
        Python float.
    """
    n_pairs = len(preds)
    precision, recall, f1 = score(preds, refs, lang='en', verbose=False)

    def _average(values):
        # Sum over all samples divided by the number of predictions.
        return (torch.sum(values) / n_pairs).item()

    return {
        'bert-P': _average(precision),
        'bert-R': _average(recall),
        'bert-F1': _average(f1),
    }


"""Predicting"""
def pred_img_caption(image):
    """Generate a caption for one image with the module-level model.

    The image is run through the module-level processor, moved to ``device``,
    and decoded from at most 50 generated tokens with special tokens removed.
    """
    model_inputs = processor(images=image, return_tensors="pt").to(device)
    token_ids = model.generate(
        pixel_values=model_inputs.pixel_values,
        max_length=50,
    )
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    # A single image was passed in, so the first entry is its caption.
    return decoded[0]

def predict_and_save(model, epoch):
    """Caption every test image, then save the captions and their scores.

    Two CSV files are written to ``model_path + "/results/"``, both without a
    header row: one with (image name, predicted caption) rows and one with
    (metric name, value) rows.

    Args:
        model: Accepted for the caller's convenience; the captions themselves
            come from ``pred_img_caption``, which is called with the image only.
        epoch: Epoch number used in the output file names.
    """
    # Predict a caption for every test image.
    preds = []
    for name in test_img_names:
        im_path = test_img_path + "/" + name
        # Close each image file as soon as its caption has been generated.
        with Image.open(im_path) as img:
            preds.append(pred_img_caption(img))

    filename = f"e-{epoch}-test-caption-v{version}.csv"
    caption_df = pd.DataFrame(
        data={
            "ID": test_img_names,
            "caption": preds,
        }
    )

    caption_df.to_csv(
        model_path + "/results/" + filename, index=False, header=None
    )

    # Compute scores for the test data. The locals are named *_scores so they
    # do not shadow the imported bleu_score function.
    refs = test_df['caption'].tolist()
    bleu_scores = calculate_bleu(preds, refs)
    rouge_scores = calculate_rouge(preds, refs)
    bert_scores = calculate_bertscore(preds, refs)
    all_scores = {**bleu_scores, **rouge_scores, **bert_scores}

    filename = f"e-{epoch}-test-scores-v{version}.csv"
    # Every value is a scalar, and pd.DataFrame(dict_of_scalars) raises
    # "If using all scalar values, you must pass an index". Store one
    # (metric, value) row per score instead; float() also unwraps
    # single-element tensors.
    score_df = pd.DataFrame(
        data={
            "metric": list(all_scores),
            "value": [float(value) for value in all_scores.values()],
        }
    )

    score_df.to_csv(
        model_path + "/results/" + filename, index=False, header=None
    )

"""Training"""
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.to(device)

for epoch in range(epochs):
    print("Epoch:", epoch+1)
    # The evaluation step at the end of an epoch switches the model to eval
    # mode, so train mode has to be restored at the start of every epoch.
    model.train()
    train_loss = 0.0
    for batch_num, batch in enumerate(train_dataloader, start=1):
        print("Batch: ", str(batch_num))
        torch.cuda.empty_cache()
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        attention_mask = batch.pop("attention_mask").to(device)

        # Captions are padded to a fixed length. -100 is the default
        # ignore_index of torch's cross-entropy loss, so masked padding
        # positions do not contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        attention_mask=attention_mask,
                        labels=labels)

        loss = outputs.loss
        # Weight by batch size so the epoch average is per sample.
        train_loss += loss.item() * input_ids.size(0)

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    # Compute the average loss for the training dataset
    train_loss /= len(train_dataloader.dataset)

    # Print the training loss for each epoch
    print('Epoch [{}/{}], Train Loss: {:.4f}'
          .format(epoch + 1, epochs, train_loss))

    # Evaluate after the first epoch and then every each_n_epoch epochs.
    if epoch == 0 or (epoch + 1) % each_n_epoch == 0:
        model.eval()
        predict_and_save(model, epoch + 1)
        if should_save_weights:
            weights_dir = model_path + "/weights/epoch" + str(epoch+1)
            os.makedirs(weights_dir, exist_ok=True)
            model.save_pretrained(weights_dir)




# Record how many minutes have passed since start_time, together with the
# task name and the run version.
with open(model_path + "/results/running_time.json", "w+") as f:
    f.write(
        json.dumps(
            dict(
                task="caption",
                total_running_time_mins=(time.time() - start_time) / 60,
                version=version,
            )
        )
    )


