## **Required Installation**

In [1]:
!rm -rf /kaggle/working/*

In [33]:
!pip install transformers evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


## **Necessary Imports**

In [3]:
import os
import torch
import numpy as np
import pandas as pd
from PIL import Image
from torch.optim import AdamW
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from torchvision.transforms import v2
from torch.utils.data import DataLoader
from transformers import EvalPrediction
from distutils.dir_util import copy_tree
from transformers import GPT2TokenizerFast
from transformers import ViTImageProcessor
from torch.utils.tensorboard import SummaryWriter
from transformers import VisionEncoderDecoderModel
from sklearn.model_selection import train_test_split

2024-03-11 10:40:24.320269: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 10:40:24.320362: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 10:40:24.480237: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## **Initalization of the directory**

In [4]:
%cd /kaggle/input/artwork-title-generator

/kaggle/input/artwork-title-generator


## **Fetching sub-folders**

In [5]:
img_folders = os.listdir()
img_folders.remove('csvFiles')
img_folders.remove('cloud_mask')
img_folders.remove('artwork_title_generator_data.csv')
img_folders.remove('artwork_title_generator_data_noise_removed.csv')
img_folders

['Italy',
 'Japan',
 'UnitedStates',
 'France',
 'Netherlands',
 'China',
 'Germany',
 'England']

## **Copying files to the working directory**

In [6]:
input_path = "/kaggle/input/artwork-title-generator"
output_path = "/kaggle/working/images/"

for folder in img_folders:
    copy_tree(input_path+'/'+folder, output_path+folder)

## **Reading data file**

In [7]:
df = pd.read_csv("artwork_title_generator_data_noise_removed.csv")
df = df.drop(columns=['ids'], axis=1)

## **Viewing dataframe**

In [8]:
df.head()

Unnamed: 0,titles,images_path
0,starry night astronauts,/kaggle/working/images/UnitedStates/UnitedStat...
1,mansions,/kaggle/working/images/UnitedStates/UnitedStat...
2,city landscape,/kaggle/working/images/UnitedStates/UnitedStat...
3,hero construction,/kaggle/working/images/UnitedStates/UnitedStat...
4,nighthawks,/kaggle/working/images/UnitedStates/UnitedStat...


## **Dataframe shape**

In [9]:
df.shape

(33222, 2)

In [10]:
%cd /kaggle/working/

/kaggle/working


## **Model**

In [11]:
encoder_model = 'microsoft/swin-base-patch4-window7-224-in22k'
decoder_model = 'gpt2'

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model, decoder_model
)

config.json:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## **Tokenizer**

In [12]:
tokenizer = GPT2TokenizerFast.from_pretrained(decoder_model)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
if "gpt2" in decoder_model:
  # gpt2 does not have decoder_start_token_id and pad_token_id
  # but has bos_token_id and eos_token_id
  tokenizer.pad_token = tokenizer.eos_token # pad_token_id as eos_token_id
  model.config.eos_token_id = tokenizer.eos_token_id
  model.config.pad_token_id = tokenizer.pad_token_id
  # set decoder_start_token_id as bos_token_id
  model.config.decoder_start_token_id = tokenizer.bos_token_id
else:
  # set the decoder start token id to the CLS token id of the tokenizer
  model.config.decoder_start_token_id = tokenizer.cls_token_id
  # set the pad token id to the pad token id of the tokenizer
  model.config.pad_token_id = tokenizer.pad_token_id

## **Processor**

In [14]:
processor = ViTImageProcessor.from_pretrained(encoder_model)

preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

## **Device: cuda or cpu**

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## **Custom Dataset Class**

In [16]:
class ArtworkTitleGeneratorDataset(Dataset):
  def __init__(self, df):
    self.df = df.sample(frac = 1, random_state = 42)

  def __len__(self):
    return len(self.df)

  def __preprocess__(self, image_path):
    image = Image.open(image_path)
    transforms = v2.Compose([
        v2.Resize(size=(224,224)),
        v2.ToDtype(torch.float32, scale = True)
    ])
    image = transforms(image)
    img = processor(image, return_tensors = 'pt')
    return img

  def __getitem__(self, index):
    image_path = self.df.iloc[index]['images_path']
    image = self.__preprocess__(image_path)
    title = self.df.iloc[index]['titles']
    targets = tokenizer(
        title,
        max_length = 32,
        padding = 'max_length',
        truncation = True,
        return_tensors = 'pt'
    )
    
    return {
        'pixel_values': image['pixel_values'].squeeze(),
        'texts': targets['input_ids'].squeeze()
    }

## **Creating dataset**

In [17]:
dataset = ArtworkTitleGeneratorDataset(df)

## **Viewing a sample data**

In [18]:
dataset[2500]

{'pixel_values': tensor([[[-1.8782, -1.8097, -1.8268,  ..., -1.8268, -1.7754, -1.8439],
          [-1.8439, -1.7412, -1.4329,  ..., -1.8097, -1.8268, -1.8268],
          [-1.8782, -1.6213,  0.3309,  ..., -1.7754, -1.7583, -1.8268],
          ...,
          [-1.7069, -1.1932, -1.0562,  ...,  1.8893,  0.3481, -1.7412],
          [-1.8097, -1.7240, -1.7754,  ..., -0.8164, -1.1589, -1.7583],
          [-1.8782, -1.8782, -1.8610,  ..., -1.8268, -1.8097, -1.8268]],
 
         [[-1.8431, -1.7556, -1.8081,  ..., -1.7906, -1.7206, -1.7906],
          [-1.7906, -1.7906, -1.5805,  ..., -1.7556, -1.7731, -1.7906],
          [-1.7906, -1.6856,  0.2052,  ..., -1.7556, -1.7206, -1.7731],
          ...,
          [-1.7381, -1.2479, -1.1078,  ...,  2.0609,  0.4678, -1.6856],
          [-1.7556, -1.6506, -1.7206,  ..., -0.7402, -1.1253, -1.7381],
          [-1.7731, -1.7206, -1.7206,  ..., -1.7031, -1.7206, -1.7556]],
 
         [[-1.4733, -1.4384, -1.5081,  ..., -1.4907, -1.4210, -1.5081],
          [-

## **Splitting data**

In [19]:
train_ds, val_test_ds = train_test_split(dataset, train_size = 0.8, random_state = 42)
val_ds, test_ds = train_test_split(val_test_ds, train_size = 0.5, random_state = 42)

## **Viewing splitted data lengths**

In [20]:
len(train_ds), len(val_ds), len(test_ds) 

(26577, 3322, 3323)

## **Pre-defined Hyperparameters**

In [21]:
batch_size = 32
num_epochs = 5
learning_rate = 1e-4

optimizer = AdamW(
    model.parameters(),
    lr = learning_rate
  )

summary_writer = SummaryWriter(log_dir="tensorboard")

In [22]:
def collate_fn(batch):
    return{
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'texts': torch.stack([x['texts'] for x in batch]),
    }

## **Dataloader Creation**

In [23]:
train_loader = DataLoader(
    train_ds,
    collate_fn = collate_fn,
    batch_size = batch_size,
    shuffle = True
)

valid_loader = DataLoader(
    val_ds,
    collate_fn = collate_fn,
    batch_size = batch_size
)

test_loader = DataLoader(
    test_ds,
    collate_fn = collate_fn,
    batch_size = batch_size
)

In [24]:
torch.save(test_loader, 'test_loader.pkl')

## **Dataloader iteration**

In [25]:
batch = iter(train_loader)
first = next(batch)
first

{'pixel_values': tensor([[[[-1.7412, -1.6898, -1.7069,  ..., -1.6727, -1.7069, -1.7412],
           [-1.7240, -1.6898, -1.6898,  ..., -1.6384, -1.6555, -1.6898],
           [-1.7240, -1.6898, -1.7069,  ..., -1.6384, -1.6384, -1.6727],
           ...,
           [-1.7240, -1.7412, -1.7925,  ..., -1.7412, -1.7583, -1.7754],
           [-1.8097, -1.8097, -1.8782,  ..., -1.7240, -1.7240, -1.8097],
           [-1.8268, -1.8439, -1.8953,  ..., -1.7583, -1.7925, -1.8782]],
 
          [[-1.7031, -1.7031, -1.7381,  ..., -1.6856, -1.6856, -1.7031],
           [-1.6856, -1.7031, -1.7031,  ..., -1.6506, -1.6681, -1.6856],
           [-1.6856, -1.6856, -1.7031,  ..., -1.6681, -1.6856, -1.6856],
           ...,
           [-1.7556, -1.7381, -1.7731,  ..., -1.7906, -1.7731, -1.7731],
           [-1.8256, -1.7906, -1.8431,  ..., -1.7731, -1.7556, -1.7731],
           [-1.8256, -1.8081, -1.8431,  ..., -1.7731, -1.7906, -1.8431]],
 
          [[-0.7587, -0.7587, -0.7587,  ..., -0.6018, -0.6541, -0.6890

## **Training function**

In [27]:
def training(batch, model, device):
  pixel_values = batch['pixel_values'].to(device)
  text = batch['texts'].to(device)
  model.to(device)
  outputs = model(
      pixel_values = pixel_values,
      labels = text
  )
  loss = outputs.loss
  return outputs, loss

## **Validation function**

In [28]:
def validation(batch, model, device):
  pixel_values = batch['pixel_values'].to(device)
  targets = batch['texts'].to(device)
  model.eval()
  model.to(device)
  outputs = model(
      pixel_values = pixel_values,
      labels = targets
  )
  loss = outputs.loss
  return outputs, loss, targets

## **Saving checkpoint**

In [29]:
def save_checkpoint(state, checkpoint_path):
    torch.save(state, checkpoint_path)

## **Loading a saved model**

In [30]:
def load_checkpoint(model, checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    return model

In [31]:
# model = load_checkpoint(model = model, checkpoint_path = '/kaggle/working/model_checkpoint.pt')
# # model.eval()
# model.state_dict()

In [37]:
!pip install rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=bf134c1ee801b978aa6ae00185c3110076a43bf493cd900f5538f7bec49934f2
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [40]:
import evaluate

rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

def compute_metrics(eval_pred):
    
    preds = eval_pred.predictions
    
    labels = eval_pred.label_ids
    
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens = True)
    
    labels_str = tokenizer.batch_decode(labels, skip_special_tokens = True)    
    
    rouge_result = rouge.compute(predictions = pred_str, references = labels_str)
    
    rouge_result = {k: round(v*100, 4) for k, v in rouge_result.items()}
    
    bleu_result = bleu.compute(predictions = pred_str, references = labels_str)
    
    generation_length = bleu_result['translation_length']
    
    return {
        **rouge_result,
        'bleu' : round(bleu_result['bleu']*100, 4),
        'gen_len' : bleu_result['translation_length'] / len(preds)
    }
    

## **Training the model**

In [41]:
valid_min_loss = np.inf

for epoch in range(num_epochs):
    
    model.train()
    
    train_loss = 0.0
    valid_loss = 0.0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        
        outputs, loss = training(batch, model, device)
        
        loss.backward()
        
        optimizer.step()
        
        optimizer.zero_grad()
        
        train_loss += loss.item()
        
    predictions, labels = [], []
        
    model.eval()
    
    metrics = 0
        
    with torch.no_grad():

        for batch in valid_loader:

            outputs, loss, targets = validation(batch, model, device)

            valid_loss += loss.item()

            logits = outputs.logits.detach().cpu()

            predictions.extend(logits.argmax(dim=-1).tolist())

            labels.extend(targets.tolist())

        eval_prediction = EvalPrediction(predictions = predictions, label_ids = labels)
        
        metrics = compute_metrics(eval_prediction)
        
    train_loss /= len(train_loader)
    valid_loss /= len(valid_loader)

    if valid_loss < valid_min_loss:
        print('Saving checkpoint.......')

        checkpoint = {
            'epoch': epoch+1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'valid_min_loss': valid_loss
        }

        valid_min_loss = valid_loss
        save_checkpoint(checkpoint, checkpoint_path = f"/kaggle/working/model_checkpoint.pt")

    print(f"Epoch: {epoch+1}/{num_epochs}\t Train loss: {train_loss:.4f}\t Validation loss: {valid_loss:.4f}\t Bleu: {metrics['bleu']:.4f}\t "+
         f"ROUGE-1: {metrics['rouge1']:.4f}\t ROUGE-2: {metrics['rouge2']:.4f}\t ROUGE-L: {metrics['rougeL']:.4f}")
     
    summary_writer.add_scalar('valid_loss', valid_loss)
    summary_writer.add_scalar('bleu', metrics['bleu'])
    summary_writer.add_scalar('rouge1', metrics['rouge1'])
    summary_writer.add_scalar('rouge2', metrics['rouge2'])    
    summary_writer.add_scalar('rougeL', metrics['rougeL'])

Epoch 1/5 - Training:   0%|          | 0/831 [00:00<?, ?it/s]

Saving checkpoint.......
Epoch: 1/5	 Train loss: 0.9492	 Validation loss: 1.0880	 Bleu: 12.6386	 ROUGE-1: 15.3508	 ROUGE-2: 5.6895	 ROUGE-L: 15.2821


Epoch 2/5 - Training:   0%|          | 0/831 [00:00<?, ?it/s]

Saving checkpoint.......
Epoch: 2/5	 Train loss: 0.8162	 Validation loss: 1.0852	 Bleu: 14.2526	 ROUGE-1: 16.4423	 ROUGE-2: 6.4663	 ROUGE-L: 16.3186


Epoch 3/5 - Training:   0%|          | 0/831 [00:00<?, ?it/s]

Epoch: 3/5	 Train loss: 0.6909	 Validation loss: 1.1323	 Bleu: 15.3607	 ROUGE-1: 16.8316	 ROUGE-2: 6.9424	 ROUGE-L: 16.7022


Epoch 4/5 - Training:   0%|          | 0/831 [00:00<?, ?it/s]

Epoch: 4/5	 Train loss: 0.5665	 Validation loss: 1.2006	 Bleu: 15.7254	 ROUGE-1: 17.0397	 ROUGE-2: 7.1559	 ROUGE-L: 16.9022


Epoch 5/5 - Training:   0%|          | 0/831 [00:00<?, ?it/s]

Epoch: 5/5	 Train loss: 0.4551	 Validation loss: 1.3011	 Bleu: 16.0362	 ROUGE-1: 17.2015	 ROUGE-2: 7.2207	 ROUGE-L: 17.1058


In [42]:
torch.save(model, "generator_model.pkl")

In [43]:
from IPython.display import FileLink
FileLink('generator_model.pkl')

In [None]:
# m = torch.load('model_checkpoint.pt')
# m['state_dict']

from IPython.display import clear_output
clear_output()

In [None]:
model.load_state_dict(m['state_dict'])