# ViT-GPT2: fine-tuning on a Portuguese dataset

Based on: https://ankur3107.github.io/blogs/the-illustrated-image-captioning-using-transformers/

## Imports

In [1]:
!pip install 'transformers[torch]' datasets evaluate nltk numpy pandas Pillow
!pip install accelerate -U

[31mERROR: Invalid requirement: 'accelerate -U'[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import json
from PIL import Image
import datasets
import pandas as pd
import evaluate
import nltk
import numpy as np
# Fetch the 'punkt' sentence-tokenizer data; nltk.sent_tokenize is called later
# in postprocess_text. The call's return value is what the cell displays.
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/mariaeas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Setting models

In [2]:
# Pretrained checkpoints: a ViT image encoder and a Portuguese GPT-2 text decoder.
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decoder_model = "pierreguillou/gpt2-small-portuguese"

# Join both checkpoints into a single encoder-decoder model. The decoder's
# cross-attention weights are not part of the GPT-2 checkpoint and are newly
# initialized (see the warning printed below), so the model needs fine-tuning.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=image_encoder_model,
    decoder_pretrained_model_name_or_path=text_decoder_model,
)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at pierreguillou/gpt2-small-portuguese and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_pro

In [3]:
# Image preprocessor matching the encoder checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path=image_encoder_model,
)
# Text tokenizer matching the decoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=text_decoder_model,
)



In [4]:
# Reuse the end-of-sequence token as the padding token. This must run before
# tokenizer.pad_token_id is read on the last line of this cell.
tokenizer.pad_token = tokenizer.eos_token
# Mirror the tokenizer's special-token ids into the model config.
model.config.eos_token_id = tokenizer.eos_token_id
# The decoder's start token is the tokenizer's BOS token.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [5]:
# Save the freshly assembled (not yet fine-tuned) model together with its image
# preprocessor and tokenizer into one directory.
output_dir = "vit-gpt-portuguese-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
# Last expression: its return value (the written tokenizer files) is displayed.
tokenizer.save_pretrained(output_dir)



('vit-gpt-portuguese-model/tokenizer_config.json',
 'vit-gpt-portuguese-model/special_tokens_map.json',
 'vit-gpt-portuguese-model/vocab.json',
 'vit-gpt-portuguese-model/merges.txt',
 'vit-gpt-portuguese-model/added_tokens.json',
 'vit-gpt-portuguese-model/tokenizer.json')

## Data split

In [6]:
# Load the annotation file. The encoding is stated explicitly so the accented
# Portuguese captions decode the same way regardless of the platform's default
# text encoding.
with open("pracegover_dataset.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)

In [7]:
def to_coco_format(dataset, split, image_dir="images/63k-subset"):
    """Flatten one split of the annotation dict into COCO-style records.

    Only the first sentence of each image entry is used as its caption.
    Entries without any sentence are skipped, since they carry no caption.

    Args:
        dataset: Mapping with an ``'images'`` list. Each entry provides
            ``'split'``, ``'filename'`` and a ``'sentences'`` list whose items
            provide ``'imgid'``, ``'sentid'``, ``'raw'`` and ``'tokens'``.
        split: Split name to keep; compared against each entry's ``'split'``.
        image_dir: Directory prefix used to build each record's
            ``'image_path'``. Defaults to the location used by this notebook.

    Returns:
        A ``(records, max_len)`` tuple: the list of record dicts, and the
        largest ``len(tokens)`` among the kept captions (0 if none were kept).
    """
    records = []
    max_len = 0
    for entry in dataset['images']:
        if entry['split'] != split:
            continue

        sentences = entry.get('sentences')
        if not sentences:
            # No caption available; indexing [0] would raise otherwise.
            continue

        first_sentence = sentences[0]
        max_len = max(max_len, len(first_sentence['tokens']))

        records.append({
            'image_id': first_sentence['imgid'],
            'caption_id': first_sentence['sentid'],
            'caption': first_sentence['raw'],
            # Fixed placeholder dimensions; they are not read from the image.
            'height': 100,
            'width': 100,
            'file_name': entry['filename'],
            'coco_url': None,
            'image_path': f"{image_dir}/{entry['filename']}",
        })

    return records, max_len

In [8]:
# Convert every split. The max caption lengths are measured here, on the full
# splits, before any subsetting below.
train_data, train_max_len = to_coco_format(dataset, "train")
val_data, val_max_len = to_coco_format(dataset, "val")
test_data, test_max_len = to_coco_format(dataset, "test")

# Keep only the leading fraction of each split (int() truncates the count).
SUBSET_FRACTION = 0.05
train_data = train_data[:int(len(train_data) * SUBSET_FRACTION)]
val_data = val_data[:int(len(val_data) * SUBSET_FRACTION)]
test_data = test_data[:int(len(test_data) * SUBSET_FRACTION)]

In [9]:
# Longest caption over all three splits, as counted by to_coco_format from each
# first sentence's 'tokens' list.
# NOTE(review): this value is later passed to the tokenizer as max_target_length;
# presumably the dataset's 'tokens' are words rather than tokenizer subwords —
# confirm the two lengths are comparable.
max_len = max(train_max_len, val_max_len, test_max_len)
max_len

443

In [10]:
# Number of records kept per split, in (train, val, test) order.
tuple(len(split_records) for split_records in (train_data, val_data, test_data))

(1894, 622, 630)

In [11]:
# Wrap each list of records in a Hugging Face Dataset (via a DataFrame).
train_dataset, val_dataset, test_dataset = (
    datasets.Dataset.from_pandas(pd.DataFrame(data=split_records))
    for split_records in (train_data, val_data, test_data)
)

## Preprocessing

In [12]:
def tokenization_fn(captions, max_target_length):
    """Tokenize captions into fixed-length lists of label ids.

    Args:
        captions: List of caption strings.
        max_target_length: Exact length of every returned id list.

    Returns:
        One list of token ids per caption, each padded or truncated to
        ``max_target_length``.
    """
    # truncation=True is required here: with padding="max_length" alone, a
    # caption longer than max_length is returned at its full length, which
    # produces rows of unequal length that cannot be stacked into a batch.
    # NOTE(review): padded positions hold the pad token id, not -100 —
    # confirm whether the training loss is meant to ignore them.
    labels = tokenizer(captions,
                       padding="max_length",
                       truncation=True,
                       max_length=max_target_length).input_ids

    return labels


def _load_rgb_images(image_paths, check_image=True):
    """Open every path as an RGB image and report which ones loaded.

    Args:
        image_paths: Iterable of image file paths.
        check_image: If True, a path that fails to open or decode is skipped;
            if False, the exception propagates.

    Returns:
        ``(images, to_keep)``: the successfully loaded images, and one boolean
        per input path saying whether that path is present in ``images``.
    """
    images = []
    to_keep = []
    for image_file in image_paths:
        try:
            # The context manager closes the file handle. convert() forces the
            # full decode to happen inside this try block (Image.open alone is
            # lazy) and returns an in-memory RGB copy detached from the file.
            with Image.open(image_file) as img:
                images.append(img.convert("RGB"))
        except Exception:
            if not check_image:
                raise
            to_keep.append(False)
        else:
            to_keep.append(True)
    return images, to_keep


def feature_extraction_fn(image_paths, check_image=True):
    """Load images and run them through the feature extractor.

    Args:
        image_paths: Iterable of image file paths.
        check_image: If True, unreadable images are dropped from the result;
            if False, the first failure raises.

    Returns:
        The ``pixel_values`` array for the images that loaded. With
        ``check_image=True`` this can hold fewer rows than ``image_paths``.
    """
    images, _ = _load_rgb_images(image_paths, check_image=check_image)
    encoder_inputs = feature_extractor(images=images, return_tensors="np")
    return encoder_inputs.pixel_values


def preprocess_fn(examples, max_target_length, check_image=True):
    """Turn a batch of records into model inputs (labels and pixel values).

    Args:
        examples: Batch mapping with ``'image_path'`` and ``'caption'`` lists.
        max_target_length: Length passed on to ``tokenization_fn``.
        check_image: If True, examples whose image cannot be loaded are
            dropped from the batch; if False, the failure raises.

    Returns:
        Dict with ``'labels'`` and ``'pixel_values'``, aligned row by row.
    """
    image_paths = examples['image_path']
    captions = examples['caption']

    images, to_keep = _load_rgb_images(image_paths, check_image=check_image)
    # Drop the captions of images that failed to load so both outputs stay
    # aligned. Returning fewer rows than were received relies on the map call
    # removing all of the original columns.
    kept_captions = [caption for caption, keep in zip(captions, to_keep) if keep]

    model_inputs = {}
    model_inputs['labels'] = tokenization_fn(kept_captions, max_target_length)
    model_inputs['pixel_values'] = feature_extractor(
        images=images, return_tensors="np"
    ).pixel_values

    return model_inputs

In [13]:
# Bundle the three splits under their split names so they can be processed
# together.
split_datasets = {
    "train": train_dataset,
    "test": test_dataset,
    "validation": val_dataset,
}
ds = datasets.DatasetDict(split_datasets)

In [14]:
# Run preprocess_fn over every split. batched=True hands the function whole
# column-wise batches (it reads examples['image_path'] / examples['caption'] as
# lists). remove_columns drops all original columns, so the result holds only
# what preprocess_fn returns: 'labels' and 'pixel_values'.
processed_dataset = ds.map(
    function=preprocess_fn,
    batched=True,
    # Every caption is padded to the longest caption length computed earlier.
    fn_kwargs={"max_target_length": max_len},
    remove_columns=ds['train'].column_names
)

Map: 100%|██████████| 1894/1894 [02:12<00:00, 14.25 examples/s]
Map: 100%|██████████| 630/630 [00:53<00:00, 11.73 examples/s]
Map: 100%|██████████| 622/622 [00:30<00:00, 20.70 examples/s]


Map:  53%|█████▎    | 1000/1894 [01:10<00:46, 19.26 examples/s]

## Training the model

In [15]:
# Training configuration; everything not listed keeps the library default.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and the final model are written.
    output_dir="./image-captioning-output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate once at the end of every epoch.
    evaluation_strategy="epoch",
    # Evaluate on generated captions so compute_metrics can score text.
    predict_with_generate=True,
)

In [16]:
# ROUGE metric object; compute_metrics calls metric.compute on decoded text.
metric = evaluate.load("rouge")

In [17]:
# When True, compute_metrics maps -100 entries in the labels back to the pad
# token id before decoding them.
ignore_pad_token_for_loss = True


def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, using NLTK's Portuguese sentence tokenizer.

    Args:
        preds: List of decoded prediction strings.
        labels: List of decoded reference strings.

    Returns:
        ``(preds, labels)`` as two new lists of normalized strings.
    """
    def _one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        sentences = nltk.sent_tokenize(text.strip(), language="portuguese")
        return "\n".join(sentences)

    normalized_preds = [_one_sentence_per_line(pred) for pred in preds]
    normalized_labels = [_one_sentence_per_line(label) for label in labels]
    return normalized_preds, normalized_labels


def compute_metrics(eval_preds):
    """Score generated captions against references with ROUGE.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer; ``predictions`` may be a tuple whose
            first element holds the generated ids.

    Returns:
        Dict of the ROUGE scores scaled to 0-100 and rounded to 4 decimals,
        plus ``'gen_len'``, the mean count of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Generated ids can carry -100 as filler when sequences of different
    # lengths are gathered; -100 is not a valid token id and cannot be decoded,
    # so map it to the pad token first.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                     decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Length of each prediction, not counting pad (and former -100) positions.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [18]:
from transformers import default_data_collator

# Build the trainer from the model, its training configuration and the
# preprocessed splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['validation'],
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # The image preprocessor is passed in the trainer's tokenizer slot.
    # NOTE(review): presumably so it is saved alongside checkpoints — confirm.
    tokenizer=feature_extractor,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
# Fine-tune, then write the model and the text tokenizer to the same directory.
final_model_dir = "./image-captioning-output"
trainer.train()
trainer.save_model(final_model_dir)
# The trainer was given the feature extractor in its tokenizer slot, so the
# text tokenizer is saved explicitly here.
tokenizer.save_pretrained(final_model_dir)

  0%|          | 0/1422 [00:00<?, ?it/s]