## **Reference**
- https://github.com/google-research/t5x

In [None]:
# Show the GPU, driver version and CUDA version visible to this runtime.
!nvidia-smi

Fri Jul 26 08:53:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Installing the initial requirements

In [None]:
!pip install -qqq pytesseract transformers evaluate datasets rouge-score nltk tensorboard py7zr --upgrade --progress-bar off

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.
tensorflow 2.15.0 requires tensorboard<2.16,>=2.15, but you have tensorboard 2.17.0 which is incompatible.[0m[31m
[0m

In [None]:
# Install git-lfs through apt, answering yes to prompts (--yes).
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


# Loading the Dataset

In [None]:
import pandas as pd
from datasets import Dataset

# Read each split from its CSV file into a pandas DataFrame (train first, then validation).
train_excel_file, val_excel_file = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'val_data.csv')
)

# Wrap each DataFrame in a Hugging Face `Dataset`.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_excel_file, val_excel_file)
)

# Report the number of rows in each split.
print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

Training dataset size: 4164
Validation dataset size: 200


# Fine-Tuning FLAN-T5

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub id of the checkpoint to fine-tune.
model_id = "google/flan-t5-base"

# Fetch the tokenizer that belongs to the checkpoint ...
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)
# ... and then the pretrained sequence-to-sequence model itself.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import torch

# Pick the compute device: a CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU not available. Using CPU.")

# Move the model to the selected device (GPU or CPU).
model.to(device)
# Enable gradient checkpointing on the model before training.
model.gradient_checkpointing_enable()

# Report CUDA memory usage in MB (allocated, then reserved).
# Only meaningful when the model actually lives on a GPU, so skip it on CPU.
if device.type == "cuda":
    print(torch.cuda.memory_allocated(device) / 1024**2, "MB")
    print(torch.cuda.memory_reserved(device) / 1024**2, "MB")

GPU is available. Using Tesla T4
947.4345703125 MB
1014.0 MB


## Tokenizing the dataset

In [None]:
from datasets import concatenate_datasets

# Concatenate both splits once so the length statistics cover every example.
full_dataset = concatenate_datasets([train_dataset, val_dataset])

# Raw text columns to drop after tokenizing. The names are case-sensitive and must match
# the dataset's real columns (the same ones the preprocessing step removes).
raw_text_columns = ["Judgement", "Perspective-based Summary"]

# Longest tokenized source document.
# NOTE(review): `truncation=True` without `max_length` presumably truncates at the
# tokenizer's model maximum, so this value is capped (512 in the recorded run). It also
# does not include the "summarize: " prefix added during preprocessing — confirm this
# is the intended limit.
tokenized_inputs = full_dataset.map(
    lambda x: tokenizer(x["Judgement"], truncation=True),
    batched=True,
    remove_columns=raw_text_columns,
)
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized target summary (same truncation caveat as above).
tokenized_targets = full_dataset.map(
    lambda x: tokenizer(x["Perspective-based Summary"], truncation=True),
    batched=True,
    remove_columns=raw_text_columns,
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/4364 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/4364 [00:00<?, ? examples/s]

Max target length: 512


In [None]:
from random import randrange

# Draw one training example at a random index.
random_index = randrange(len(train_dataset))
sample = train_dataset[random_index]


def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of judgements and their summaries for seq2seq training.

    Args:
        sample: A batch mapping with the keys "Judgement" and
            "Perspective-based Summary", each holding a list of strings.
        padding: Padding strategy forwarded to the tokenizer. With the default
            "max_length", padded label positions are replaced by -100.

    Returns:
        The tokenizer output for the prefixed judgements, with the tokenized
        summaries stored under the "labels" key.
    """
    # Every source text gets the task prefix before tokenization.
    prefixed_sources = ["summarize: " + text for text in sample["Judgement"]]
    model_inputs = tokenizer(
        prefixed_sources,
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )

    # Tokenize the summaries as targets.
    target_encoding = tokenizer(
        text_target=sample["Perspective-based Summary"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )
    label_ids = target_encoding["input_ids"]

    # With fixed-length padding, swap the pad token id for -100 in the labels.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Apply the preprocessing to both splits in batches, dropping the raw text columns
# so only the tokenizer outputs and labels remain.
shared_map_kwargs = dict(
    batched=True,
    remove_columns=["Judgement", "Perspective-based Summary"],
)
tokenized_train_dataset = train_dataset.map(preprocess_function, **shared_map_kwargs)
tokenized_val_dataset = val_dataset.map(preprocess_function, **shared_map_kwargs)

Map:   0%|          | 0/4164 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# List the feature (column) names of the tokenized training split.
feature_names = list(tokenized_train_dataset.features)
print(f"Keys of tokenized dataset: {feature_names}")

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
from transformers import DataCollatorForSeq2Seq

# Fill value used for label positions added by batch padding; it matches the -100
# that `preprocess_function` writes over padded label tokens.
label_pad_token_id = -100

# Batch collator for seq2seq training; padded lengths are rounded up to a multiple of 8.
collator_options = {
    "model": model,
    "label_pad_token_id": label_pad_token_id,
    "pad_to_multiple_of": 8,
}
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)

## Setting up Training Arguments

In [None]:
import torch
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hub repository id (placeholder — replace with "<user>/<model>"); it is also used as the
# local output directory for checkpoints and logs.
repository_id = "username/model_name"

# Mixed-precision choice.
# fp16 is deliberately disabled: the recorded run with fp16=True logged a training loss of
# 0.0 and no validation loss, the typical symptom of fp16 overflow with T5-family
# checkpoints. bf16 is used instead when the GPU supports it natively (compute
# capability >= 8, i.e. Ampere or newer); otherwise training runs in full fp32.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{repository_id}",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,
    bf16=use_bf16,
    learning_rate=5e-5,
    num_train_epochs=3,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # switch if the installed version warns about or rejects `evaluation_strategy`.
    evaluation_strategy="epoch",
    # Must match the evaluation strategy because `load_best_model_at_end` is enabled.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="tensorboard",
    # Nothing is pushed during training; uploading happens explicitly afterwards.
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Assemble the trainer from the model, the training arguments, the batch collator and
# the tokenized train/validation splits.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": tokenized_train_dataset,
    "eval_dataset": tokenized_val_dataset,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)



## Training the model

In [None]:
# Start training. The call's return value (a TrainOutput) is the cell's last expression,
# so it is displayed as the cell output.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1563, training_loss=0.0, metrics={'train_runtime': 2618.5526, 'train_samples_per_second': 4.771, 'train_steps_per_second': 0.597, 'total_flos': 8553988846780416.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
# Save the tokenizer into the local directory named by `repository_id`, which is also the
# trainer's output directory, so it sits alongside the model files.
tokenizer.save_pretrained(repository_id)
# Generate the model card for the run.
trainer.create_model_card()
# Push the results to the hub. The target repository is taken from `hub_model_id` in the
# training arguments; the first positional argument of `Trainer.push_to_hub` is the commit
# message, so the repository id must not be passed here.
trainer.push_to_hub()