<a href="https://colab.research.google.com/github/MattBoraske/Reddit_AITA_Finetuning/blob/main/fine-tuning/flanT5_reddit_AITA_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Flan-T5 Reddit AITA Finetuning

## Configure Training

In [None]:
# Install the notebook's Python dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may pick up different releases — consider pinning.
%pip install transformers accelerate torch evaluate datasets rouge_score peft bitsandbytes tensorboard py7zr

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so checkpoints and logs can be written to it.
# force_remount=True requests a fresh mount even if Drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Change the working directory to the Drive folder where model checkpoints and logs
# should be saved. Edit this path to suit your own Drive layout; relative paths used
# later in the notebook (e.g. the training output directory) resolve against it.
%cd /content/drive/MyDrive/AITA_FINETUNED_MODELS

In [4]:
############################
## FINE-TUNING PARAMETERS ##
############################

# Identifier of the pretrained checkpoint; passed to `from_pretrained` for both
# the tokenizer and the base model.
BASE_MODEL_NAME = "google/flan-t5-xxl"
# Name of the fine-tuned model; used as the training output directory and as the
# repository name when the adapter is pushed to the Hub.
FINETUNED_MODEL_NAME = "flan-t5-xxl-reddit-AITA-top-2500"
# Identifier of the dataset passed to `load_dataset` for fine-tuning.
FINETUNING_DATASET = "MattBoraske/reddit-AITA-submissions-and-comments-top-2500"

## Run Training

### Loading of PEFT Model

In [5]:
import torch

def get_model_memory_size(model):
    """Return the total number of bytes occupied by the model's parameters.

    Each parameter contributes (number of elements) x (bytes per element);
    only what `model.parameters()` yields is counted.
    """
    return sum(
        tensor.nelement() * tensor.element_size()
        for tensor in model.parameters()
    )

def get_model_memory_size_gb(model):
    """Return the model's parameter footprint in gigabytes (bytes / 1024**3)."""
    bytes_per_gb = 1024 ** 3
    return get_model_memory_size(model) / bytes_per_gb

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
import torch

# Tokenizer that matches the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# bitsandbytes settings: 4-bit weights with double quantization, bfloat16 compute dtype
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized base model; device_map="auto" delegates weight placement to the loader
model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config,
    device_map="auto",
)

print(f"Base Model memory size: {get_model_memory_size_gb(model)} GB")

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Define LoRA Config: rank-16 adapters (alpha 32, dropout 0.05) injected into the
# modules named "q" and "v" (the attention query and value projections in T5);
# bias terms are not trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Prepare the k-bit quantized model (loaded in 4-bit above) for training
model = prepare_model_for_kbit_training(model)

# Add the LoRA adapter and report how many parameters are trainable
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# No explicit `.to("cuda")` here: the base model was loaded with device_map="auto",
# so its weights are already placed, and moving a bitsandbytes-quantized model by
# hand is unsupported and can break a sharded or offloaded placement.
print(f"PEFT Model memory size: {get_model_memory_size_gb(peft_model)} GB")

### Preparation of Model Inputs

In [9]:
# Token length every encoder input is truncated/padded to during preprocessing.
FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE = 1024
# Token length every decoder target (label sequence) is truncated/padded to during preprocessing.
FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE = 256

def preprocess_function(sample):
    """Tokenize instructions and target comments into model-ready features.

    Args:
        sample: A mapping with the keys ``'flanT5_instruction'`` (encoder text)
            and ``'top_comment_1'`` (target text). Each value may be a single
            string or a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer output for the instructions, extended with a ``"labels"``
        entry holding the target token ids. Inputs are truncated/padded to
        ``FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE`` tokens and targets to
        ``FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE`` tokens. Padding positions in
        the labels are set to -100 so they are ignored by the loss.
    """
    # tokenize inputs
    model_inputs = tokenizer(
        sample['flanT5_instruction'],
        max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE,
        padding='max_length',
        truncation=True,
    )

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(
        text_target=sample["top_comment_1"],
        max_length=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE,
        padding='max_length',
        truncation=True,
    )

    # Targets are padded to a fixed length here, so the pad ids must be masked
    # now: -100 is the index the cross-entropy loss ignores. Left as the real
    # pad id, every padding position would be scored as a prediction target.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Replace pad token ids with the ignore index, keep everything else.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single-example call: a flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

In [None]:
from datasets import load_dataset

dataset = load_dataset(FINETUNING_DATASET)

# Raw columns removed from the dataset once the tokenized features have been built
columns_to_drop = [
    "submission_title",
    "submission_text",
    "submission_score",
    "submission_url",
    "submission_date",
    *[f"top_comment_{rank}" for rank in range(1, 11)],
    *[f"top_comment_{rank}_classification" for rank in range(1, 11)],
    "ambiguity_score",
    "AITA_decision",
    "flanT5_instruction",
    "llama2_instruction",
]

# Tokenize in batches
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=columns_to_drop,
    batched=True,
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

### Train PEFT Model

In [11]:
from transformers import DataCollatorForSeq2Seq

# Data collator: batches the tokenized features for seq2seq training
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    # -100 is the index ignored by PyTorch's cross-entropy loss, so any label padding
    # added by the collator never contributes to the loss. (0 is T5's real pad token
    # id; padding labels with it would make the model be scored on pad positions.)
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = FINETUNED_MODEL_NAME

# Hyperparameters, logging and checkpointing for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=5e-4,  # from FLAN-T5 paper - https://arxiv.org/pdf/2210.11416.pdf
    auto_find_batch_size=True,
    # --- logging: every 10 steps (plus the first step), reported to TensorBoard ---
    report_to="tensorboard",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    # --- checkpointing: every 500 steps ---
    save_strategy="steps",
    save_steps=500,
)

# Trainer over the LoRA-wrapped model and the tokenized train split
trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
)

In [None]:
# train model
# Runs the fine-tuning loop with the trainer configured above.
trainer.train()
# Upload the trained PEFT model to the Hub repository "MattBoraske/<FINETUNED_MODEL_NAME>".
# NOTE(review): presumably requires prior Hub authentication, which is not shown in this notebook section — confirm.
peft_model.push_to_hub(f"MattBoraske/{FINETUNED_MODEL_NAME}")