<a href="https://colab.research.google.com/github/MattBoraske/Reddit_AITA_LLMs/blob/main/FLAN_T5_AITA_Peft_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformers accelerate torch evaluate datasets rouge_score peft bitsandbytes tensorboard py7zr

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manyli

In [None]:
# mount gdrive to save results
# Later cells write checkpoints, TensorBoard logs and the results JSON
# under /content/drive/MyDrive/..., so the drive must be mounted first.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning

/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning


In [None]:
from datasets import load_dataset

# Load the AITA submissions dataset by its Hugging Face Hub id.
# The result is indexed by split name below (dataset['train'], dataset['test']).
dataset = load_dataset("MattBoraske/AITA_subreddit_submissions_flanT5_filtered")

Downloading readme:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/88345 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9826 [00:00<?, ? examples/s]

In [None]:
# Maximum token lengths used throughout the notebook:
# - encoder size: max_length when tokenizing the prefixed submission text
# - decoder size: max_length when tokenizing target comments, and
#   max_new_tokens when generating in evaluate_model
FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE = 1024
FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE = 256

In [None]:
# One description per AITA verdict category. Each entry starts with "\n" so
# that joining them with an empty separator puts every category on its own line.
AITA_classifications = [
    "\nYou're The A**hole (YTA) when the first person (you, the writer of the conflict context) is causing the conflict.",
    "\nNot The A**hole (NTA) when a third person party (anyone but the writer of the conflict context) is causing the conflict.",
    "\nNo A**holes Here (NAH) when no parties are causing the conflict.",
    "\nEveryone Sucks Here (ESH) when all parties are causing the conflict.",
    "\nMore Information Needed (INFO) when a classification can not be classified using the conflict context."
]

# Prompt text prepended to every submission, both when tokenizing training
# inputs and when building prompts for generation. The submission text is
# appended directly after the trailing "Conflict Context: ".
INSTRUCTION_PREFIX = f"Classify the interpersonal conflict into one of the following categories and provide a justification for your choice. The categories are: {''.join(AITA_classifications)}\n\nConflict Context: "

In [None]:
def preprocess_function(sample):
    """Tokenize a batch of submissions and their top comments for seq2seq training.

    Args:
        sample: Batch dict with ``"submission_text"`` and ``"top_comment_1"``
            entries, each a list of strings (this function is applied with
            ``batched=True``).

    Returns:
        The tokenizer output for the instruction-prefixed inputs, with an added
        ``"labels"`` entry holding the tokenized targets. Padding positions in
        the labels are set to -100 so they do not contribute to the loss.
    """
    # add prefix to the input for t5
    inputs = [INSTRUCTION_PREFIX + item for item in sample["submission_text"]]

    # tokenize inputs, padded/truncated to the fixed encoder length
    model_inputs = tokenizer(inputs, max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE, padding='max_length', truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["top_comment_1"], max_length=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE, padding='max_length', truncation=True)

    # Targets are padded to a fixed length, so most label positions are padding.
    # Replace the pad token id with -100 (the index the loss ignores); otherwise
    # the model is trained to predict pad tokens and the loss is dominated by them.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")

# Tokenize every split and drop all original columns so that only the fields
# produced by preprocess_function remain. Reading the column names from the
# dataset avoids maintaining a hand-typed list that must match its schema.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/88345 [00:00<?, ? examples/s]

Map:   0%|          | 0/9826 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


### Load model and LoRA config

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
import torch

# bitsandbytes settings: load weights in 4-bit with double quantization and
# run computation in bfloat16. bnb_4bit_quant_type is not set here; the
# model.config output further down reports it as "fp4".
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized FLAN-T5-XXL checkpoint; device_map="auto" lets the
# library decide where the weights are placed.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xxl", device_map = "auto", quantization_config=quant_config, torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
import torch

def get_model_memory_size(model):
    """Return the number of bytes occupied by the model's parameters.

    Only tensors yielded by ``model.parameters()`` are counted.
    """
    # numel() is the element count of a tensor, element_size() the number of
    # bytes per element; their product is the tensor's storage size in bytes.
    return sum(p.numel() * p.element_size() for p in model.parameters())

def get_model_memory_size_gb(model):
    """Return the parameter size of ``model`` in gigabytes (bytes / 1024**3)."""
    bytes_per_gb = 1024 ** 3
    return get_model_memory_size(model) / bytes_per_gb

# Report the parameter footprint of the loaded (quantized) model.
print(f"Model memory size: {get_model_memory_size_gb(model)} GB")

Model memory size: 12.116172790527344 GB


In [None]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 4096)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 4096)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (k): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (o): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (relative_attention_bias): Embedding(32, 64)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=4096, out_features=10240, bias=False)
              (wi_1): Linear4bit(in_features=4096, out_features=

In [None]:
model.config

T5Config {
  "_name_or_path": "google/flan-t5-xxl",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 10240,
  "d_kv": 64,
  "d_model": 4096,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 24,
  "num_heads": 64,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method":

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Define LoRA Config: rank-16 adapters on the attention query/value projections
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Prepare the quantized model for training. The model is loaded in 4-bit, so
# use the k-bit helper rather than the deprecated int8-specific one.
model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# No explicit .to("cuda") here: the base model was already placed on its
# devices by device_map="auto", and bitsandbytes-quantized models are not
# meant to be moved with .to().



trainable params: 18,874,368 || all params: 11,154,206,720 || trainable%: 0.16921300163961817


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 4096)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 4096)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4096, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=4096, bias=False)
                    )
                    (lora_embedding_A): ParameterDict(

In [None]:
from transformers import DataCollatorForSeq2Seq

# We want label padding to be ignored in the loss. -100 is the index the loss
# skips; 0 is T5's real pad token id (see pad_token_id in model.config) and
# would be scored as a genuine target token.
# Note: this only controls padding that the collator itself adds to labels.
label_pad_token_id = -100
# Data collator: pads each batch, rounding padded lengths up to a multiple of 8
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Name used for the checkpoint directory and for the Hub repository
MODEL_NAME = "flan-t5-xxl-AITA-peft-adapter"

# Checkpoints and logs are written to the mounted Google Drive
output_dir=f"/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning/{MODEL_NAME}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=5e-4, # from FLAN-T5 paper - https://arxiv.org/pdf/2210.11416.pdf
    num_train_epochs=2,
    # log the first step and then every 10 steps to TensorBoard
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_first_step=True,
    logging_steps=10,
    # write a checkpoint every 500 steps
    save_strategy="steps",
    save_steps=500,
    report_to="tensorboard",
)

# Create Trainer instance (training only; no eval dataset is passed)
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
)


In [None]:
# train model
trainer.train()
# Upload the adapter to the Hub once training returns.
# NOTE(review): the recorded run below ended in KeyboardInterrupt, so this line
# appears not to have been reached in that run; the next cell pushes the
# adapter under a different repo name instead.
peft_model.push_to_hub(f"MattBoraske/{MODEL_NAME}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,45.5732
10,26.9237
20,5.9179
30,1.3675
40,0.7463
50,0.6723
60,0.6436
70,0.6126
80,0.8172
90,0.8014


KeyboardInterrupt: 

In [None]:
# Push the current adapter weights to a separate Hub repo.
# NOTE(review): the "400-samples" suffix presumably describes the interrupted
# training run above — TODO confirm.
peft_model.push_to_hub("MattBoraske/flan-t5-xxl-AITA-peft-adapter-400-samples")

adapter_model.safetensors:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MattBoraske/flan-t5-xxl-AITA-peft-adapter-400-samples/commit/81ff3446cc74df267e50858f4eacd94694a740d6', commit_message='Upload model', commit_description='', oid='81ff3446cc74df267e50858f4eacd94694a740d6', pr_url=None, pr_revision=None, pr_num=None)

## Testing

In [None]:
# Switch to evaluation mode before generating (turns off the dropout layers
# shown in the model printout).
peft_model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 4096)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 4096)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4096, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=4096, bias=False)
                    )
                    (lora_embedding_A): ParameterDict(

In [None]:
INSTRUCTION_PREFIX

"Classify the interpersonal conflict into one of the following categories and provide a justification for your choice. The categories are: \nYou're The A**hole (YTA) when the first person (you, the writer of the conflict context) is causing the conflict.\nNot The A**hole (NTA) when a third person party (anyone but the writer of the conflict context) is causing the conflict.\nNo A**holes Here (NAH) when no parties are causing the conflict.\nEveryone Sucks Here (ESH) when all parties are causing the conflict.\nMore Information Needed (INFO) when a classification can not be classified using the conflict context.\n\nConflict Context: "

### Single sample test

In [None]:
from random import randrange
# Pick one test example at a random index. No seed is set, so a different
# example is selected on every run.
sample = dataset['test'][randrange(len(dataset["test"]))]
sample

{'submission_title': "AITA for refusing to have anything to do with my sister's wedding bc of who her fiancé was as a teenager?",
 'submission_text': 'i\'ll keep this as short as i can my [28m] twin sister [28f] recently got engaged to a guy [30m] who i\'ll call terry. in secondary school terry was a year above us and he and his friends bullied me *relentlessly*. i\'ll spare you the finer details, but i\'m talking "caused me lifelong psychological issues" type of shit. granted it was never physical but it was a literal daily onslaught of psychological and verbal abuse, to the point where i bunked off probably half of years 9 and 10 just so i could avoid it. i\'m not a hateful or spiteful person, but i absolutely despise terry for how he treated me when we were kids. i was hurt when my sister started dating him two years ago, but i tolerate being around him when i absolutely have to for the sake of keeping the peace. \n\nhowever, i will not be attending their wedding or having anything 

In [None]:
# Build the prompt the same way as in training (instruction prefix +
# submission text), truncate to the encoder limit, and move the ids to the GPU.
input_ids = tokenizer(INSTRUCTION_PREFIX + sample["submission_text"], max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE, return_tensors="pt", truncation=True).input_ids.cuda()

In [None]:
# Generate with nucleus sampling (top_p=0.9). The output length is capped by
# the shared decoder-length constant, the same limit evaluate_model uses,
# instead of a separate hard-coded 256.
outputs = peft_model.generate(input_ids=input_ids, max_new_tokens=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE, do_sample=True, top_p=0.9)


In [None]:
# Decode the first (only) generated sequence, dropping special tokens.
print(f"Output:\n{tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)}")

Output:
nta. that's a lot of emotional hurt and hurting people. you can't merely tell yourself that the person changed or that you should forget about it.


### Complete Testing Loop

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric: ROUGE, loaded through the `evaluate` library; used further down to
# score generated comments against the reference top comments.
metric = evaluate.load("rouge")

def evaluate_model(model, sample):
    """Generate a prediction for a single dataset example.

    Args:
        model: Model exposing ``generate``; called with GPU-resident input ids.
        sample: Mapping with ``"submission_text"`` (the conflict description)
            and ``"top_comment_1"`` (the reference comment).

    Returns:
        Tuple ``(input_text, prediction, label)``: the raw submission text, the
        decoded generated text, and the reference comment.
    """
    submission = sample["submission_text"]

    # Prefix the submission with the instruction, tokenize with truncation to
    # the encoder limit, and move the ids to the GPU.
    encoded = tokenizer(
        INSTRUCTION_PREFIX + submission,
        max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE,
        return_tensors="pt",
        truncation=True,
    )
    prompt_ids = encoded.input_ids.cuda()

    # Sample a completion (nucleus sampling, top_p=0.9) capped at the decoder
    # limit, then decode the first returned sequence without special tokens.
    generated = model.generate(
        input_ids=prompt_ids,
        do_sample=True,
        top_p=0.9,
        max_new_tokens=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE,
    )
    decoded = tokenizer.decode(generated[0].detach().cpu().numpy(), skip_special_tokens=True)

    return submission, decoded, sample['top_comment_1']

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Evaluate on the first NUMBER_OF_SAMPLES examples of the test split
NUMBER_OF_SAMPLES = 100
test_dataset = dataset['test'].select(range(NUMBER_OF_SAMPLES))

# Generate one prediction per example, collecting inputs, predictions and
# reference comments in parallel lists (same order as test_dataset)
input_texts = []
predictions = []
references = []
for sample in tqdm(test_dataset):
    input_text, prediction, reference = evaluate_model(peft_model, sample)
    input_texts.append(input_text)
    predictions.append(prediction)
    references.append(reference)

100%|██████████| 100/100 [09:46<00:00,  5.86s/it]


In [None]:
# Compute ROUGE scores for the generated comments against the references.
# (The result variable keeps its original name `rogue` so that any cell
# reading it continues to work.)
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# Scores as percentage strings with two decimals; this dict is what gets
# written to the results file.
rouge_scores = {
    'ROUGE-1': f"{rogue['rouge1'] * 100:.2f}%",
    'ROUGE-2': f"{rogue['rouge2'] * 100:.2f}%",
    'ROUGE-L': f"{rogue['rougeL'] * 100:.2f}%",
    'ROUGE-Lsum': f"{rogue['rougeLsum'] * 100:.2f}%"
}

# Print the already-formatted values so the displayed numbers match the saved
# ones (the previous `:2f` spec set a field width, not two decimal places).
for score_name, score_value in rouge_scores.items():
    print(f"{score_name}: {score_value}")

Rogue1: 22.199543%
rouge2: 2.807130%
rougeL: 14.382411%
rougeLsum: 15.763821%


In [None]:
import json

# INSTRUCTION_PREFIX is deliberately NOT redefined here: the saved metadata
# must record the prompt the predictions above were actually generated with,
# and overwriting the global would change the prompt for any cell re-run later.

# One entry per evaluated sample, keyed "Sample 1", "Sample 2", ...
results = {
    f'Sample {index}': {'Input Text': input_text, 'Prediction': prediction, 'Reference': reference}
    for index, (input_text, prediction, reference) in enumerate(zip(input_texts, predictions, references), start=1)
}

final_output = {
    'Instruction Prefix': INSTRUCTION_PREFIX,
    'ROUGE Scores': rouge_scores,
    'Results': results,
}

# Persist the prompt, scores and per-sample outputs to Google Drive
with open('/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning/flanT5_xxl_400_samples_training_100_testing_samples_results.json', 'w') as file:
    json.dump(final_output, file, indent=4)
