<a href="https://colab.research.google.com/github/MattBoraske/Reddit_AITA_LLMs/blob/main/LLAMA_2_AITA_Peft_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformers accelerate torch evaluate datasets rouge_score peft bitsandbytes tensorboard py7zr

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manyli

In [None]:
# mount gdrive to save results
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning

/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning


In [None]:
from datasets import load_dataset

dataset = load_dataset("MattBoraske/AITA_subreddit_submissions_flanT5_filtered")

In [None]:
AITA_classifications = [
    "\nYou're The A**hole (YTA) when the first person (you, the writer of the conflict context) is causing the conflict.",
    "\nNot The A**hole (NTA) when a third person party (anyone but the writer of the conflict context) is causing the conflict.",
    "\nNo A**holes Here (NAH) when no parties are causing the conflict.",
    "\nEveryone Sucks Here (ESH) when all parties are causing the conflict.",
    "\nMore Information Needed (INFO) when a classification can not be classified using the conflict context."
]

INSTRUCTION_PREFIX = f"Classify the interpersonal conflict into one of the following categories and provide a justification for your choice. The categories are: {''.join(AITA_classifications)}\n\nConflict Context: "

In [None]:
def preprocess_function(sample):
    # add prefix to the input for t5
    inputs = [INSTRUCTION_PREFIX + item for item in sample["submission_text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=2048, padding='max_length', truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["top_comment_1"], max_length=512, padding='max_length', truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")

pad_token = '[PAD]'
if tokenizer.pad_token != pad_token:
    tokenizer.add_special_tokens({'pad_token': pad_token})
tokenizer.padding_side = "left"

In [None]:
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-13b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["submission_title", "submission_text", "decision", "submission_score", "submission_url", "submission_date", "top_comment_1", 'top_comment_2', 'top_comment_3', 'top_comment_4', 'top_comment_5', 'top_comment_6', 'top_comment_7', 'top_comment_8', 'top_comment_9', 'top_comment_10', 'top_comment_1_classification', 'top_comment_2_classification', 'top_comment_3_classification', 'top_comment_4_classification', 'top_comment_5_classification', 'top_comment_6_classification', 'top_comment_7_classification', 'top_comment_8_classification', 'top_comment_9_classification', 'top_comment_10_classification', 'ambiguity_score'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


### Load model and LORA config

In [None]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
import torch

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf", device_map = "auto", quantization_config=quant_config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 5120)

In [None]:
import torch

def get_model_memory_size(model):
    total_size = 0
    for param in model.parameters():
        # param.nelement() gives the total number of elements in the tensor,
        # param.element_size() gives the size in bytes of each element in the tensor.
        total_size += param.nelement() * param.element_size()
    return total_size

def get_model_memory_size_gb(model):
    total_size_bytes = get_model_memory_size(model)
    total_size_gb = total_size_bytes / (1024 ** 3)  # Convert bytes to gigabytes
    return total_size_gb

print(f"Model memory size: {get_model_memory_size_gb(model)} GB")

Model memory size: 6.519346237182617 GB


In [None]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["self_attn.q_proj", "self_attn.v_proj"],  # Adjusted target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
peft_model.to("cuda")



trainable params: 13,107,200 || all params: 13,028,981,760 || trainable%: 0.100600340390683


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 5120)
        (layers): ModuleList(
          (0-39): 40 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=F

In [None]:
from transformers import DataCollatorForLanguageModeling

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = 32000
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

In [None]:
data_collator

DataCollatorForLanguageModeling(tokenizer=LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-13b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=8, tf_experimental_compile=False, return_tensors='pt')

In [None]:
first_1000_samples = tokenized_dataset['train'].select(range(1000))

In [None]:
first_1000_samples

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [None]:
from transformers import Trainer, TrainingArguments

output_dir="/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning/aita-llama2-13b-chat-hf-peft"

# Define training args
training_args = TrainingArguments(
    output_dir=output_dir,
		auto_find_batch_size=True,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Trainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=first_1000_samples,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# train model
trainer.train()



Step,Training Loss
20,1.7903
40,1.6084
60,1.6018
80,1.5481
100,1.5525
120,1.5468


TrainOutput(global_step=125, training_loss=1.6069419364929198, metrics={'train_runtime': 9463.2796, 'train_samples_per_second': 0.106, 'train_steps_per_second': 0.013, 'total_flos': 1.5808679903232e+17, 'train_loss': 1.6069419364929198, 'epoch': 1.0})

In [None]:
MODEL_NAME = "llama2-13b-chat-hf-AITA-peft-adaptor"
peft_model.push_to_hub(f"MattBoraske/{MODEL_NAME}")



adapter_model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MattBoraske/llama2-13b-chat-hf-AITA-peft-adaptor/commit/5cc4f9f760ec8f2f9474183fb94d73625ef0b61c', commit_message='Upload model', commit_description='', oid='5cc4f9f760ec8f2f9474183fb94d73625ef0b61c', pr_url=None, pr_revision=None, pr_num=None)

## Testing

In [None]:
peft_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 5120)
        (layers): ModuleList(
          (0-39): 40 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=F

In [None]:
INSTRUCTION_PREFIX

"Classify the interpersonal conflict into one of the following categories and provide a justification for your choice. The categories are: \nYou're The A**hole (YTA) when the first person (you, the writer of the conflict context) is causing the conflict.\nNot The A**hole (NTA) when a third person party (anyone but the writer of the conflict context) is causing the conflict.\nNo A**holes Here (NAH) when no parties are causing the conflict.\nEveryone Sucks Here (ESH) when all parties are causing the conflict.\nMore Information Needed (INFO) when a classification can not be classified using the conflict context.\n\nConflict Context: "

### Single sample test

In [None]:
from random import randrange
sample = dataset['test'][randrange(len(dataset["test"]))]
sample

{'submission_title': "AITA for refusing to have anything to do with my sister's wedding bc of who her fiancé was as a teenager?",
 'submission_text': 'i\'ll keep this as short as i can my [28m] twin sister [28f] recently got engaged to a guy [30m] who i\'ll call terry. in secondary school terry was a year above us and he and his friends bullied me *relentlessly*. i\'ll spare you the finer details, but i\'m talking "caused me lifelong psychological issues" type of shit. granted it was never physical but it was a literal daily onslaught of psychological and verbal abuse, to the point where i bunked off probably half of years 9 and 10 just so i could avoid it. i\'m not a hateful or spiteful person, but i absolutely despise terry for how he treated me when we were kids. i was hurt when my sister started dating him two years ago, but i tolerate being around him when i absolutely have to for the sake of keeping the peace. \n\nhowever, i will not be attending their wedding or having anything 

In [None]:
input_ids = tokenizer(INSTRUCTION_PREFIX + sample["submission_text"], return_tensors="pt").input_ids.cuda()

In [None]:
outputs = peft_model.generate(input_ids=input_ids, max_new_tokens=256, do_sample=True, top_p=0.9)


In [None]:
print(INSTRUCTION_PREFIX + sample["submission_text"])

Classify the interpersonal conflict into one of the following categories and provide a justification for your choice. The categories are: 
You're The A**hole (YTA) when the first person (you, the writer of the conflict context) is causing the conflict.
Not The A**hole (NTA) when a third person party (anyone but the writer of the conflict context) is causing the conflict.
No A**holes Here (NAH) when no parties are causing the conflict.
Everyone Sucks Here (ESH) when all parties are causing the conflict.
More Information Needed (INFO) when a classification can not be classified using the conflict context.

Conflict Context: i'll keep this as short as i can my [28m] twin sister [28f] recently got engaged to a guy [30m] who i'll call terry. in secondary school terry was a year above us and he and his friends bullied me *relentlessly*. i'll spare you the finer details, but i'm talking "caused me lifelong psychological issues" type of shit. granted it was never physical but it was a literal 

In [None]:
print(f"Output:\n{tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)}")

Output:
Classify the interpersonal conflict into one of the following categories and provide a justification for your choice. The categories are: 
You're The A**hole (YTA) when the first person (you, the writer of the conflict context) is causing the conflict.
Not The A**hole (NTA) when a third person party (anyone but the writer of the conflict context) is causing the conflict.
No A**holes Here (NAH) when no parties are causing the conflict.
Everyone Sucks Here (ESH) when all parties are causing the conflict.
More Information Needed (INFO) when a classification can not be classified using the conflict context.

Conflict Context: i'll keep this as short as i can my [28m] twin sister [28f] recently got engaged to a guy [30m] who i'll call terry. in secondary school terry was a year above us and he and his friends bullied me *relentlessly*. i'll spare you the finer details, but i'm talking "caused me lifelong psychological issues" type of shit. granted it was never physical but it was a 

### Complete Testing Loop

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric
metric = evaluate.load("rouge")

def evaluate_model(model, sample):

    # tokenize input
    input_text = sample["submission_text"]
    input_ids = tokenizer(INSTRUCTION_PREFIX + input_text, max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE, return_tensors="pt", truncation=True).input_ids.cuda()

    # generate and decode prediction
    outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_new_tokens=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # get label
    label = sample['top_comment_1']

    # return prediction and label
    return input_text, prediction, label

In [None]:
# load first N samples in test dataset
NUMBER_OF_SAMPLES = 100
test_dataset = dataset['test'].select(range(NUMBER_OF_SAMPLES))

# run predictions
input_texts, predictions, references = [] , [], []
for sample in tqdm(test_dataset):
    i,p,l = evaluate_model(peft_model, sample)
    input_texts.append(i)
    predictions.append(p)
    references.append(l)

100%|██████████| 100/100 [03:28<00:00,  2.08s/it]


In [None]:
# Compute ROGUE scores
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

rouge_scores = {
    'ROUGE-1': f"{rogue['rouge1'] * 100:.2f}%",
    'ROUGE-2': f"{rogue['rouge2'] * 100:.2f}%",
    'ROUGE-L': f"{rogue['rougeL'] * 100:.2f}%",
    'ROUGE-Lsum': f"{rogue['rougeLsum'] * 100:.2f}%"
}

print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")

Rogue1: 20.448413%
rouge2: 2.052818%
rougeL: 13.305291%
rougeLsum: 14.592986%


In [None]:
import json

INSTRUCTION_PREFIX = "Classify the interpersonal conflict into one of the following categories. 'YTA' when the writer is causing the conflict. 'NTA' when the other person is causing the conflict. 'NAH' when both the writer and other person are not causing the conflict. 'ESH' when both the writer and other person are causing the conflict. 'INFO' if more information is needed for a judgement. Then, provide a short justification: "

results = {}
for i, (input_text, prediction, reference) in enumerate(zip(input_texts, predictions, references)):
    results[f'Sample {i+1}'] = {'Input Text': input_text, 'Prediction': prediction, 'Reference': reference}

final_output = {
    'Instruction Prefix': INSTRUCTION_PREFIX,
    'ROUGE Scores': rouge_scores,
    'Results': results,
}

with open('/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning/flanT5_small_peft_10000_training_samples_testing_100_samples_results.json', 'w') as file:
    json.dump(final_output, file, indent=4)
