# Fine-tuninig the LLM Model
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and important libraries

In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
import nltk
import torch

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
from evaluate import load

from pynvml import *

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
df = pd.read_csv("./dataset/BG3_reviews_preprocessed.csv")  # load the preprocessed version of the dataset
print(df.columns)
print(df.shape)

Index(['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score',
       'word_count', 'profanity'],
      dtype='object')
(10000, 7)


## Model

In [4]:
model_name = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
# Load model if it alrady exists
model = AutoModelForCausalLM.from_pretrained("./model/v2").to(device)
tokenizer = AutoTokenizer.from_pretrained("./model/v2")

## Tokenization

In [13]:
tokenizer.pad_token = tokenizer.eos_token

In [14]:
train_dataset = Dataset.from_pandas(df)
train_dataset

Dataset({
    features: ['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score', 'word_count', 'profanity'],
    num_rows: 10000
})

In [15]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        self.tokenizer.truncation_side = "right"

        return self.tokenizer(
            examples["review"],
            max_length=512,
            truncation=True,
        )

In [28]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

tokenized_dataset = train_dataset.map(
    tokenizer_wrapper.tokenize_function,
    num_proc=4,
    remove_columns=train_dataset.column_names)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [43]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # We drop the small remainder
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [44]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11767
})

# Fine-tune model

In [45]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [46]:
metric = load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [47]:
def print_trainable_parameters(model):

    # Prints the number of trainable parameters in the model.

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [48]:
print_trainable_parameters(model)

trainable params: 589824 || all params: 125029632 || trainable%: 0.4717473694555863


In [49]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 125029632 || trainable%: 0.4717473694555863


In [51]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 5

In [52]:
training_args = TrainingArguments(
    output_dir= "gpt2-lora-review_generation",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
)

In [53]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

# train model
results = trainer.train()

  0%|          | 0/14710 [00:00<?, ?it/s]

{'loss': 4.273, 'learning_rate': 1.3790754496286927e-05, 'epoch': 0.17}
{'loss': 4.1017, 'learning_rate': 1.330550627333524e-05, 'epoch': 0.34}
{'loss': 4.0187, 'learning_rate': 1.2820258050383554e-05, 'epoch': 0.51}
{'loss': 3.9618, 'learning_rate': 1.2335009827431867e-05, 'epoch': 0.68}
{'loss': 3.929, 'learning_rate': 1.184976160448018e-05, 'epoch': 0.85}
{'loss': 3.9262, 'learning_rate': 1.1364513381528493e-05, 'epoch': 1.02}
{'loss': 3.9182, 'learning_rate': 1.0879265158576808e-05, 'epoch': 1.19}
{'loss': 3.8911, 'learning_rate': 1.0394016935625122e-05, 'epoch': 1.36}
{'loss': 3.8867, 'learning_rate': 9.908768712673435e-06, 'epoch': 1.53}
{'loss': 3.8707, 'learning_rate': 9.42352048972175e-06, 'epoch': 1.7}
{'loss': 3.8624, 'learning_rate': 8.938272266770063e-06, 'epoch': 1.87}
{'loss': 3.8538, 'learning_rate': 8.453024043818377e-06, 'epoch': 2.04}
{'loss': 3.8538, 'learning_rate': 7.967775820866688e-06, 'epoch': 2.21}
{'loss': 3.8437, 'learning_rate': 7.482527597915003e-06, 'epoc

## Results

In [54]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [55]:
print_gpu_utilization()


GPU memory occupied: 2460 MB.


In [56]:
print_summary(results)

Time: 964.17
Samples/second: 61.02
GPU memory occupied: 2468 MB.


In [59]:
# save model parameters
model.save_pretrained("./model/v3")
tokenizer.save_pretrained("./model/v3")

('./model/v3\\tokenizer_config.json',
 './model/v3\\special_tokens_map.json',
 './model/v3\\vocab.json',
 './model/v3\\merges.txt',
 './model/v3\\added_tokens.json',
 './model/v3\\tokenizer.json')

# Generate Reviews

In [57]:
generated_text = model.generate(max_length=100, do_sample=True, top_k=50, top_p=0.95)
print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



1 player, 3.5 hours of play. great game for a game like this, and i've played for years. its so simple. for the most people, you have a good, clear cut story, beautiful world, lots of quests and interesting character choices, many of the most important stuff that you're probably not used to. i'm so surprised this is even being released here. the best part is that it is currently priced at $29.99 for a 5 game game


In [58]:
prompt = 'i think that'
inputs = tokenizer(prompt, return_tensors="pt").input_ids
inputs = inputs.to(device)
generated_text = model.generate(input_ids=inputs, max_length=100, do_sample=True, top_k=50, top_p=0.95)

print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


i think that a person of the right age in a medium game of this quality, should make a real decision about the way they play.  of course they are going to like the games and we cannot imagine that any other single person would want to play as a young player.  and the game is great, so to say it was not a "larian" game was grossly under appreciated by me as the game is quite an interesting one and it seems like they could have put their money where


In [None]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
# Perplexity: 49.61