# Fine-tuninig the LLM Model
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and important libraries

In [34]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
import nltk
import torch

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
from evaluate import load

from pynvml import *

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
df = pd.read_csv("./dataset/BG3_reviews_preprocessed.csv")  # load the preprocessed version of the dataset
print(df.columns)
print(df.shape)

Index(['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score',
       'word_count', 'profanity'],
      dtype='object')
(10000, 7)


## Model

In [4]:
model_name = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_name, ).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# # Load model if it alrady exists
# model = AutoModelForCausalLM.from_pretrained("./model/v2").to(device)
# tokenizer = AutoTokenizer.from_pretrained("./model/v2")

## Tokenization

In [6]:
# add pad token if none exists
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))

tokenizer.pad_token = tokenizer.eos_token

In [7]:
train_dataset = Dataset.from_pandas(df)
train_dataset

Dataset({
    features: ['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score', 'word_count', 'profanity'],
    num_rows: 10000
})

In [8]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        self.tokenizer.truncation_side = "right"

        return self.tokenizer(
            examples["review"],
            max_length=512,
            truncation=True,
        )


In [9]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

tokenized_dataset = train_dataset.map(
    tokenizer_wrapper.tokenize_function,
    num_proc=4,
    remove_columns=train_dataset.column_names)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 10000
})

In [11]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

# Fine-tune model

In [12]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
metric = load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [14]:
def print_trainable_parameters(model):

    # Prints the number of trainable parameters in the model.

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [15]:
print_trainable_parameters(model)

trainable params: 124439808 || all params: 124439808 || trainable%: 100.0


In [16]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 125029632 || trainable%: 0.4717473694555863




In [17]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 5

In [18]:
training_args = TrainingArguments(
    output_dir= "gpt2-lora-review_generation",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # save_strategy="epoch",
    # load_best_model_at_end=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=tokenized_dataset["train"],
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

# train model
results = trainer.train()

  0%|          | 0/14710 [00:00<?, ?it/s]

Checkpoint destination directory gpt2-lora-review_generation\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.8665, 'learning_rate': 0.0009660095173351462, 'epoch': 0.17}
{'loss': 3.7741, 'learning_rate': 0.0009320190346702923, 'epoch': 0.34}
{'loss': 3.7302, 'learning_rate': 0.0008980285520054386, 'epoch': 0.51}
{'loss': 3.6985, 'learning_rate': 0.0008640380693405847, 'epoch': 0.68}
{'loss': 3.6728, 'learning_rate': 0.0008300475866757308, 'epoch': 0.85}
{'loss': 3.671, 'learning_rate': 0.0007960571040108769, 'epoch': 1.02}
{'loss': 3.6519, 'learning_rate': 0.0007620666213460231, 'epoch': 1.19}
{'loss': 3.6329, 'learning_rate': 0.0007280761386811693, 'epoch': 1.36}
{'loss': 3.6357, 'learning_rate': 0.0006940856560163155, 'epoch': 1.53}
{'loss': 3.6237, 'learning_rate': 0.0006600951733514616, 'epoch': 1.7}
{'loss': 3.608, 'learning_rate': 0.0006261046906866078, 'epoch': 1.87}
{'loss': 3.597, 'learning_rate': 0.0005921142080217539, 'epoch': 2.04}
{'loss': 3.5779, 'learning_rate': 0.0005581237253569001, 'epoch': 2.21}
{'loss': 3.5726, 'learning_rate': 0.0005241332426920462, 'epoch': 2.

## Results

In [20]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [23]:
print_gpu_utilization()


GPU memory occupied: 535 MB.


In [22]:
print_summary(results)

Time: 1773.65
Samples/second: 33.17
GPU memory occupied: 2180 MB.


In [24]:
# save model parameters
model.save_pretrained("./model/v2")
tokenizer.save_pretrained("./model/v2")

('./model/v2\\tokenizer_config.json',
 './model/v2\\special_tokens_map.json',
 './model/v2\\vocab.json',
 './model/v2\\merges.txt',
 './model/v2\\added_tokens.json',
 './model/v2\\tokenizer.json')

# Generate Reviews

In [30]:
generated_text = model.generate(max_length=100, do_sample=True, top_k=50, top_p=0.95)
print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



i just have one review for this game that has me wondering how it looks, and in the end it feels like the game is more finished than i expected. i have already got into it and am still in early access so it is buggy, and it seems like this isn't going to be the final full release or release. i don't have a large budget, or something to play with, but the devs really managed to put this game through their paces with amazing design and gameplay


In [32]:
prompt = 'i think that'
inputs = tokenizer(prompt, return_tensors="pt").input_ids
inputs.to(device)
generated_text = model.generate(input_ids= inputs, max_length=100, do_sample=True, top_k=50, top_p=0.95)

print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


i think that it really is what they want to do. the music is so great (i have played over 70 hours), the voice actors are so much fun and are making you cry.  that is what these games need to be. they just need to do more with their time  but it isn't enough for a studio. they can have a good time right? at least this is the way they are going.a perfect example of the kind of thing you can expect if you buy a


In [None]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
# Perplexity: 49.61