# Fine-tuning the LLM
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and important libraries

In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
import nltk
import torch

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
from evaluate import load

from pynvml import *

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Load the preprocessed version of the dataset, then report its columns and shape.
df = pd.read_csv("./dataset/BG3_reviews_preprocessed.csv")
print(df.columns, df.shape, sep="\n")

Index(['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score',
       'word_count', 'profanity'],
      dtype='object')
(10000, 7)


## Model

In [4]:
# Checkpoint name shared by the tokenizer and the model so the two stay in sync.
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)  # move the weights onto the selected device

In [4]:
# Load model if it already exists
# model = AutoModelForCausalLM.from_pretrained("./model/v3").to(device)
# tokenizer = AutoTokenizer.from_pretrained("./model/v3")

## Tokenization

In [5]:
# Reuse the end-of-sequence token as the padding token so batches can be padded
# (the data collator built later pads each batch to a common length).
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Wrap the DataFrame in a `datasets.Dataset`; every DataFrame column becomes a feature.
train_dataset = Dataset.from_pandas(df)
train_dataset

Dataset({
    features: ['review', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score', 'word_count', 'profanity'],
    num_rows: 10000
})

In [6]:
class TokenizerWrapper:
    """Bundle a tokenizer with its truncation settings for use with ``Dataset.map``.

    Args:
        tokenizer: Callable tokenizer applied to the text column.
        max_length: Maximum number of tokens kept per example (default 512).
        text_column: Key of the field holding the text to tokenize (default "review").
    """

    def __init__(self, tokenizer, max_length=512, text_column="review"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column

    def tokenize_function(self, examples):
        """Tokenize ``examples[text_column]``, truncating to ``max_length`` tokens.

        Args:
            examples: Mapping that contains the configured text column.

        Returns:
            Whatever the wrapped tokenizer returns for the given text.
        """
        # Set on every call so overlong texts lose their tail, not their head,
        # regardless of how the tokenizer was configured before.
        self.tokenizer.truncation_side = "right"

        return self.tokenizer(
            examples[self.text_column],
            max_length=self.max_length,
            truncation=True,
        )

In [7]:
# Tokenize every row with 4 worker processes. All original columns are dropped,
# so only the tokenizer's outputs remain in the resulting dataset.
tokenizer_wrapper = TokenizerWrapper(tokenizer)

tokenized_dataset = train_dataset.map(
    tokenizer_wrapper.tokenize_function,
    remove_columns=train_dataset.column_names,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
# def group_texts(examples):
#     block_size = 128
#     # Concatenate all texts.
#     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
#     total_length = len(concatenated_examples[list(examples.keys())[0]])

#     # We drop the small remainder
#     if total_length >= block_size:
#         total_length = (total_length // block_size) * block_size

#     # Split by chunks of block_size.
#     result = {
#         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
#         for k, t in concatenated_examples.items()
#     }
#     result["labels"] = result["input_ids"].copy()
#     return result

# tokenized_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

In [9]:
# Inspect the tokenized dataset (the original columns were removed in the map call).
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 10000
})

## Fine-tune model

In [10]:
# mlm=False turns off masked-language-model masking, i.e. the collator prepares
# batches for causal (next-token) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean prediction length for evaluation output.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be token ids of
            shape (batch, seq), raw logits of shape (batch, seq, vocab), or a tuple whose
            first element is one of those. ``labels`` are token ids in which -100 marks
            positions to ignore.

    Returns:
        dict: every key returned by the ROUGE metric (scaled by 100) plus ``gen_len``,
        each rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return a tuple of outputs; the first entry is assumed to hold the logits/ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        # Raw causal-LM logits: pick the most likely token id, then shift so the prediction
        # made at position t is compared with the label at position t + 1.
        # NOTE(review): full logits are large; consider `preprocess_logits_for_metrics`
        # on the Trainer to do the argmax before predictions are accumulated.
        predictions = predictions.argmax(axis=-1)[:, :-1]
        labels = labels[:, 1:]
        # Blank out predictions wherever the label is ignored (e.g. padding).
        predictions = np.where(labels != -100, predictions, tokenizer.pad_token_id)

    # Replace -100 in predictions and labels as we can't decode it.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Express the scores as percentages
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (number of non-padding tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
def print_trainable_parameters(model):
    """Print the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()`` that yields ``(name, param)``
            pairs, where each ``param`` has ``numel()`` and ``requires_grad``.

    Prints the trainable count, the total count, and the trainable share in percent.
    A model without any parameters reports 0.0% instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division so an empty model does not crash the report.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


In [13]:
# Parameter counts before the LoRA adapters are attached in the next cell.
print_trainable_parameters(model)

trainable params: 0 || all params: 125029632 || trainable%: 0.0


In [14]:
# LoRA settings: rank-16 adapters (alpha 32, dropout 0.05) on the `c_attn` modules,
# with bias terms left untouched, for a causal language-modeling task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["c_attn"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters are now trainable.
# NOTE: this rebinds `model`, so re-running the cell wraps an already wrapped model.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 125029632 || trainable%: 0.4717473694555863


In [15]:
# Training hyperparameters: learning rate, per-device batch size, number of epochs.
lr, batch_size, num_epochs = 1e-3, 4, 5

In [16]:
# Trainer configuration built from the hyperparameters defined above; anything
# not listed here keeps the TrainingArguments default.
training_args = TrainingArguments(
    output_dir="gpt2-lora-review_generation",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=lr,
    weight_decay=0.01,
)

In [17]:
# Build the Trainer around the LoRA-wrapped model and the tokenized reviews.
# NOTE(review): no eval_dataset is supplied, so compute_metrics is presumably never
# invoked during train() — confirm, and pass an eval split if metrics are wanted.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

# Train the model; the returned object carries the run metrics read by print_summary.
results = trainer.train()

  0%|          | 0/12500 [00:00<?, ?it/s]

Checkpoint destination directory gpt2-lora-review_generation\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.9106, 'learning_rate': 9.6e-05, 'epoch': 0.2}


Checkpoint destination directory gpt2-lora-review_generation\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.7341, 'learning_rate': 9.200000000000001e-05, 'epoch': 0.4}
{'loss': 3.7167, 'learning_rate': 8.800000000000001e-05, 'epoch': 0.6}
{'loss': 3.6707, 'learning_rate': 8.4e-05, 'epoch': 0.8}
{'loss': 3.6504, 'learning_rate': 8e-05, 'epoch': 1.0}
{'loss': 3.6355, 'learning_rate': 7.6e-05, 'epoch': 1.2}
{'loss': 3.6331, 'learning_rate': 7.2e-05, 'epoch': 1.4}
{'loss': 3.6378, 'learning_rate': 6.800000000000001e-05, 'epoch': 1.6}
{'loss': 3.5981, 'learning_rate': 6.400000000000001e-05, 'epoch': 1.8}
{'loss': 3.6129, 'learning_rate': 6e-05, 'epoch': 2.0}
{'loss': 3.5854, 'learning_rate': 5.6000000000000006e-05, 'epoch': 2.2}
{'loss': 3.598, 'learning_rate': 5.2000000000000004e-05, 'epoch': 2.4}
{'loss': 3.5874, 'learning_rate': 4.8e-05, 'epoch': 2.6}
{'loss': 3.5846, 'learning_rate': 4.4000000000000006e-05, 'epoch': 2.8}
{'loss': 3.5922, 'learning_rate': 4e-05, 'epoch': 3.0}
{'loss': 3.5474, 'learning_rate': 3.6e-05, 'epoch': 3.2}
{'loss': 3.583, 'learning_rate': 3.2000000000000005

## Results

In [25]:
def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the device
            that was previously hard-coded.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # `info.used` is in bytes; integer-divide by 1024**2 to report whole megabytes.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so NVML is released even on error.
        nvmlShutdown()


def print_summary(result):
    """Print training runtime, throughput and current GPU memory usage.

    Args:
        result: Object whose ``metrics`` mapping contains ``train_runtime`` and
            ``train_samples_per_second``.
    """
    stats = result.metrics

    runtime = stats['train_runtime']
    print(f"Time: {runtime:.2f}")

    throughput = stats['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")

    print_gpu_utilization()

In [26]:
# Report how much GPU memory is currently occupied.
print_gpu_utilization()


GPU memory occupied: 6541 MB.


In [27]:
# Summarize the training run stored in `results` (returned by trainer.train()).
print_summary(results)

Time: 2613.57
Samples/second: 19.13
GPU memory occupied: 6547 MB.


In [21]:
# Save the model and the tokenizer side by side in the same directory, so both can
# be reloaded from that one path (see the commented-out loading cell near the top).
model.save_pretrained("./model/v4")
tokenizer.save_pretrained("./model/v4")

('./model/v4\\tokenizer_config.json',
 './model/v4\\special_tokens_map.json',
 './model/v4\\vocab.json',
 './model/v4\\merges.txt',
 './model/v4\\added_tokens.json',
 './model/v4\\tokenizer.json')

## Generate Reviews

In [22]:
from transformers.utils import logging
import transformers

# Only show transformers log messages at ERROR level, which hides warnings
# during the generation calls below.
logging.set_verbosity(transformers.logging.ERROR)

In [23]:
# Sample one unprompted sequence of at most 100 tokens using top-k (50) and
# nucleus (0.95) sampling, then decode it without special tokens.
generated_text = model.generate(
    do_sample=True,
    top_k=50,
    top_p=0.95,
    max_length=100,
)
print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

 the best game i've played. most likely the best and most played so far and i think i will continue to play this game ever since i found i could make an honest living from what i have made and it will be a wonderful experience for everyone and can't wait for the final release. for the true epic, i cant wait to see it. that's good in my book!  great for people who would like to see a game that is truly unique in one of those rare times


In [28]:
# Continue a prompt with the fine-tuned model.
prompt = 'i think that'
# Keep the full encoding (not just the ids) so the attention mask can be passed along.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
inputs = encoded.input_ids
generated_text = model.generate(
    input_ids=inputs,
    # Pass the mask and padding id explicitly instead of relying on generate()'s
    # implicit fallbacks, whose warnings are hidden by the logging setup.
    attention_mask=encoded.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

i think that the voice acting in baldurs gate 3 is so good it's worth it, the characters are engaging, and the story is all the more amazing for its story.   i'm just starting out and not the one i'm hoping for in the future. this version seems to have gotten me thinking so much that it was only in early access that i played the beta and got the game's release out in early access.  if you're thinking of playing baldurs gate 3 and
