# Fine-tuninig the LLM Model
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and important libraries

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, get_scheduler

import pandas as pd
import numpy as np

# from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
df = pd.read_csv("dataset/BG3_reviews_preprocessed.csv")  # load the preprocessed version of the dataset
print(df.columns)
print(df.shape)

## Model

In [3]:
model_name = 'distilbert-base-uncased'
model = AutoModelForCausalLM.from_pretrained(model_name)

ValueError: Unrecognized configuration class <class 'transformers.models.distilbert.configuration_distilbert.DistilBertConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

## Tokenization

In [None]:
class CustomDataset(Dataset):
    def __init__(self, text_list, tokenizer, max_length=512):
        self.input_ids = tokenizer(text_list, truncation=True, max_length=max_length, return_tensors="pt", padding=True)

    def __len__(self):
        return len(self.input_ids["input_ids"])

    def __getitem__(self, idx):
        return {"input_ids": self.input_ids["input_ids"][idx]}

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
batch_size = 128

train_dataset = CustomDataset(df['review'], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# val_dataset = CustomDataset(val_text_list, tokenizer)
# val_loader = DataLoader(val_dataset, batch_size=your_batch_size, shuffle=False)

## Train and Evaluation Functions

In [None]:
def train(model, lr, num_epochs):
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_scheduler("linear", optimizer)

    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        for batch in train_loader:
            inputs = batch["input_ids"].to(device)
            outputs = model(inputs, labels=inputs)
            loss = outputs.loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}")


In [None]:
# add evaluation functions here - perplexity, WER, GLUE, ...

# Fine-tune model

In [None]:
# save model parameters

model.save_pretrained("your_fine_tuned_model_directory")
tokenizer.save_pretrained("your_fine_tuned_model_directory")

# Generate Reviews

In [None]:
generated_text = model.generate(max_length=100)
print()

In [None]:
# other method (?)

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# # add pad token if none exists
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))
# # create tokenize function
# def tokenize_function(examples):
#     # extract text
#     text = examples["text"]

#     #tokenize and truncate text
#     tokenizer.truncation_side = "left"
#     tokenized_inputs = tokenizer(
#         text,
#         return_tensors="np",
#         truncation=True,
#         max_length=512
#     )

#     return tokenized_inputs
# # tokenize training and validation datasets
# tokenized_dataset = dataset.map(tokenize_function, batched=True)
# tokenized_dataset
# # create data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# other method (?)

# peft_config = LoraConfig(task_type="SEQ_CLS",
#                         r=4,
#                         lora_alpha=32,
#                         lora_dropout=0.01,
#                         target_modules = ['q_lin'])
# peft_config
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()
# # hyperparameters
# lr = 1e-3
# batch_size = 4
# num_epochs = 10
# # define training arguments
# training_args = TrainingArguments(
#     output_dir= model_checkpoint + "-lora-text-classification",
#     learning_rate=lr,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=num_epochs,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )
# # creater trainer object
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"],
#     tokenizer=tokenizer,
#     data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
#     compute_metrics=compute_metrics,
# )

# # train model
# trainer.train()