# Code for Fine-Tuning LLMs

## Install necessary libraries

In [None]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install pandas
!pip install torch
!pip install requests
!pip install trl
!pip install tqdm
!pip install datasets
!pip install transformers

## Import necessary libraries

In [None]:
import re
import os
import torch
import pyperclip
import requests
import pandas as pd
from tqdm.auto import tqdm
from trl import SFTTrainer
from datetime import datetime
from google.colab import drive
from datasets import Dataset, load_dataset
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import TrainingArguments, EarlyStoppingCallback

In [None]:
# Mount Google Drive at /content/drive to access files and save results.
# force_remount=True mounts again even when the drive is already mounted (e.g. when this cell is re-run).

drive.mount('/content/drive', force_remount=True)

In [None]:
############ UNCOMMENT THE CODE BELOW TO ENABLE TELEGRAM NOTIFICATIONS #############

# api_token = ""  # Provide your Telegram bot API token here
# chat_id = ""  # Provide the chat ID where notifications should be sent

In [None]:
############ UNCOMMENT AND USE THE FUNCTION BELOW TO ENABLE TELEGRAM NOTIFICATIONS #############

# def notify(text='Cell execution completed.'):
#     requests.post('https://api.telegram.org/' + 'bot{}/sendMessage'.format(api_token), params=dict(chat_id=chat_id, text=text))

## Load Model

##### Pre-trained Model and Tokenizer from Hugging Face

The pre-trained model and tokenizer will be loaded from [Hugging Face](https://huggingface.co/)

In [None]:
# Identifier of the pre-trained model passed to FastLanguageModel.from_pretrained
# in the next cell; replace the placeholder before running.
model_name = "your_model_name_here"

In [None]:
# Load the base model and its tokenizer in 4-bit precision.
# max_seq_length matches the value given to SFTTrainer later in the notebook.
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=1024,
        dtype=None,
        load_in_4bit=True,
    )
except Exception as e:
    print(f"Error occurred while loading the model or tokenizer: {e}")
    # Uncomment the line below to enable Telegram notifications for errors during model or tokenizer loading
    # notify(f"Error occurred while loading the model or tokenizer: {e}")

    # Re-raise after reporting: without `model` and `tokenizer` every later cell
    # would fail with an unrelated NameError, hiding the real cause.
    raise
else:
    print(f"Model: {model_name} and tokenizer loaded successfully.")
    # Uncomment the line below to enable Telegram notifications when the model and tokenizer are loaded
    # notify(f"Model: {model_name} and tokenizer loaded successfully.")

##### Apply LoRA (Low-Rank Adaptation) to the loaded model for fine-tuning

In [None]:
# LoRA settings gathered in one mapping; each entry is forwarded unchanged as a
# keyword argument to FastLanguageModel.get_peft_model.
lora_options = dict(
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_options)

## Load Dataset

In [None]:
def load_csv_to_dataset(file_path):
    """Read a CSV file into a pandas DataFrame.

    Despite the name, the result is a ``pandas.DataFrame``; the notebook converts
    it to a ``datasets.Dataset`` in a later step.

    Args:
        file_path: Path of the CSV file, as accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    # pd.read_csv already returns a DataFrame, so no extra wrapping is needed.
    return pd.read_csv(file_path)

In [None]:
# Read the train / validation / test CSV files of PEM-Rel-8k into DataFrames.
dataset_dir = "../datasets/PEM-Rel-8k"
train_dataset = load_csv_to_dataset(dataset_dir + "/PEM-Rel-8k-Train.csv")
val_dataset = load_csv_to_dataset(dataset_dir + "/PEM-Rel-8k-Val.csv")
test_dataset = load_csv_to_dataset(dataset_dir + "/PEM-Rel-8k-Test.csv")

##### The function `analyse_dataset` provides insights into a dataset by displaying its shape, information, descriptive statistics, and category counts for the `simplified_label` column.

In [None]:
def analyse_dataset(dataset, name):
    """Print a short textual overview of a dataset split.

    Args:
        dataset: Any object accepted by ``pd.DataFrame``; it must provide a
            ``simplified_label`` column.
        name: Display name of the split, used in the heading.

    Prints the shape, ``DataFrame.info()``, descriptive statistics for all
    columns and the value counts of ``simplified_label``, followed by a
    50-character separator line. Returns nothing.
    """
    print(f"{name} Dataset Analysis:\n")

    frame = pd.DataFrame(dataset)
    print(f"Shape: {frame.shape}")
    frame.info()

    print("\nDescriptive Statistics:\n", frame.describe(include='all'))
    print("\n'simplified_label' Category Counts:\n", frame['simplified_label'].value_counts())

    separator = "-" * 50
    print(separator)

In [None]:
# Print the overview for each split, in the order train, test, validation.
for split_name, split_frame in (
    ("Train", train_dataset),
    ("Test", test_dataset),
    ("Validation", val_dataset),
):
    analyse_dataset(split_frame, split_name)

## Define Prompt

##### To ensure consistency and fairness in comparison, the same `prompt_template` is used across all experiments, including fine-tuning and cross-domain evaluation.

In [None]:
# Prompt text with two placeholders, [CONCEPT-A] and [CONCEPT-B], which the
# conversation builders below replace with a row's subject and object.
# The leading and trailing newlines are part of the prompt string.
prompt_template = '''
Classify the relationship between '[CONCEPT-A]' and '[CONCEPT-B]'
'''

In [None]:
# `prompt_in_use` is the prompt actually read by the conversation builders and by
# extract_relationship_label; here it is simply the default template.
prompt_in_use = prompt_template
print(prompt_in_use)

##### The function `create_conversation` generates a structured conversation format for each row in the training and validation set by replacing placeholders (`[CONCEPT-A]` and `[CONCEPT-B]`) in the `prompt_in_use` template with the corresponding subject and object values.

In [None]:
def create_conversation(row, template=None):
    """Build a two-turn (human, gpt) conversation for one labelled row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            ``subject``, ``object`` and ``simplified_label``.
        template: Prompt text containing the ``[CONCEPT-A]`` and ``[CONCEPT-B]``
            placeholders. Defaults to the notebook-level ``prompt_in_use``, so
            existing calls such as ``df.apply(create_conversation, axis=1)``
            behave as before.

    Returns:
        list[dict]: ``[{"from": "human", ...}, {"from": "gpt", ...}]`` where the
        gpt turn is ``"relationship: <label>"``.
    """
    sub = row['subject']
    obj = row['object']
    label = row['simplified_label']

    if template is None:
        # Looked up at call time so that changing `prompt_in_use` in the notebook
        # is picked up without redefining this function.
        template = prompt_in_use

    human = template.replace("[CONCEPT-A]", sub).replace("[CONCEPT-B]", obj)
    gpt = f"relationship: {label}"

    return [
        {"from": "human", "value": human},
        {"from": "gpt", "value": gpt},
    ]

##### The function `test_create_conversation` generates a structured conversation format for each row in the test dataset by replacing placeholders (`[CONCEPT-A]` and `[CONCEPT-B]`) in the `prompt_in_use` template with the corresponding subject and object values.

In [None]:
def test_create_conversation(row):
    sub = row['subject']
    obj = row['object']

    human = prompt_in_use.replace("[CONCEPT-A]", sub).replace("[CONCEPT-B]", obj)

    return [{"from": "human", "value": human}]

##### Generate a 'conversations' column in the datasets by applying the `create_conversation` and `test_create_conversation` functions to each row

In [None]:
# Add a 'conversations' column to each split. Train and validation rows get a
# prompt plus answer; test rows get the prompt only.
for split_frame, build_conversation in (
    (train_dataset, create_conversation),
    (test_dataset, test_create_conversation),
    (val_dataset, create_conversation),
):
    split_frame['conversations'] = split_frame.apply(build_conversation, axis=1)

##### The function `print_first_conversation` displays the first conversation from the `conversations` column of the dataset.

In [None]:
def print_first_conversation():
    """Print the first entry of the 'conversations' column of every split.

    Reads the notebook-level ``train_dataset``, ``test_dataset`` and
    ``val_dataset``. If a split has no 'conversations' column or no entry at
    index 0, an error line is printed for that split and the loop continues.
    """
    splits = (
        ("train", train_dataset),
        ("test", test_dataset),
        ("val", val_dataset),
    )

    for dataset_name, dataset in splits:
        try:
            first_conversation = dataset['conversations'][0]
        except (IndexError, KeyError) as e:
            print(f"Error accessing 'conversations' in {dataset_name} dataset: {e}")
        else:
            print(f"{dataset_name} dataset:")
            print(first_conversation)

In [None]:
print_first_conversation()

##### Configure the tokenizer's chat template, then define `formatting_prompts_func`, which applies that template to format the conversations in the dataset.

In [None]:
# Rebind `tokenizer` to a version that carries the chosen chat template.
# NOTE(review): "phi-4" is hard-coded while `model_name` is a placeholder — confirm
# that the template matches the model family actually loaded.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-4", # Template name; other names accepted by Unsloth include zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"},  # maps the "from"/"value" and "human"/"gpt" keys built by the conversation functions
)

def formatting_prompts_func(examples):
    """Render each conversation of a batch to plain text with the chat template.

    Args:
        examples: Batch mapping with a ``"conversations"`` entry holding one
            conversation (list of turns) per example.

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per conversation.

    Uses the notebook-level ``tokenizer``. The template is applied without
    tokenising and without a generation prompt.
    NOTE(review): the same function is mapped over the test split, so inference
    prompts are rendered without a generation prompt as well — confirm this is
    intended.
    """
    rendered = []
    for convo in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        )
    return {"text": rendered}

## Data Preparation

In [None]:
# Convert each pandas split to a Hugging Face Dataset and add the rendered
# 'text' column in one step. The names are rebound from DataFrame to Dataset,
# so this cell is meant to run once per kernel session.
train_dataset = Dataset.from_pandas(train_dataset).map(formatting_prompts_func, batched=True)
val_dataset = Dataset.from_pandas(val_dataset).map(formatting_prompts_func, batched=True)
test_dataset = Dataset.from_pandas(test_dataset).map(formatting_prompts_func, batched=True)

##### The function `mapped_data_analysis` analyses the mapped dataset by displaying its structure, descriptive statistics, and category counts.

In [None]:
def mapped_data_analysis(dataset, name):
    """Print a short textual overview of a mapped dataset split.

    Args:
        dataset: Object exposing ``to_pandas()`` (e.g. a ``datasets.Dataset``);
            the resulting frame must have a ``simplified_label`` column.
        name: Display name of the split, used in the heading.

    Prints ``DataFrame.info()``, descriptive statistics for all columns and the
    value counts of ``simplified_label``, followed by a 50-character separator.
    Returns nothing.

    NOTE(review): ``describe(include='all')`` may raise on columns holding
    unhashable values such as the nested 'conversations' column — confirm.
    """
    print(f"{name} Dataset Analysis:\n")

    frame = dataset.to_pandas()
    frame.info()

    print("\nDescriptive Statistics:\n", frame.describe(include='all'))
    print("\n'simplified_label' Category Counts:\n", frame['simplified_label'].value_counts())

    separator = "-" * 50
    print(separator)

In [None]:
# Print the overview for each mapped split, in the order train, validation, test.
for split_name, split_data in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    mapped_data_analysis(split_data, split_name)

## Train the Model

##### We use Hugging Face TRL's `SFTTrainer` for fine-tuning. For detailed documentation, visit [TRL SFT Trainer Documentation](https://huggingface.co/docs/trl/sft_trainer). Here, we perform 60 steps for quicker execution. For a complete training run, set `num_train_epochs=1` and remove the `max_steps` argument (or set it to `-1`, its default, which disables the step limit).

In [None]:
# Early-stopping configuration: stop once the monitored metric has failed to
# improve by more than `early_stopping_threshold` for `early_stopping_patience`
# consecutive evaluations. The callback only takes effect when it is handed to a
# trainer through its `callbacks=[...]` argument.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01
)

In [None]:
# Supervised fine-tuning setup. The trainer reads the rendered "text" column of
# the train/validation datasets and evaluates every `eval_steps` optimisation steps.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = 1024,
    dataset_num_proc = 2,
    packing = False,
    # Register the early-stopping callback defined above; previously it was
    # created but never attached, so it had no effect on training.
    callbacks = [early_stopping_callback],
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        per_device_eval_batch_size = 8,
        gradient_accumulation_steps = 4,  # effective batch of 8 * 4 sequences per device per step
        warmup_steps = 5,
        max_steps = 60,
        evaluation_strategy = "steps",
        eval_steps = 10,
        # EarlyStoppingCallback relies on best-model tracking, which requires the
        # save schedule to line up with the evaluation schedule.
        save_strategy = "steps",
        save_steps = 10,
        save_total_limit = 2,  # keep checkpoint disk usage bounded
        load_best_model_at_end = True,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        metric_for_best_model="eval_loss",
        output_dir = "../results/models/",
        report_to = "none",
    ),

)

In [None]:
# Snapshot of GPU 0 before training: total memory and the peak memory reserved
# so far, both expressed in GB (bytes / 1024**3) and rounded to 3 decimals.
bytes_per_gb = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning with the SFTTrainer instance. The returned object is kept in
# `trainer_stats`; its `metrics` dict is read by the memory/time report cell below.
trainer_stats = trainer.train()
print(f"Training for {model_name} is completed.")

# Uncomment the line below to send a notification via Telegram when the training process is completed
# notify(f"Training for {model_name} is completed.")

In [None]:
# Report training time and peak reserved GPU memory, both in absolute GB and as
# a share of the card's total memory. Relies on `start_gpu_memory` and
# `max_memory` from the pre-training snapshot cell.
bytes_per_gb = 1024 ** 3

used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)  # growth since the snapshot
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# Save the fine-tuned model and tokenizer into the same directory that the
# trainer uses as its output_dir.
model.save_pretrained("../results/models/")
tokenizer.save_pretrained("../results/models/")

In [None]:
# Convert the test dataset to a pandas DataFrame so that inference can be applied
# row by row and the predictions stored as new columns; the bare name displays it.
test_dataset_converted = test_dataset.to_pandas()
test_dataset_converted

In [None]:
# Pick the rendered text of the row labelled len(test_dataset) - 1 as a sample
# prompt for the single-example inference cells below.
prompt = test_dataset_converted['text'].loc[len(test_dataset)-1]
prompt

## Model Inference

In [None]:
# Prepare the model for inference by switching it to Unsloth's inference mode.
# The trailing semicolon suppresses the cell's output.
FastLanguageModel.for_inference(model);

In [None]:
# Tokenize the sample prompt into PyTorch tensors and move them to the GPU ("cuda").
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

In [None]:
# Generate a test output from the model for the tokenized prompt, producing at most 100 new tokens.
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)

# Decode the generated tokens into a human-readable string, skipping special tokens;
# only the first (and here only) sequence of the batch is kept.
text_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
text_output

##### The function `extract_relationship_label` identifies and extracts the relationship label from the output returned by the model.

In [None]:
def extract_relationship_label(text):
    """Extract the predicted relationship label from decoded model output.

    Args:
        text: Decoded model output. It may still contain the prompt, and
            therefore the concept names.

    Returns:
        str: The first of ``"broader"``, ``"narrower"``, ``"same-as"``,
        ``"other"`` (checked in that order, case-insensitively, as whole words)
        that occurs in the answer, or ``"Relation not found!"`` when none occurs
        or when ``prompt_in_use`` is not the default ``prompt_template``.
    """
    not_found = "Relation not found!"

    if prompt_in_use != prompt_template:
        # Label parsing is only defined for the default prompt template.
        return not_found

    # The model is trained to answer "relationship: <label>". Searching only the
    # text after the first such marker keeps label words inside the concept names
    # of the prompt from being mistaken for the prediction. Without a marker the
    # whole text is searched, as before.
    answer = re.split(r"relationship:", text, maxsplit=1, flags=re.IGNORECASE)[-1]

    for word in ("broader", "narrower", "same-as", "other"):
        if re.search(r"\b" + re.escape(word) + r"\b", answer, re.IGNORECASE):
            return word

    # Previously this path fell through and returned None.
    return not_found

In [None]:
# Extract and print the label found in the sample output generated above.
relationship_label = extract_relationship_label(text_output)
print(relationship_label)

##### The function `get_relationship` performs inference on a single data point, extracting the complete output and predicted relationship label.

In [None]:
def get_relationship(data_point, model, tokenizer):
    """Run inference for one row and attach the raw and parsed prediction.

    Args:
        data_point: Row-like object with a ``'text'`` entry holding the prompt;
            it is modified in place.
        model: Model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        The same ``data_point`` with two added entries: ``'complete_output'``
        (decoded text of the first generated sequence) and ``'predicted_label'``
        (result of ``extract_relationship_label`` on that text).
    """
    FastLanguageModel.for_inference(model)

    # Encode the prompt on the GPU and generate at most 100 new tokens.
    encoded = tokenizer(data_point['text'], return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, max_new_tokens=100, use_cache=True)

    decoded_first = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    data_point['complete_output'] = decoded_first
    data_point['predicted_label'] = extract_relationship_label(decoded_first)

    return data_point

In [None]:
# Apply the `get_relationship` function to each row of the `test_dataset_converted` DataFrame.
# tqdm.pandas() registers `progress_apply`, which behaves like `apply` but shows a progress bar.
tqdm.pandas()
test_dataset_converted = test_dataset_converted.progress_apply(lambda row: get_relationship(row, model, tokenizer), axis=1)

print(f"Inference for {model_name} is completed.")

# Uncomment the line below to send a notification via Telegram when the inference process is completed.
# notify(f"Inference for {model_name} is completed.") 

In [None]:
# Display the test rows together with the added 'complete_output' and 'predicted_label' columns.
test_dataset_converted

## Save the Output

In [None]:
# Tag used as the prefix of the results file name.
run_type = "PEM-Rel-8K"

In [None]:
# Results file name: run tag plus the current local time at minute resolution
# (YYYY-MM-DD-HH-MM). Two runs within the same minute get the same name.
file_name = f"{run_type}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
print(file_name)

In [None]:
# Write the predictions to ../results/<file_name>.csv without the DataFrame index.
test_dataset_converted.to_csv(f'../results/{file_name}.csv', index=False)