In [1]:
%cd ..

/data/home/eak/learning/llm_finetuning/specializing-llm-telecom


In [2]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [4]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
	model_name = "unsloth/Phi-3-mini-4k-instruct",
	max_seq_length = max_seq_length,
	dtype = dtype,
	load_in_4bit = load_in_4bit,
	# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.438 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu118. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1+cu118. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
model = FastLanguageModel.get_peft_model(
	model,
	r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
	target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
					  "gate_proj", "up_proj", "down_proj",],
	lora_alpha = 16,
	lora_dropout = 0, # Supports any, but = 0 is optimized
	bias = "none",    # Supports any, but = "none" is optimized
	# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
	use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
	random_state = 3407,
	use_rslora = False,  # We support rank stabilized LoRA
	loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32064, 3072)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_featu

In [7]:
from datasets import Dataset

def create_dataset(data: dict):
	def patch_raw(raw: dict):
		for i in range(2, 6):
			raw[f"option {i}"] = raw.get(f"option {i}")
		return raw
	data_pashed = [
		patch_raw(raw) for raw in data.values()
	]
	data_pashed = Dataset.from_list(data_pashed)
	return data_pashed

In [8]:
training = json.load(open("data/zindi_data/TeleQnA_training.json"))
testing = json.load(open("data/zindi_data/TeleQnA_testing1.json"))
extra_dataset = json.load(open("data/TeleQnA.json"))


len(training), len(testing), len(extra_dataset)

(1461, 366, 10000)

In [9]:
training_ds = create_dataset(training)
testing_ds = create_dataset(testing)
extra_ds = create_dataset(extra_dataset)

len(training_ds), len(testing_ds), len(extra_ds)

(1461, 366, 10000)

In [10]:
training_ds

Dataset({
    features: ['question', 'option 1', 'option 2', 'option 3', 'option 4', 'answer', 'explanation', 'category', 'option 5'],
    num_rows: 1461
})

In [11]:
training_ds[:5]

{'question': ['What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]',
  'How does a supporting UE attach to the same core network operator from which it detached in a shared network? [3GPP Release 17]',
  'When can the setting of the Privacy exception list be changed? [3GPP Release 17]',
  'What does the NEF notify to the AF after determining the suitable DNAI(s)? [3GPP Release 18]',
  'In online charging, how are chargeable events transformed into charging events? [3GPP Release 18]'],
 'option 1': ['To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints',
  'It requests the core network node to remember its previous selection.',
  'Never',
  'AF Identifier',
  'By the CTF'],
 'option 2': ['To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints',
  'It uses information stored in the UE when it was detached.',
  'Only during emergency 

In [12]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

OPTIONS = [f"option {i}" for i in range(1, 6)]
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples: dict[str, str]):
	def apply_one(question, answer, category, *options):
		instructions = f"Domain: {category}: {question}"
		inputs       = "\n".join([i for i in options if i is not None])
		outputs      = answer
		return alpaca_prompt.format(instructions, inputs, outputs)
	texts = [apply_one(question, answer, category, *options) for question, answer, category, *options in zip(
		examples["question"], examples["answer"],  examples['option 1'], examples['option 2'], examples['option 3'], examples['option 4'], examples['option 5']
	)]
	return { "text" : texts, }

In [13]:
training_ds = training_ds.map(formatting_prompts_func, batched=True)
extra_ds = extra_ds.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/1461 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [14]:
training_ds

Dataset({
    features: ['question', 'option 1', 'option 2', 'option 3', 'option 4', 'answer', 'explanation', 'category', 'option 5', 'text'],
    num_rows: 1461
})

In [15]:
extra_ds

Dataset({
    features: ['question', 'option 1', 'option 2', 'option 3', 'option 4', 'answer', 'explanation', 'category', 'option 5', 'text'],
    num_rows: 10000
})

In [16]:
extra_ds[:5]["text"]

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nDomain: To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints: What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]\n\n### Input:\nTo configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints\nTo supply data or analytics from the MFAF to notification endpoints\nTo fetch data or analytics from the MFAF based on fetch instructions\n\n### Response:\noption 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints',
 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nDomain: NOMA: W

In [17]:
trainer = SFTTrainer(
	model = model,
	tokenizer = tokenizer,
	train_dataset = extra_ds,
	eval_dataset = training_ds,
	dataset_text_field = "text",
	max_seq_length = max_seq_length,
	dataset_num_proc = 2,
	packing = False, # Can make training 5x faster for short sequences.
	args = TrainingArguments(
        output_dir="data/models",
        run_name="qa_telcom",
		per_device_train_batch_size = 2,
		gradient_accumulation_steps = 4,
		warmup_ratio = 0.05,
		learning_rate = 2e-4,
		fp16 = not torch.cuda.is_bf16_supported(),
		bf16 = torch.cuda.is_bf16_supported(),
		logging_steps = 1,
		optim = "adamw_8bit",
		weight_decay = 0.01,
		lr_scheduler_type = "linear",
		seed = 3407,
		output_dir = "outputs",
		push_to_hub=False,
		auto_find_batch_size=True,
		num_train_epochs=4,
		save_total_limit=1,
		eval_strategy="epoch",
		save_strategy="epoch",
		report_to="wandb",
		data_seed=41,
		load_best_model_at_end=True,
		metric_for_best_model="loss"
	),
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1461 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [18]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA RTX A6000. Max memory = 47.438 GB.
2.283 GB of memory reserved.


In [19]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,2.4663
2,2.6217
3,2.4965
4,2.4537
5,2.1375
6,1.7752
7,1.8568
8,1.9344
9,1.7058
10,1.4991


In [20]:
trainer_stats.metrics

{'train_runtime': 120.849,
 'train_samples_per_second': 3.972,
 'train_steps_per_second': 0.496,
 'total_flos': 1651143917813760.0,
 'train_loss': 1.2928175886472066,
 'epoch': 0.048}

In [21]:
model.save_pretrained("data/models/model1")
tokenizer.save_pretrained("data/models/model1")

('data/models/model1/tokenizer_config.json',
 'data/models/model1/special_tokens_map.json',
 'data/models/model1/tokenizer.model',
 'data/models/model1/added_tokens.json',
 'data/models/model1/tokenizer.json')

In [22]:
trainer.evaluate()

{'eval_loss': 1.2255452871322632,
 'eval_runtime': 60.6954,
 'eval_samples_per_second': 24.071,
 'eval_steps_per_second': 3.015,
 'epoch': 0.048}