In [1]:
# Step up one directory; presumably so the relative "data/..." paths used in
# later cells resolve from the repository root — confirm.
# NOTE(review): re-running this cell moves up one more level each time.
%cd ..

/data/home/eak/learning/llm_finetuning/specializing-llm-telecom


In [2]:
import os

# Set before the Hugging Face libraries are imported in the following cells.
# NOTE(review): presumably opts in to parallel tokenization in `tokenizers` — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer
from peft import PeftModelForCausalLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [4]:
# Loader settings forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 2048  # Any value may be chosen; RoPE scaling is handled by the loader.
dtype = None  # None = auto detection (Float16 for Tesla T4/V100, Bfloat16 for Ampere+).
load_in_4bit = True  # 4-bit quantization to reduce memory usage; may be set to False.

# Restore the fine-tuned checkpoint from the local models directory.
# For gated models such as meta-llama/Llama-2-7b-hf, additionally pass token="hf_...".
result: tuple[PeftModelForCausalLM, AutoTokenizer] = FastLanguageModel.from_pretrained(
	model_name="data/models/checkpoint-3750",
	max_seq_length=max_seq_length,
	dtype=dtype,
	load_in_4bit=load_in_4bit,
)

# Split the (model, tokenizer) pair into the two names used by the later cells.
model: PeftModelForCausalLM
tokenizer: AutoTokenizer
model, tokenizer = result

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.438 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu118. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1+cu118. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from datasets import Dataset

def create_dataset(data: dict):
	"""Turn the raw question mapping into a `Dataset`.

	Args:
		data: Mapping whose values are per-question record dicts; the keys
			are ignored. Records may omit any of "option 2" .. "option 5".

	Returns:
		The result of `Dataset.from_list` over one row per record, where
		"option 2" .. "option 5" are always present and hold None when the
		record had no such option.

	The records in `data` are copied rather than modified, so the caller's
	mapping is unchanged after the call.
	"""
	def patch_raw(raw: dict) -> dict:
		# Shallow copy: the caller's record must not gain extra keys.
		patched = dict(raw)
		for i in range(2, 6):
			key = f"option {i}"
			# Existing options keep their value and position; missing ones are
			# appended with None.
			patched[key] = patched.get(key)
		return patched

	# Presumably every row must expose the same option columns before being
	# handed to Dataset.from_list — confirm against its schema inference.
	records = [patch_raw(raw) for raw in data.values()]
	return Dataset.from_list(records)

In [6]:
# Read the raw training questions; the context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open("data/zindi_data/TeleQnA_training.json", encoding="utf-8") as training_file:
	training = json.load(training_file)
training_ds = create_dataset(training)

# Number of questions loaded.
len(training)

1461

In [7]:
# Switch the loaded model into Unsloth's inference mode before any generation.
FastLanguageModel.for_inference(model) # Described upstream as "native 2x faster inference"

from transformers import TextStreamer
# Streamer built on the loaded tokenizer; later cells pass it to model.generate
# as `streamer=`.
text_streamer = TextStreamer(tokenizer)

In [8]:
"""
I need you to choose the correct answer from a multiple-choice question. 
The question will have several options labeled with letters. There is always one correct answer among the choices. 
Please provide both the letter and the corresponding answer. 
Only generate the answer without any additional text.
"""
# Prompt template with three positional `{}` slots, filled in order by
# str.format: the instruction, the inputs, and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
{}

### Inputs:
{}

### Response:
{}"""

# Names of the five answer-option fields: "option 1" .. "option 5".
OPTIONS = [f"option {i}" for i in range(1, 6)]
# End-of-sequence token of the loaded tokenizer.
# NOTE(review): nothing in this cell appends it — formatting_prompts_func
# leaves the response slot empty.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples: dict[str, list]):
	"""Build one inference prompt per example of a batched dataset slice.

	Args:
		examples: Column-oriented batch: each key maps to a list with one
			entry per example. Must contain "question", "category" and every
			column named in `OPTIONS`; option entries may be None.

	Returns:
		`{"text": [...]}` with one `alpaca_prompt`-formatted string per
		example. The response slot is left empty so the model can complete it.
	"""
	def apply_one(question, category, *options):
		instructions = f"Domain: {category}:\n{question}"
		# Options that are None are skipped; numbering follows the option's
		# column position, so the labels still match the "option N" fields.
		inputs = "\n".join(
			f"option {i}: {text}"
			for i, text in enumerate(options, start=1)
			if text is not None
		)
		return alpaca_prompt.format(instructions, inputs, "")

	# Take the option columns from OPTIONS instead of repeating their names.
	option_columns = [examples[name] for name in OPTIONS]
	texts = [
		apply_one(question, category, *options)
		for question, category, *options in zip(
			examples["question"], examples["category"], *option_columns
		)
	]
	return {"text": texts}

In [9]:
# Add the "text" column by running the prompt formatter over batches of rows.
training_ds = training_ds.map(
	function=formatting_prompts_func,
	batched=True,
)

Map:   0%|          | 0/1461 [00:00<?, ? examples/s]

In [10]:
# Show the formatted prompts of the first five rows.
training_ds[:5]["text"]

['Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction:\nDomain: Standards specifications:\nWhat is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]\n\n### Inputs:\noption 1: To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints\noption 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints\noption 3: To supply data or analytics from the MFAF to notification endpoints\noption 4: To fetch data or analytics from the MFAF based on fetch instructions\n\n### Response:\n',
 'Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction:\nDomain: Standards specifications:\nHow does a supporting UE attach to the same core network operator from which it detached in a shared network? [3GPP Release 17]\n\n### Inputs:\noption 1

In [11]:
# Tokenize the prompt of the first row as PyTorch tensors and move them to the GPU.
inputs = tokenizer(
	training_ds[:1]["text"],
	return_tensors="pt",
	padding=True,
	truncation=True,
).to("cuda")

In [12]:
# Show every column of the first row, including the reference answer.
training_ds[:1]

{'question': ['What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]'],
 'option 1': ['To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints'],
 'option 2': ['To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints'],
 'option 3': ['To supply data or analytics from the MFAF to notification endpoints'],
 'option 4': ['To fetch data or analytics from the MFAF based on fetch instructions'],
 'answer': ['option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints'],
 'explanation': ['The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.'],
 'category': ['Standards specifications'],
 'option 5': [None],
 'text': ['Below is an instruction that describes a task, pair

In [13]:
# print() is used on purpose here so the newlines in the prompt are rendered.
print(training_ds[0]["text"])

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]

### Inputs:
option 1: To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints
option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints
option 3: To supply data or analytics from the MFAF to notification endpoints
option 4: To fetch data or analytics from the MFAF based on fetch instructions

### Response:



In [14]:
# Stream the continuation of the tokenized prompt, capped at 25 new tokens.
# Binding the return value to `_` keeps it out of the cell output.
_ = model.generate(
	**inputs,
	streamer=text_streamer,
	max_new_tokens=25,
)

<s> Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]

### Inputs:
option 1: To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints
option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints
option 3: To supply data or analytics from the MFAF to notification endpoints
option 4: To fetch data or analytics from the MFAF based on fetch instructions

### Response:
option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out


In [15]:
# Tokenize the prompt of row 1000 as PyTorch tensors and move them to the GPU.
inputs = tokenizer(
	training_ds[[1000]]["text"],
	return_tensors="pt",
	padding=True,
	truncation=True,
).to("cuda")
# Reference answer for the same row, displayed as the cell output.
training_ds[1000]["answer"]

'option 3: If the cell is barred for connectivity to EPC'

In [16]:
# Stream at most 25 new tokens for the row-1000 prompt; the return value is
# bound to `_` so only the streamed text is shown.
_ = model.generate(
	**inputs,
	streamer=text_streamer,
	max_new_tokens=25,
)

<s> Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
In the context of cell status and cell reservations, what does the field 'cellBarred' indicate? [3GPP Release 17]

### Inputs:
option 1: If the cell is barred for connectivity to 5GC
option 2: If the cell is reserved for operator use
option 3: If the cell is barred for connectivity to EPC
option 4: If the cell supports network-based CRS interference mitigation
option 5: If the cell is barred for emergency calls

### Response:
option 1: If the cell is barred for connectivity to 5GC

### Instruction:



In [17]:
# Tokenize the prompt of row 1200 as PyTorch tensors and move them to the GPU.
inputs = tokenizer(
	training_ds[[1200]]["text"],
	return_tensors="pt",
	padding=True,
	truncation=True,
).to("cuda")
# Reference answer for the same row, displayed as the cell output.
training_ds[1200]["answer"]

'option 5: To provide information such as data forwarding addresses and new SN security key.'

In [18]:
# Stream at most 25 new tokens for the row-1200 prompt; the return value is
# bound to `_` so only the streamed text is shown.
_ = model.generate(
	**inputs,
	streamer=text_streamer,
	max_new_tokens=25,
)

<s> Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
What is the purpose of the SGNB MODIFICATION REQUIRED message in the inter-gNB-DU mobility procedure using MCG SRB in EN-DC? [3GPP Release 17]

### Inputs:
option 1: To query the latest SCG configuration from the gNB-CU.
option 2: To migrate the IPsec tunnels to new IP outer addresses.
option 3: To perform RRC Connection Reconfiguration at the MeNB and UE.
option 4: To configure BH RLC channels and BAP-layer route entries on the target path.
option 5: To provide information such as data forwarding addresses and new SN security key.

### Response:
option 5: To provide information such as data forwarding addresses and new SN security key.

### Instruction
