In [1]:
# Step up one directory so that the relative "data/..." paths used by later cells resolve.
# NOTE(review): the move is relative to the kernel's current directory, so running this
# cell a second time climbs one level further.
%cd ..

/data/home/eak/learning/llm_finetuning/specializing-llm-telecom


In [2]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer
from peft import PeftModelForCausalLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [4]:
# Settings forwarded to the checkpoint loader below.
max_seq_length = 2048  # sequence-length limit passed to from_pretrained
dtype = None  # None leaves the choice to the loader (float16 for T4/V100, bfloat16 for Ampere+)
load_in_4bit = True  # load 4-bit quantised weights to cut memory use; may be False

# The loader hands back a pair; keep the pair and bind its two parts to named variables.
result: tuple[PeftModelForCausalLM, AutoTokenizer] = FastLanguageModel.from_pretrained(
	model_name="data/models/checkpoint-3750",
	max_seq_length=max_seq_length,
	dtype=dtype,
	load_in_4bit=load_in_4bit,
)
model: PeftModelForCausalLM = result[0]
tokenizer: AutoTokenizer = result[1]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.438 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu118. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1+cu118. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from datasets import Dataset

def create_dataset(data: dict) -> "Dataset":
	"""Build a `Dataset` with one row per question record found in `data`.

	Every row is given the keys "option 2" through "option 5"; a key that a
	record lacks is filled with None so that all rows expose the same columns.
	Records are copied before being padded, so the dicts inside `data` are
	left untouched (the earlier version added the missing keys to the
	caller's own dicts as a side effect).

	Args:
		data: mapping whose values are the per-question dicts. Only the
			values are used; the mapping's keys are not carried into the rows.

	Returns:
		Whatever `Dataset.from_list` produces for the padded rows.
	"""
	optional_keys = [f"option {i}" for i in range(2, 6)]

	def patch_raw(raw: dict) -> dict:
		# setdefault leaves existing keys in place and appends missing ones at
		# the end - the same key order the in-place assignment used to produce.
		patched = dict(raw)
		for key in optional_keys:
			patched.setdefault(key, None)
		return patched

	rows = [patch_raw(raw) for raw in data.values()]
	return Dataset.from_list(rows)

In [6]:
# Read the question file. The context manager closes the handle, which the bare
# open() call never did, and the encoding is pinned instead of following the locale.
# NOTE(review): the variables are called "training" although the path points at a
# testing file; the names are kept because later cells refer to them.
with open("data/zindi_data/TeleQnA_testing1.json", encoding="utf-8") as questions_file:
	training = json.load(questions_file)
training_ds = create_dataset(training)

len(training)

366

In [7]:
# Switch the loaded model to Unsloth's inference mode before any generate() call.
# (The prompt template itself, `alpaca_prompt`, is defined in a later cell.)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Streamer built on the tokenizer; it is passed as `streamer=` to model.generate()
# in the single-prompt smoke test further down.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

In [8]:
"""
I need you to choose the correct answer from a multiple-choice question. 
The question will have several options labeled with letters. There is always one correct answer among the choices. 
Please provide both the letter and the corresponding answer. 
Only generate the answer without any additional text.
"""
# Prompt template with three positional `{}` slots, filled in order by str.format in
# formatting_prompts_func: instruction text, the option lines, and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
{}

### Inputs:
{}

### Response:
{}"""

# Names "option 1" ... "option 5". Not referenced by the visible cells
# (formatting_prompts_func spells the column names out itself).
OPTIONS = [f"option {i}" for i in range(1, 6)]
# End-of-sequence token of the loaded tokenizer. Not referenced by the visible cells:
# the response slot is filled with an empty string, so nothing is appended to the prompts.
# NOTE(review): presumably left over from the training-time template - confirm before removing.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples: dict[str, str]):
	"""Render one inference prompt per row of a batch.

	`examples` is read column-wise: "question", "category" and "option 1" to
	"option 5" are iterated in parallel, one element per row (so, despite the
	annotation, each value is consumed as a sequence of row values).

	For each row the instruction slot is "Domain: <category>:" followed by the
	question on the next line, the inputs slot lists every option that is not
	None as "option <n>: <text>" (n is the option's own column number, so
	skipped options leave gaps), and the response slot is left empty.

	Returns:
		{"text": [prompt, ...]} with one prompt string per row.
	"""
	column_names = ("question", "category", "option 1", "option 2", "option 3", "option 4", "option 5")
	columns = [examples[name] for name in column_names]

	prompts = []
	for question, category, *choices in zip(*columns):
		instruction = f"Domain: {category}:\n{question}"
		option_lines = []
		for number, choice in enumerate(choices, start=1):
			if choice is None:
				continue
			option_lines.append(f"option {number}: " + choice)
		prompts.append(alpaca_prompt.format(instruction, "\n".join(option_lines), ""))
	return {"text": prompts}

In [9]:
# Add a "text" column holding the rendered prompt of every row. With batched=True the
# formatting function is called on batches of rows rather than on one row at a time.
training_ds = training_ds.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

In [10]:
training_ds[:5]["text"]

['Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction:\nDomain: Standards specifications:\nWhen can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]\n\n### Inputs:\noption 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.\noption 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.\noption 3: Both option 1 and option 2\noption 4: None of the above\n\n### Response:\n',
 "Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction:\nDomain: Standards specifications:\nWhat does OTA REFSENS requirement ensure? [3GPP Release 18]\n\n### Inputs:\noption 1: The accuracy of the receiver in filte

In [11]:
# Smoke test: tokenise only the first prompt, as PyTorch tensors, and move them to the GPU.
inputs = tokenizer(
training_ds[:1]["text"], return_tensors = "pt", padding=True, truncation=True).to("cuda")

In [12]:
training_ds[:1]

{'question': ['When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]'],
 'option 1': ['Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.'],
 'option 2': ['If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.'],
 'option 3': ['Both option 1 and option 2'],
 'option 4': ['None of the above'],
 'category': ['Standards specifications'],
 'option 5': [None],
 'text': ['Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction:\nDomain: Standards specifications:\nWhen can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]\n\n### Inputs:\noption 1: Regardless of the duration of the gap between the DL tran

In [13]:
print(training_ds[:1]["text"][0])

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]

### Inputs:
option 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.
option 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.
option 3: Both option 1 and option 2
option 4: None of the above

### Response:



In [14]:
# Generate at most 100 new tokens for the single tokenised prompt, streaming them
# through text_streamer while they are produced.
# NOTE(review): `_` is also IPython's "last output" variable, and a later cell decodes
# `_`; any cell output displayed in between can rebind it. A named variable would be safer.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100,)

<s> Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]

### Inputs:
option 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.
option 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.
option 3: Both option 1 and option 2
option 4: None of the above

### Response:


option 3: Both option 1 and option 2

### Instruction:
What is the purpose of the UE-id field in the DMG protected PPDU?

### Inputs:
option 1: To identify the DL RS for which the DMG protected PPDU applies.
option 2: To identify the UE to which the DMG protected PPDU applies.
option 3: To indicate the number of DM


In [17]:
# Decode the token ids held in `_` (assigned by the generate() smoke test) and print
# the first sequence.
print(tokenizer.batch_decode(_)[0])

<s> Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]

### Inputs:
option 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.
option 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.
option 3: Both option 1 and option 2
option 4: None of the above

### Response:
option 3: Both option 1 and option 2

### Instruction:
What is the purpose of the UE-id field in the DMG protected PPDU?

### Inputs:
option 1: To identify the DL RS for which the DMG protected PPDU applies.
option 2: To identify the UE to which the DMG protected PPDU applies.
option 3: To ind

In [18]:
from tqdm import tqdm

In [19]:
batch_size = 8
results = []
# Walk the dataset in consecutive slices of `batch_size` rows. Each decoded string
# still contains the prompt and any special tokens, since decoding does not skip them.
for start in tqdm(range(0, len(training_ds), batch_size)):
	raws = training_ds[start:start + batch_size]
	inputs = tokenizer(
		raws["text"],
		return_tensors="pt",
		padding=True,
		truncation=True,
	).to("cuda")
	texts_ids = model.generate(**inputs, max_new_tokens=100)
	results += tokenizer.batch_decode(texts_ids)

100%|██████████| 46/46 [04:38<00:00,  6.06s/it]


In [20]:
print(results[10])

<|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><|placeholder6|><s> Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
Domain: Standards specifications:
What is the cardinality rule 

In [31]:
import re

def extract_selected_response(text):
    """Return the "option N: ..." answer line that follows a "### Response:" header.

    The first header that is followed (after optional whitespace or blank
    lines) by text of the form "option <digits>: <something>" is used. The
    answer runs to the end of that line; it does not need a trailing newline,
    so an answer that is the very last thing in `text` is still found.
    The earlier pattern demanded exactly one newline before and one after the
    answer and returned None in those cases.

    Args:
        text: decoded generation (prompt plus model output).

    Returns:
        The matched answer line without its line break, or None when no
        header is followed by an "option N: ..." line.
    """
    # [^\n]+ keeps the capture on a single line; no DOTALL, so nothing can spill
    # past the line break into the text the model generated afterwards.
    pattern = r"### Response:\s*(option \d+: [^\n]+)"
    match = re.search(pattern, text)
    return match.group(1) if match else None

In [32]:
print(extract_selected_response(results[10]))

option 3: One or more EEC(s) may be deployed to support one EDN


In [33]:
# Answer line of every decoded generation, in dataset order (None where nothing matched).
responses = [extract_selected_response(generated) for generated in results]
responses[10:16]

['option 3: One or more EEC(s) may be deployed to support one EDN',
 'option 4: Random access related procedures',
 'option 1: 95% of the maximum throughput of the reference measurement channel.',
 'option 3: The NR RRC information used by the target gNB during handover preparation or UE context retrieval',
 'option 3: 15',
 'option 2: To reduce the number of PC1 TRP grid points']

In [34]:
# Despite the `_df` suffix this is a Series (one answer line per question, default 0..n-1 index).
responses_df = pd.Series(list(responses))

responses_df.sample(10)

241                    option 5: Charging Function (CHF)
348                  option 4: Composite rotation matrix
126    option 5: To allow UEs to use direct device co...
32                                   option 1: TS 36.331
289    option 2: PLMN selection, cell selection and r...
123                         option 1: Gi reference point
310    option 1: DNN, S-NSSAI, and Maximum Group Data...
264         option 3: Signaling of its LHN ID to the MME
166    option 3: Semi-static CFI configuration, PDSCH...
328    option 2: Same slot and symbol for PRS resourc...
dtype: object

In [35]:
responses_df[responses_df.isna()]

Series([], dtype: object)

In [38]:
len(responses_df)

366

In [41]:
responses_df.str.split().apply(lambda x: int((x[1])[0]))

0      3
1      5
2      3
3      3
4      4
      ..
361    3
362    1
363    1
364    1
365    2
Length: 366, dtype: int64

In [42]:
responses_df.str.split().apply(lambda x: int((x[1])[0])).value_counts()

1    85
3    84
2    69
4    68
5    60
Name: count, dtype: int64

In [44]:
# Zero-based index of the chosen option, parsed from the leading "option N:" of each answer.
# The capture group reads the whole number; the earlier x[1][0] lookup kept only the first
# character of the second token and would have turned "option 10:" into 1.
# A response that does not start with "option N:" makes astype(int) raise instead of
# silently producing a wrong answer.
selected_options = responses_df.str.extract(r"^option (\d+):", expand=False).astype(int) - 1

selected_options

0      2
1      4
2      2
3      2
4      3
      ..
361    2
362    0
363    0
364    0
365    1
Length: 366, dtype: int64

In [45]:
# Submission table: one row per answer, built column by column.
# NOTE(review): Question_ID is the row position taken from the Series index, not the
# question keys of the JSON file (create_dataset drops those) - confirm that this is
# the ID the submission format expects.
df = pd.DataFrame({
    "Question_ID": selected_options.index,
    "Answer_ID": selected_options.values,
})

df["Task"] = "Phi-3"

In [46]:
df

Unnamed: 0,Question_ID,Answer_ID,Task
0,0,2,Phi-3
1,1,4,Phi-3
2,2,2,Phi-3
3,3,2,Phi-3
4,4,3,Phi-3
...,...,...,...
361,361,2,Phi-3
362,362,0,Phi-3
363,363,0,Phi-3
364,364,0,Phi-3


In [47]:
# Write the submission table as CSV; index=False leaves the DataFrame index out of the file.
df.to_csv("data/submission/01-Phi_3_Model.csv", index=False)