In [1]:
# Move one directory up so the relative "data/..." paths used in later cells resolve.
%cd ..

/data/home/eak/learning/llm_finetuning/specializing-llm-telecom


In [2]:
import os

# Process-wide settings, applied before the later cells import torch / unsloth.
os.environ.update(
	{
		"TOKENIZERS_PARALLELISM": "true",
		"CUDA_DEVICE_ORDER": "PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES": "0",
	}
)
# Optional CUDA debugging switches, left disabled:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# os.environ["TORCH_USE_CUDA_DSA"] = "1"

In [4]:
import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Checkpoint directory passed as `model_name` to FastLanguageModel.from_pretrained.
MODEL_PATH = "data/models/generated/checkpoint-8000"
# Prefix of the submission CSV file name written at the end of the notebook.
TASK_ID = "04"

In [5]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer
from peft import PeftModelForCausalLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [6]:
# Loader options forwarded unchanged to FastLanguageModel.from_pretrained.
max_seq_length = 2048  # maximum sequence length requested from the loader
dtype = None  # None = let the loader auto-detect (float16 or bfloat16 depending on the GPU)
load_in_4bit = True  # load 4-bit quantized weights to reduce memory usage; may be False

# A `token=` argument would additionally be needed for gated models; it is not used here.
result: tuple[PeftModelForCausalLM, AutoTokenizer] = FastLanguageModel.from_pretrained(
	model_name=MODEL_PATH,
	max_seq_length=max_seq_length,
	dtype=dtype,
	load_in_4bit=load_in_4bit,
)

# The loader returns a (model, tokenizer) pair; give each half its own annotated name.
model: PeftModelForCausalLM
tokenizer: AutoTokenizer
model, tokenizer = result[0], result[1]

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.438 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.1+cu118. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.23+cu118. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
from datasets import Dataset

def create_dataset(data: dict):
	"""Build a `Dataset` with one row per question record found in `data`.

	Every record is padded so that it carries all of the keys "option 2" ..
	"option 5", with ``None`` standing in for options the record does not
	define. Padding is applied to a shallow copy of each record, so the
	caller's dictionaries are never modified.

	Args:
		data: Mapping whose values are the per-question records (dicts).
			The keys of `data` are not stored in the resulting dataset.
			"option 1" is not padded — presumably every record has it;
			verify against the input file.

	Returns:
		The `Dataset` produced by `Dataset.from_list` from the padded records.
	"""
	def patch_raw(raw: dict) -> dict:
		# Copy first: padding must not leak back into the caller's record.
		patched = dict(raw)
		for i in range(2, 6):
			# setdefault keeps an existing value and appends None for a missing key.
			patched.setdefault(f"option {i}", None)
		return patched

	return Dataset.from_list([patch_raw(raw) for raw in data.values()])

In [8]:
# Read the question file; the context manager guarantees the handle is closed.
with open("data/zindi_data/TeleQnA_testing1.json", encoding="utf-8") as questions_file:
	training = json.load(questions_file)
# NOTE: despite the variable names, the file loaded here is the *testing* question set.
training_ds = create_dataset(training)

len(training)

366

In [9]:
# Switch the loaded model into Unsloth's inference mode before any generate() call.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Streamer object that a later cell passes as `streamer=` to model.generate().
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

In [10]:
# Prompt template with two positional `{}` slots: the question text, then the
# rendered option list (both filled in by formatting_prompts_func).
# NOTE: the final tag is spelled "<reponse>" (sic). The answer-extraction regex
# later in this notebook matches that exact spelling, and presumably the
# fine-tuning template used it too — verify both before "fixing" the spelling.
alpaca_prompt = """You are an AI assistant specialized in the 3rd Generation Partnership Project (3GPP) Release specifications. The Standards Development: 3GPP is known for creating standards for 3G (third generation), 4G (fourth generation), and 5G (fifth generation) mobile networks. 
Your task is to analyze questions and select the most appropriate answer from given options based on your knowledge of 3GPP standards.

<instruction>
{}
</instruction>

<options>
{}
</options>

<reponse>"""

# Names of the five dataset columns that hold the answer options ("option 1" .. "option 5").
OPTIONS = [f"option {i}" for i in range(1, 6)]
# NOTE(review): EOS_TOKEN is not referenced by any cell visible in this notebook.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples: dict[str, list]):
	"""Render one prompt per question for a batched `Dataset.map` call.

	Args:
		examples: Column-oriented batch. It must contain a "question" list
			and one list for every column named in `OPTIONS`; a ``None``
			entry marks an option that the question does not have.

	Returns:
		``{"text": [...]}`` holding one filled-in `alpaca_prompt` per question.

	Raises:
		KeyError: If "question" or one of the `OPTIONS` columns is missing.
	"""
	def apply_one(question, *options):
		# Pair every value with its own column name, so dropping a None option
		# never renumbers the options that follow it.
		option_lines = [
			f"{label}: {text}"  # f-string formatting also accepts non-str values
			for label, text in zip(OPTIONS, options)
			if text is not None
		]
		return alpaca_prompt.format(f"{question}", "\n".join(option_lines))

	# Columns are taken from OPTIONS instead of repeating the names by hand.
	option_columns = [examples[name] for name in OPTIONS]
	texts = [
		apply_one(question, *options)
		for question, *options in zip(examples["question"], *option_columns)
	]
	return {"text": texts}

In [11]:
# Add a "text" column holding the rendered prompt for every question (processed in batches).
training_ds = training_ds.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

In [12]:
# Preview the first five rendered prompts.
training_ds[:5]["text"]

['You are an AI assistant specialized in the 3rd Generation Partnership Project (3GPP) Release specifications. The Standards Development: 3GPP is known for creating standards for 3G (third generation), 4G (fourth generation), and 5G (fifth generation) mobile networks. \nYour task is to analyze questions and select the most appropriate answer from given options based on your knowledge of 3GPP standards.\n\n<instruction>\nWhen can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]\n</instruction>\n\n<options>\noption 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.\noption 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.\noption 3: Both option 1 and option 2\noption 4: None of the above\n</options>\n\n<repo

In [13]:
# Tokenize only the first prompt (as a batch of one) and move the tensors to the GPU.
inputs = tokenizer(
training_ds[:1]["text"], return_tensors = "pt", padding=True, truncation=True).to("cuda")

In [14]:
# Inspect the first record, including the generated "text" column.
training_ds[:1]

{'question': ['When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]'],
 'option 1': ['Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.'],
 'option 2': ['If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.'],
 'option 3': ['Both option 1 and option 2'],
 'option 4': ['None of the above'],
 'category': ['Standards specifications'],
 'option 5': [None],
 'text': ['You are an AI assistant specialized in the 3rd Generation Partnership Project (3GPP) Release specifications. The Standards Development: 3GPP is known for creating standards for 3G (third generation), 4G (fourth generation), and 5G (fifth generation) mobile networks. \nYour task is to analyze questions and select the most appropriate answer from given

In [15]:
# Print the first prompt so its newlines are rendered instead of escaped.
print(training_ds[:1]["text"][0])

You are an AI assistant specialized in the 3rd Generation Partnership Project (3GPP) Release specifications. The Standards Development: 3GPP is known for creating standards for 3G (third generation), 4G (fourth generation), and 5G (fifth generation) mobile networks. 
Your task is to analyze questions and select the most appropriate answer from given options based on your knowledge of 3GPP standards.

<instruction>
When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]
</instruction>

<options>
option 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.
option 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.
option 3: Both option 1 and option 2
option 4: None of the above
</options>

<reponse>


In [16]:
# Smoke test: generate for the single tokenized prompt, streaming the text to the cell output.
# NOTE(review): the result is bound to `_`, the name IPython also uses for the
# last cell output, and a later cell reads it back; a descriptive name would be
# safer on re-runs.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100,)

You are an AI assistant specialized in the 3rd Generation Partnership Project (3GPP) Release specifications. The Standards Development: 3GPP is known for creating standards for 3G (third generation), 4G (fourth generation), and 5G (fifth generation) mobile networks. 
Your task is to analyze questions and select the most appropriate answer from given options based on your knowledge of 3GPP standards.

<instruction>
When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]
</instruction>

<options>
option 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.
option 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.
option 3: Both option 1 and option 2
option 4: None of the above
</options>

<reponse>
option 3: B

In [17]:
# Decode the ids currently held in `_` and print the first sequence.
# NOTE(review): this only works if `_` still holds the generate() result — it
# depends on cell execution order.
print(tokenizer.batch_decode(_)[0])

You are an AI assistant specialized in the 3rd Generation Partnership Project (3GPP) Release specifications. The Standards Development: 3GPP is known for creating standards for 3G (third generation), 4G (fourth generation), and 5G (fifth generation) mobile networks. 
Your task is to analyze questions and select the most appropriate answer from given options based on your knowledge of 3GPP standards.

<instruction>
When can a gNB transmit a DL transmission(s) on a channel after initiating a channel occupancy? [3GPP Release 17]
</instruction>

<options>
option 1: Regardless of the duration of the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB.
option 2: If the gap between the DL transmission(s) and any previous transmission(s) corresponding to the channel occupancy initiated by the gNB is more than a threshold.
option 3: Both option 1 and option 2
option 4: None of the above
</options>

<reponse>
option 3: B

In [18]:
from tqdm import tqdm

In [19]:
# Generate a completion for every prompt, `batch_size` prompts at a time, and
# collect the decoded texts (prompt + completion) in `results`.
# NOTE(review): batched generation with padding=True depends on the tokenizer's
# padding side — confirm it is left-padding for this model.
batch_size = 8
results = []
batch_starts = range(0, len(training_ds), batch_size)
for start in tqdm(batch_starts):
	raws = training_ds[start:start + batch_size]
	inputs = tokenizer(
		raws["text"],
		return_tensors="pt",
		padding=True,
		truncation=True,
	).to("cuda")
	texts_ids = model.generate(**inputs, max_new_tokens=100)
	# batch_decode returns one string per sequence in the batch.
	results.extend(tokenizer.batch_decode(texts_ids))

 15%|█▌        | 7/46 [00:40<03:40,  5.66s/it]

In [None]:
# Spot-check one decoded generation.
print(results[10])

In [None]:
import re

def extract_selected_response(text):
    """Return the "option N: ..." answer that follows the ``<reponse>`` tag.

    The answer ends at the first newline, at the end of `text`, or right
    before a closing tag / special token (a ``</`` or ``<|`` sequence). A
    generation that stops without a trailing newline (for example directly
    at an end-of-sequence token) is therefore still recognised.

    Args:
        text: Decoded model output containing the prompt and the completion.

    Returns:
        The matched answer with surrounding whitespace removed, or ``None``
        when no ``option <digits>:`` follows the tag.
    """
    # "<reponse>" (sic) is the spelling used by the prompt template.
    # The lookahead lists the accepted terminators without consuming them.
    pattern = r"<reponse>\s*(option \d+:.*?)\s*(?=\n|</|<\||$)"

    match = re.search(pattern, text)
    return match.group(1) if match else None

In [None]:
# Spot-check the extraction on the same generation that was printed above.
print(extract_selected_response(results[10]))

In [None]:
# Parse every decoded generation; an entry is None where no answer line was found.
responses = [extract_selected_response(generated) for generated in results]
responses[10:16]

In [None]:
# Wrap the parsed answers in a pandas Series (a Series, despite the `_df` suffix).
responses_df = pd.Series(list(responses))

# Random sample of ten entries for a quick visual check.
responses_df.sample(10)

In [None]:
# Entries for which no answer could be extracted.
responses_df[responses_df.isna()]

In [None]:
# Total number of parsed responses.
len(responses_df)

In [None]:
# Replace unparsed entries with the sentinel "option 0:" so the digit parsing
# in the following cells has a value to read for every row.
responses_df = responses_df.fillna("option 0:")

In [None]:
# Sanity check: number of entries whose text starts with "option ".
responses_df.apply(lambda x: str(x).startswith("option ")).sum()

In [None]:
# Option number per row: first character of the second whitespace-separated token, as int.
responses_df.str.split().apply(lambda x: int((x[1])[0]))

In [None]:
# Distribution of the extracted option numbers.
responses_df.str.split().apply(lambda x: int((x[1])[0])).value_counts()

In [None]:
def _zero_based_choice(tokens):
    # tokens looks like ["option", "3:", ...]; the first character of the second
    # token is the option digit, which is shifted down by one here.
    return int(tokens[1][0]) - 1

# One integer per response; a row holding the "option 0:" text maps to -1.
selected_options = responses_df.str.split().apply(_zero_based_choice)

selected_options

In [None]:
# Assemble the submission table: one row per response.
# NOTE(review): Question_ID is the positional index of `selected_options`, not
# an identifier taken from the question file — confirm this matches the IDs the
# submission format expects.
df = pd.DataFrame(
    {
        "Question_ID": selected_options.index,
        "Answer_ID": selected_options.values,
    }
)

# Constant label written on every row.
df["Task"] = "Phi-3"

In [None]:
# Display the assembled submission table.
df

In [None]:
# Write the submission table to CSV, omitting the DataFrame index column.
df.to_csv(f"data/submission/{TASK_ID}-Phi_3_Generated.csv", index=False)

In [None]:
# Echo the path of the file written by the previous cell.
f"data/submission/{TASK_ID}-Phi_3_Generated.csv"