In [None]:
# Move the working directory up one level; the relative "data/..." paths used
# later are resolved from there (presumably the project root - confirm).
# Not idempotent: re-running this cell moves up one more level each time.
%cd ..

In [None]:
import os

# Environment variables are set before torch/unsloth are imported in a later cell.
# Ask the Hugging Face tokenizers library to keep its parallelism enabled.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Order GPUs by PCI bus ID so the device index below is stable across runs.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only the GPU with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Checkpoint directory passed as model_name to FastLanguageModel.from_pretrained.
MODEL_PATH = "data/models/All/checkpoint-2167"
# Prefix used in the file name of the submission CSV written by the last cell.
TASK_ID = "03"

In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer
from peft import PeftModelForCausalLM

In [None]:
# Loader options, forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Maximum sequence length requested from the loader.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# The loader's return value is kept in `result` and read by index, so the model and
# the tokenizer each get their own annotated name.
result: tuple[PeftModelForCausalLM, AutoTokenizer] = FastLanguageModel.from_pretrained(
	model_name = MODEL_PATH,
	max_seq_length = max_seq_length,
	dtype = dtype,
	load_in_4bit = load_in_4bit,
	# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model: PeftModelForCausalLM = result[0]
tokenizer: AutoTokenizer = result[1]

In [None]:
from datasets import Dataset

def create_dataset(data: dict):
	"""Build a ``Dataset`` with one row per record found in ``data``.

	Args:
		data: Mapping whose values are per-question dicts. Only the values are
			used; the keys are not carried into the dataset.

	Returns:
		The result of ``Dataset.from_list`` over the patched records. Every row
		has the keys "option 2" .. "option 5"; an option missing from a record
		is stored as ``None`` so that all rows share the same columns.

	The records inside ``data`` are copied rather than modified, so the caller's
	mapping stays exactly as it was loaded.
	"""
	optional_keys = [f"option {i}" for i in range(2, 6)]

	def patch_raw(raw: dict) -> dict:
		# Shallow copy: existing keys keep their position and value, missing
		# option keys are appended with None.
		patched = dict(raw)
		for key in optional_keys:
			patched.setdefault(key, None)
		return patched

	return Dataset.from_list([patch_raw(raw) for raw in data.values()])

In [None]:
# Read the question file inside a context manager so the handle is closed right
# away, and decode it as UTF-8 instead of the platform's default encoding.
# NOTE(review): the variables are named "training" although the file read here is
# TeleQnA_testing1.json; the names are kept because later cells use them.
with open("data/zindi_data/TeleQnA_testing1.json", encoding="utf-8") as questions_file:
	training = json.load(questions_file)
training_ds = create_dataset(training)

len(training)

In [None]:
# Switch the loaded model to Unsloth's inference mode before generate() is called.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Streamer passed to model.generate in the single-example cell so the generated text is printed while it is produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

In [None]:
"""
I need you to choose the correct answer from a multiple-choice question. 
The question will have several options labeled with letters. There is always one correct answer among the choices. 
Please provide both the letter and the corresponding answer. 
Only generate the answer without any additional text.
"""
# Prompt template with three positional "{}" slots, filled through str.format
# in this order: instruction, inputs, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
{}

### Inputs:
{}

### Response:
{}"""

# Column names "option 1" .. "option 5".
# NOTE(review): not referenced anywhere else in the visible notebook.
OPTIONS = [f"option {i}" for i in range(1, 6)]
# End-of-sequence token string of the loaded tokenizer.
# NOTE(review): not referenced anywhere else in the visible notebook.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples: dict[str, str]):
	"""Render one prompt per example of a batch using ``alpaca_prompt``.

	``examples`` is indexed by column name ("question", "category" and
	"option 1" .. "option 5") and the columns are zipped together, so each value
	is expected to hold one entry per example. Options that are None are left out
	of the prompt; the remaining ones keep their original number. The response
	slot of the template is filled with an empty string.

	Returns:
		A dict with the single key "text" mapping to the list of prompts.
	"""
	option_columns = ("option 1", "option 2", "option 3", "option 4", "option 5")

	def render(question, category, option_texts):
		# Number first, filter second: a missing option must not shift the labels.
		listed = []
		for number, text in enumerate(option_texts, start=1):
			if text is None:
				continue
			listed.append(f"option {number}: " + text)
		header = f"Domain: {category}:\n{question}"
		return alpaca_prompt.format(header, "\n".join(listed), "")

	columns = [examples["question"], examples["category"]]
	columns.extend(examples[name] for name in option_columns)
	prompts = [
		render(question, category, option_texts)
		for question, category, *option_texts in zip(*columns)
	]
	return {"text": prompts}

In [None]:
# Run formatting_prompts_func over the dataset in batches; its "text" output is
# stored as a column holding the rendered prompt of every row.
training_ds = training_ds.map(formatting_prompts_func, batched=True)

In [None]:
# Preview the rendered prompts of the first five rows.
training_ds[:5]["text"]

In [None]:
# Tokenize the first prompt as PyTorch tensors and move them to the GPU for a
# single-example generation check.
inputs = tokenizer(
training_ds[:1]["text"], return_tensors = "pt", padding=True, truncation=True).to("cuda")

In [None]:
# Show every column of the first row.
training_ds[:1]

In [None]:
# Print the first prompt so its line breaks are rendered instead of escaped.
print(training_ds[:1]["text"][0])

In [None]:
# Generate up to 100 new tokens for the tokenized example, streaming the text as it is produced.
# NOTE(review): `_` is also the name IPython uses for the last cell output; a
# descriptive variable name would avoid relying on that name.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100,)

In [None]:
# Decode the token ids stored in `_` and print the first sequence.
print(tokenizer.batch_decode(_)[0])

In [None]:
from tqdm import tqdm

In [None]:
batch_size = 8
results = []
# A decoder-only model continues each row from its last position, so prompts of
# different lengths in one batch must be padded on the left. Set it explicitly
# instead of relying on the padding side the loaded tokenizer happens to carry.
tokenizer.padding_side = "left"
for i in tqdm(range(0, len(training_ds), batch_size)):
	# Take the next batch_size rows; "text" holds their rendered prompts.
	raws = training_ds[i:i+batch_size]
	inputs = tokenizer(
		raws["text"], return_tensors = "pt", padding=True, truncation=True
	).to("cuda")
	texts_ids = model.generate(**inputs, max_new_tokens = 100,)
	# Decode the returned token ids without stripping special tokens.
	texts = tokenizer.batch_decode(texts_ids)
	results.extend(texts)

In [None]:
# Inspect one decoded output as plain text.
print(results[10])

In [None]:
import re

def extract_selected_response(text):
    """Return the answer line that directly follows the "### Response:" marker.

    Args:
        text: Decoded model output containing the prompt template.

    Returns:
        The substring that starts at "option <number>:" on the line after the
        marker and runs to the end of that line, or None when ``text`` is not a
        string or contains no such answer.
    """
    if not isinstance(text, str):
        return None

    # Capture up to the end of the answer line. No trailing newline is required,
    # so an output that stops right after the answer is still recognised, and the
    # capture can never run across lines.
    pattern = r"### Response:\n(option \d+:[^\n]*)"

    match = re.search(pattern, text)
    return match.group(1) if match else None

In [None]:
# Check the extraction on the decoded output at index 10.
print(extract_selected_response(results[10]))

In [None]:
# Extract the answer line from every decoded output; an entry is None when
# extract_selected_response found nothing.
responses = [extract_selected_response(decoded) for decoded in results]
responses[10:16]

In [None]:
# Wrap the extracted answers in a pandas Series (one entry per decoded output).
responses_df = pd.Series(list(responses))

# Show ten randomly chosen entries.
responses_df.sample(10)

In [None]:
# List the entries for which no answer was extracted.
responses_df[responses_df.isna()]

In [None]:
# Total number of entries, missing ones included.
len(responses_df)

In [None]:
# Replace missing answers with the placeholder "option 0:" so every entry has the
# "option <number>:" shape parsed by the later cells. Rebinding the name (rather
# than inplace=True) keeps the cell an explicit, chainable transformation.
responses_df = responses_df.fillna("option 0:")

In [None]:
# Count the entries whose string form starts with "option ".
responses_df.apply(lambda x: str(x).startswith("option ")).sum()

In [None]:
# Option number per entry: the first character of the second whitespace-separated
# token, converted to int (so only a single digit is read).
responses_df.str.split().apply(lambda x: int((x[1])[0]))

In [None]:
# Frequency of each parsed option number.
responses_df.str.split().apply(lambda x: int((x[1])[0])).value_counts()

In [None]:
# Zero-based index of the chosen option. The whole number after "option " is
# parsed (not only its first character) and shifted down by one, so an entry
# "option 0:" becomes -1.
selected_options = (
    responses_df.str.extract(r"^option (\d+):", expand=False).astype("int64") - 1
)

selected_options

In [None]:
# Submission table: one row per response with its index label and parsed answer.
# NOTE(review): Question_ID is taken from the Series index (row position), not
# from an ID stored in the question file - confirm this is what the submission expects.
df = pd.DataFrame(
    {
        "Question_ID": selected_options.index.to_numpy(),
        "Answer_ID": selected_options.to_numpy(),
    }
)
# Same constant label on every row.
df = df.assign(Task="Phi-3")

In [None]:
# Display the submission table.
df

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing the submission file.
submission_path = f"data/submission/{TASK_ID}-Phi_3_Model.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df.to_csv(submission_path, index=False)