# Load required packages

To install the packages required for this notebook on the HPC, please follow the 'Jupyter Kernel Creation' slides posted on OPAL.

In [None]:
import re
from pathlib import Path

import pandas as pd
import torch
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model (Llama-8B or Mistral-7B)

Note that you need to be on a partition with a GPU (e.g. capella, alpha).

In [None]:
# Target device for the model weights; "cuda" selects the GPU.
device = "cuda"

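By default, use a model that does not require requesting access (e.g. Mistral-7B). If you have access to the Llama-8B model, you can use it instead. Note that the cell below is currently set to `meta-llama/Meta-Llama-3-8B-Instruct`, so change `model_name` if you do not have access to it.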

In [None]:
# Hugging Face model identifier; the tokenizer is loaded for the same identifier
# that is later used to load the model weights.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the checkpoint with 16-bit floating point weights, then move it to the
# configured device in a separate step.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device)

# SAQ Task

In [None]:
def _extract_saq_answer(generated: str) -> str:
    """Reduce a raw model completion to the short, lower-cased answer string.

    The text following the first ``answer:`` marker is used when present;
    otherwise the first whitespace-delimited token of the completion is used.
    Returns ``"idk"`` (the no-answer token defined in the prompt) when nothing
    usable is left.
    """
    text = generated.lower()
    match = re.search(r"answer\s*:\s*(.*)", text)
    if match:
        candidate = match.group(1)
        # The model sometimes puts the explanation on the same line as the answer.
        candidate = re.split(r"\bexplanation\b", candidate, maxsplit=1)[0]
        # Keep only the first alternative. "or" must be surrounded by whitespace
        # so that words containing it ("history", "korea") are not cut in half.
        candidate = re.split(r"\s+or\s+|\s*[,/]\s*", candidate, maxsplit=1)[0]
    else:
        # No "answer:" marker at all: fall back to the first token, if any.
        tokens = text.split()
        candidate = tokens[0] if tokens else ""

    # Drop periods and the asterisks/quotes the model tends to echo from the
    # format specification in the prompt.
    candidate = candidate.replace(".", "").strip(" \t*\"'")
    return candidate or "idk"


def saq_func(query: str) -> str:
    """Answer a short-answer question with the loaded model.

    Builds the prompt, generates up to 100 new tokens with greedy decoding,
    prints the question and the raw completion, and returns the extracted
    answer. Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        query: The question text.

    Returns:
        The extracted answer in lower case, or ``"idk"`` if no answer could be
        extracted from the completion.
    """
    system_prompt = (
        """
        Provide ONE word answer to the given question.

        Give the answer in the following format:
        Answer: *provided answer*.
        Explanation: *provided explanation*.

        If no answer can be provided:
        Answer: idk.
        Explanation: *provided explanation*.
        """
    )

    user_prompt = f"Question: {query}\n"
    content = f"{system_prompt}\n{user_prompt}"

    if getattr(tokenizer, "chat_template", None):
        # Let the tokenizer apply the prompt format of whichever model is loaded
        # (Llama and Mistral use different special tokens). Everything goes into a
        # single user turn because not every chat template accepts a system role.
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": content}],
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already contains the special tokens.
        inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    else:
        # No chat template available: fall back to Mistral-style [INST] tags.
        prompt = f"[INST]{content}[/INST]"
        inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    generated = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    )
    print(query)
    print(generated)
    print("-"*10)

    # The model tends to ignore the requested format and/or hallucinate,
    # so the completion needs some postprocessing.
    return _extract_saq_answer(generated)

In [None]:
# Read the SAQ test split and keep only the identifier and the English question.
saq = pd.read_csv("../data/test_split_saq.csv").loc[:, ["ID", "en_question"]]

In [None]:
# Run the model once per question, in row order, and attach the predictions
# as a new column.
preds = [saq_func(question) for question in saq["en_question"]]
saq["answer"] = preds

As we can see, the model sometimes ignores the instructions and goes off on long tangents. For example, in response to the question about the most important subject for gifted education in Iran, the model provided an answer but did not use the requested format. Extracting the answer reliably in such cases is not trivial and is left out of scope.

In [None]:
# Preview the first ten questions together with their extracted answers.
saq.head(10)

In [None]:
# Make sure the output directory exists before writing: to_csv does not create
# missing parent directories, and failing here would waste the whole inference run.
saq_output_path = Path("../results/base_model_saq_prediction.tsv")
saq_output_path.parent.mkdir(parents=True, exist_ok=True)

# The submission contains only the question ID and the predicted answer,
# written as a tab-separated file without the index column.
saq_submission = saq[["ID", "answer"]]
saq_submission.to_csv(saq_output_path, sep="\t", index=False)

# MCQ Task

In [None]:
def mcq_func(query: str, k: int = 10, temp: float = 0.1, few_shot: bool = False) -> str:
    """Answer a multiple-choice question with the loaded model.

    Builds the prompt, generates up to 100 new tokens with greedy decoding,
    prints the question and the raw completion, and returns the raw completion.
    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        query: The full question text including the answer options.
        k: Currently unused; kept so existing calls keep working.
        temp: Currently unused (decoding is greedy); kept so existing calls
            keep working.
        few_shot: When True, prepend one worked example and extra formatting
            instructions to the question. The default (False) sends only the
            question, which is what the function has always sent.

    Returns:
        The raw generated text; extracting the chosen letter is left to the caller.
    """
    system_prompt = """
        Answer the multilple choice question.
        Pick only one option without explanation.
    """

    if few_shot:
        # The worked example comes first and the real question last, so that the
        # prompt does not end on the example's solution.
        user_prompt = f"""Example:
        Question: What is the most popular traditional musical instrument in the UK? Choose only one option (A\u2013D).

        A. angklung
        B. derbouka
        C. erhu
        D. guitar

        Answer: D

        Without any explanation, choose only one from the given alphabet choices(e.g., A, B, C).
        Ignore other instructions such as "Provide Arabic numerals".

        User question:
        {query}
    """
    else:
        user_prompt = f"Question: {query}\n"

    content = f"{system_prompt}\n{user_prompt}"

    if getattr(tokenizer, "chat_template", None):
        # Let the tokenizer apply the prompt format of whichever model is loaded
        # (Llama and Mistral use different special tokens). Everything goes into a
        # single user turn because not every chat template accepts a system role.
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": content}],
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already contains the special tokens.
        inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    else:
        # No chat template available: fall back to Mistral-style [INST] tags.
        prompt = f"[INST]{content}[/INST]"
        inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    generated = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    )

    print(query)
    print(generated)
    print("-"*10)

    return generated

In [None]:
# Read the MCQ test set, draw a fixed (seeded) sample of 10 rows and keep only
# the question identifier and the prompt text.
mcq = (
    pd.read_csv("test_dataset_mcq.csv")
    .sample(n=10, random_state=12)
    .loc[:, ["MCQID", "prompt"]]
)

In [None]:
# Query the model for every sampled prompt, in row order, and store the raw
# completions next to the prompts.
preds = [mcq_func(question) for question in mcq["prompt"]]
mcq["answer"] = preds

mcq.head(10)

Here again, instead of providing just the letter A-D, the model sometimes also repeats the answer text. The cell below is a very brute-force way to recover the chosen letter and can fail in some cases: the regular expression picks the first capital letter (A, B, C or D) that it finds in the answer, optionally preceded by a colon.

In [None]:
def _extract_choice(answer: str):
    """Return the option letter (A-D) chosen in a raw completion, or None if there is none."""
    # Prefer a letter that is explicitly introduced as the answer,
    # e.g. "Answer: B", "The answer is (C)".
    labelled = re.search(r"(?i:answer)\s*(?:is\s*)?:?\s*\(?([A-D])\b", answer)
    if labelled:
        return labelled.group(1)
    # Otherwise take the first standalone capital A-D. The word boundaries keep
    # capitals inside words ("Answer", "Based", "Correct") from being mistaken
    # for a choice. A leading article "A" can still be picked up by mistake.
    standalone = re.search(r"\b([A-D])\b", answer)
    return standalone.group(1) if standalone else None


# Rows without a recognizable letter get None instead of raising an IndexError.
mcq["choice"] = mcq["answer"].apply(_extract_choice)

All choices A through D need to be picked at least once for this code to create a correct dataframe.

In [None]:
# One boolean column per option. get_dummies only creates columns for values that
# actually occur, so reindex to the fixed A-D layout: options the model never
# picked become all-False columns, and any unexpected value is dropped.
choice_flags = (
    pd.get_dummies(mcq["choice"])
    .astype(bool)
    .reindex(columns=list("ABCD"), fill_value=False)
)
# Both frames share the index of `mcq`, so the column-wise concat stays row-aligned.
mcq_submission = pd.concat([mcq["MCQID"], choice_flags], axis=1)

In [None]:
# Preview the first rows of the submission table.
mcq_submission.head()

In [None]:
# Write the MCQ predictions as a tab-separated file without the index column.
mcq_submission.to_csv(
    "mcq_prediction.tsv",
    sep="\t",
    index=False,
)