In [None]:
!pip install -q gdown

In [None]:
# Root directory for every file this notebook reads or writes.
BASE_PATH = "./SE2024"

# CSV that the inference loop iterates over.
INFERENCE_DATASET_PATH = BASE_PATH + "/test_split_data.csv"

# JSONL outputs: the first is appended to after every processed record,
# the second is written once after the whole loop has finished.
OUTPUT_DATA_PATH = BASE_PATH + "/inference_data.jsonl"
OUTPUT_BACKUP_PATH = BASE_PATH + "/inference_data_backup.jsonl"

In [None]:
import os
# Create the working directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(BASE_PATH, exist_ok=True)

# Prepare dataset

In [None]:
import pandas as pd

def handle_missing_data(data_path, data_id):
    """Make sure the file at ``data_path`` exists, downloading it with ``gdown`` if not.

    Args:
        data_path: Local path where the data file is expected.
        data_id: Identifier handed to the ``gdown`` command-line tool.

    Raises:
        FileNotFoundError: If the ``gdown`` executable cannot be found.
        subprocess.CalledProcessError: If ``gdown`` exits with a non-zero status.
    """
    # Local import: only this function needs it, and it keeps the cell self-contained.
    import subprocess

    if os.path.exists(data_path):
        print('Data file already exists')
        return

    print("Data doesn't exist, start download from the google drive...")
    # An argument list (no shell) keeps paths with spaces or shell metacharacters intact,
    # and check=True surfaces a failed download here instead of at the later read.
    subprocess.run(["gdown", str(data_id), "-O", str(data_path)], check=True)

In [None]:
# Identifier of the inference CSV that gdown downloads when the file is not present locally.
INFERENCE_DATASET_DRIVE_ID = "1JcpBjTXv2OfaG6uYcIJO-Yk69nT9uN8i"

handle_missing_data(INFERENCE_DATASET_PATH, INFERENCE_DATASET_DRIVE_ID)
inference_data = pd.read_csv(INFERENCE_DATASET_PATH)

# Prompt Template

In [None]:
# Prompt asking the model to explain how a question could lead to one given answer option.
# It is filled with str.format through the named fields {question} and {option}.
prompt_template = """\
Your task is to generate a descriptive explanation from a question to an answer option. \
In the following, a question and an option as the answer to the question is provided. \
The answer might be or not be a correct answer. \
Write a descriptive explanation in at most one paragraph and 200 words to show that path from question to the answer.

Question: "{question}"
Answer Option: "{option}"
"""

def get_prompt(question):
    """Build one explanation prompt per candidate option of a question record.

    Args:
        question: Mapping-like record with the keys 'QUESTION', 'OPTION 1',
            'OPTION 2' and 'OPTION 3'.

    Returns:
        A list of three prompt strings, ordered as option 1, option 2, option 3.
    """
    option_columns = ('OPTION 1', 'OPTION 2', 'OPTION 3')
    return [
        prompt_template.format(question=question['QUESTION'], option=question[column])
        for column in option_columns
    ]

In [None]:
# Prompt for the final decision step: it shows the riddle, the three candidate options with
# their generated contexts, and a fixed fourth option "None of the above options.".
# The placeholder names contain spaces ({option 1}, {context 1}, ...), so they can only be
# supplied by unpacking a mapping: decision_template.format(**mapping).
# NOTE(review): the line ending in "seem logical," has no trailing backslash, so a literal
# newline is embedded at that point of the prompt — confirm this is intended.
decision_template = """ \
You are given a riddle and four options to choose the answer amongst them. \
The fourth option is "None of the above options". \
Your final task is choosing the best option that is related to the riddle. \
For the first three options, you are given a context that explains a path between the question and the answer. \
Although these contexts may try to say their option is true, you should compare all the options based on the question \
and options' context to choose the one that has the most logical answer. If none of them seem logical, 
choose the fourth option: "None of the above options." \
Now, consider the riddle below and the context provided for you, and tell me which option is \
the best answer to the riddle due to the context. \

Riddle: "{question}"

Options:
Option 1: "{option 1}"
Option 2: "{option 2}"
Option 3: "{option 3}"
Option 4: "None of the above options."

Contexts:
Context about option 1: "{context 1}"
Context about option 2: "{context 2}"
Context about option 3: "{context 3}"

To answer this riddle, you should exactly mention one option, \
so announce the option you think is the best one in the format: 'Option 1' or 'Option 2' or 'Option 3' or 'Option 4':
"""

In [None]:
import numpy as np

def extract_answer(result: str, default: int = 0) -> int:
    """Pick the option that the model's reply mentions last.

    The reply is searched for the literal strings "Option 1" to "Option 4"; the option
    whose final occurrence sits furthest into the text is taken as the answer.

    Args:
        result: Raw text produced by the model.
        default: Value returned when none of the four option strings occurs in
            ``result``. It defaults to 0, so an unparseable reply yields the same
            value as "Option 1" unless the caller passes a sentinel such as -1.

    Returns:
        A built-in ``int`` holding the zero-based option index (0 for "Option 1" up to
        3 for "Option 4"), or ``default`` when no option is mentioned. A plain ``int``
        is returned so the value can be passed straight to ``json.dumps``.
    """
    last_positions = [result.rfind(f"Option {number}") for number in range(1, 5)]
    furthest = max(last_positions)
    if furthest < 0:
        # str.rfind gave -1 for every option: the reply names none of them.
        return default
    return last_positions.index(furthest)

# Prepare LLM

In [None]:
!pip install -q transformers accelerate torch
!pip install -q safetensors xformers langchain

In [None]:
# Checkpoint identifier passed to from_pretrained for both the model and the tokenizer.
model_id = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# The tokenizer and the weights are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loading options: automatic device placement and bfloat16 weights.
model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

In [None]:
# This cell binds the name `pipeline` to the constructed pipeline object, which hides the
# `pipeline` factory imported from transformers. Importing the factory under an alias here
# keeps the cell re-runnable: a second execution no longer calls the previous pipeline
# object instead of the factory.
from transformers import pipeline as build_pipeline

# Text-generation pipeline around the already loaded model and tokenizer.
pipeline = build_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    use_cache=True,
    device_map="auto",
    max_length=4000,
    # Sampling is enabled and restricted to the 5 most likely tokens at each step.
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    # The end-of-sequence token doubles as the padding token.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
from langchain import HuggingFacePipeline

# Wrap the transformers pipeline in LangChain's LLM interface; the inference loop calls it via llm.invoke.
# NOTE(review): the pipeline is built with do_sample=True / top_k=5, while temperature=0.0 is passed
# here through model_kwargs — confirm which decoding settings actually take effect.
llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': 0.0})

# Inference Utils

In [None]:
import json

def save_inference(data, address):
    """Write ``data`` to ``address`` as JSON Lines, replacing any existing content.

    Args:
        data: Iterable of JSON-serializable items; each one becomes one line.
        address: Path of the file to (over)write.
    """
    with open(address, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)
            
def add_inference(data, address):
    """Append ``data`` to ``address`` as JSON Lines, creating the file if needed.

    Args:
        data: Iterable of JSON-serializable items; each one is appended as one line.
        address: Path of the file to append to.
    """
    with open(address, 'a+') as sink:
        for record in data:
            # print() adds the newline that ends each JSON line.
            print(json.dumps(record), file=sink)

# Execute Inference

In [None]:
from tqdm.notebook import tqdm

# Two-step inference per dataset row:
#   1. generate one explanatory context for each of the first three options;
#   2. ask the model to choose an option given the riddle, the options and those contexts.
# Every finished record is appended to OUTPUT_DATA_PATH immediately, so partial progress
# is on disk if the run stops early; the full list is written to OUTPUT_BACKUP_PATH at the end.
results = []
itr = tqdm(inference_data.iterrows(), total=len(inference_data), desc="Processing")

for index, ds in itr:
    # These keys match the placeholder names used in decision_template.
    data = {
        "question": ds['QUESTION'],
        'option 1': ds['OPTION 1'],
        'option 2': ds['OPTION 2'],
        'option 3': ds['OPTION 3'],
    }

    # Step 1: one context per candidate option, stored as 'context 1' .. 'context 3'.
    context_prompts = get_prompt(ds)
    for i, context_prompt in enumerate(context_prompts, start=1):
        data[f'context {i}'] = llm.invoke(context_prompt).strip()

    # Step 2: final decision. 'option 4' is added only after formatting because the
    # template hard-codes its own text for the fourth option.
    result_prompt = decision_template.format(**data)
    data['option 4'] = ds['OPTION 4']
    result = llm.invoke(result_prompt).strip()

    # Coerce to a built-in int: a NumPy integer (as produced by np.argmax) is not JSON
    # serializable and would make json.dumps inside add_inference raise TypeError.
    data['zephyr'] = int(extract_answer(result))

    add_inference([data], OUTPUT_DATA_PATH)
    results.append(data)

save_inference(results, OUTPUT_BACKUP_PATH)

print(f"Dumped {len(results)} records to {OUTPUT_DATA_PATH}")