In [None]:
!pip install -q gdown

In [None]:
# Working directory for everything this notebook reads or writes.
BASE_PATH = "./SE2024"

# Input: CSV holding the riddles to run inference on.
INFERENCE_DATASET_PATH = BASE_PATH + "/test_split_data.csv"

# Outputs: one JSON record per line. The first file is appended to while the
# inference loop runs; the backup is written once at the end of the loop.
OUTPUT_DATA_PATH = BASE_PATH + "/inference_data.jsonl"
OUTPUT_BACKUP_PATH = BASE_PATH + "/inference_data_backup.jsonl"

In [None]:
import os
# Make sure the working directory exists before anything is downloaded or written
# into it; exist_ok=True keeps this cell safe to re-run.
os.makedirs(BASE_PATH, exist_ok=True)

# Prepare dataset

In [None]:
import pandas as pd

def handle_missing_data(data_path, data_id):
    """Ensure the file at ``data_path`` exists, downloading it with gdown if needed.

    Args:
        data_path: Local path where the data file is expected.
        data_id: Identifier handed to the ``gdown`` command-line tool.

    Raises:
        RuntimeError: If ``gdown`` cannot be started, exits with a non-zero
            status, or finishes without creating ``data_path``.
    """
    import subprocess

    if os.path.exists(data_path):
        print('Data file already exists')
        return

    print("Data doesn't exist, start download from the google drive...")
    # Arguments are passed as a list (no shell), so paths or ids containing
    # spaces or shell metacharacters cannot be mis-parsed.
    command = ["gdown", str(data_id), "-O", str(data_path)]
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as exc:
        raise RuntimeError(
            "The 'gdown' executable was not found; install it before running this cell."
        ) from exc

    if completed.stdout:
        print(completed.stdout)

    # Fail here with a clear message instead of letting a later read of the
    # missing file raise a less informative error.
    if completed.returncode != 0 or not os.path.exists(data_path):
        raise RuntimeError(
            f"Download of '{data_id}' to '{data_path}' failed "
            f"(exit code {completed.returncode}):\n{completed.stderr}"
        )

In [None]:
# Identifier passed to gdown to fetch the inference split when it is not on disk.
INFERENCE_DATASET_DRIVE_ID = "1JcpBjTXv2OfaG6uYcIJO-Yk69nT9uN8i"

# Fetch the CSV if necessary, then load it into a DataFrame.
handle_missing_data(INFERENCE_DATASET_PATH, INFERENCE_DATASET_DRIVE_ID)
inference_data = pd.read_csv(INFERENCE_DATASET_PATH)

# Prompt Template

In [None]:
# Prompt sent to the model for every riddle. It is filled in with str.format and
# expects the fields: riddle, option_1, option_2, option_3. Option 4 is a fixed
# "none of the above" choice written directly into the template.
# A trailing backslash inside the literal joins that line with the next one.
prompt_template = """\
You are given a riddle and four options to choose the answer amongst them. \
A riddle is a question or statement intentionally phrased so as to require ingenuity in ascertaining its answer or meaning. \
Different ideas can be used in riddles to trick you:
    1. Riddles often employ misdirection, leading you away from the actual solution.
    2. They include elements with double meanings, requiring a keen eye for words with dual interpretations.
    3. Metaphorical wordplay adds another layer, urging you to decipher figurative language.
    4. Look out for exaggeration, as riddles may present overly dramatic details to divert your attention.
    5. Common phrases and sayings may hide within the puzzle, demanding familiarity.
    6. Associations and irony play a crucial role, introducing unexpected connections.
    7. Numerical puzzles can also be part of the mystery, requiring you to decode their significance.
    8. Elemental imagery, drawn from nature, might hold key descriptors.
    9. Rhyming and sound clues can add a poetic dimension.
    10. Avoid sexism and sex cliché, for example, gender bias for jobs, based on their positions or their outcome.
    11. Riddle may try to present something impossible or in contradiction with the reality. Just consider alternative perspectives.
Although a clever solution is required, avoid supernatural solutions and keep your answer within the limits of realistic imagination. \
For example, having superhuman abilities or unusual events or things are mostly a not preferred choice unless that is a better solution. \
Now which of the following options is the answer to the following riddle:

Riddle: "{riddle}"

Options:
Option 1: "{option_1}"
Option 2: "{option_2}"
Option 3: "{option_3}"
Option 4: "None of the above options are correct"


Let's think step by step. You need to solve the problem in two steps, \
in which in the first step, you consider each option with the riddle individually and thinking if it could be a feasible answer to the riddle; \
In the second step, you need to compare the options, considering context you provided in the previous step and choose the best one as the answer of the riddle, \
and announce the option you think is the best one in the format: 'Option 1' or 'Option 2' or 'Option 3' or 'Option 4':
"""

def get_prompt(question):
    """Fill ``prompt_template`` with one riddle and its first three options.

    Args:
        question: Mapping-like record indexable by 'QUESTION', 'OPTION 1',
            'OPTION 2' and 'OPTION 3'.

    Returns:
        The formatted prompt string.
    """
    template_fields = {
        'riddle': question['QUESTION'],
        'option_1': question['OPTION 1'],
        'option_2': question['OPTION 2'],
        'option_3': question['OPTION 3'],
    }
    return prompt_template.format(**template_fields)

# Prepare LLM

In [None]:
!pip install -q transformers accelerate torch
!pip install -q safetensors xformers langchain==0.1.6

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the model and the tokenizer.
model_id = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# The tokenizer and the weights are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# bfloat16 weights, with device placement delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Build the text-generation pipeline around the already-loaded model and tokenizer.
#
# The factory is called as `transformers.pipeline` on purpose: this cell binds the
# bare name `pipeline` to the resulting object, so calling the bare name here
# would only work the first time the cell runs and would fail on any re-run.
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    use_cache=True,
    device_map="auto",
    # Upper bound on total sequence length (prompt plus generated tokens).
    max_length=4000,
    # Sampling restricted to the 5 most likely tokens at each step; output is
    # therefore not deterministic between runs.
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The EOS token is reused as the padding token.
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
from langchain import HuggingFacePipeline

# Wrap the transformers pipeline so it can be called through LangChain's LLM
# interface (`llm.invoke(prompt)`).
# NOTE(review): the pipeline above is built with do_sample=True; it looks like
# model_kwargs is not forwarded to generation when a ready-made pipeline is
# passed in, so this temperature may have no effect — confirm.
llm_model_kwargs = {'temperature': 0.0}
llm = HuggingFacePipeline(
    pipeline=pipeline,
    model_kwargs=llm_model_kwargs,
)

# Inference Utils

In [None]:
import json

def _write_jsonl(data, address, mode):
    """Write each item of ``data`` to ``address`` as one JSON document per line."""
    with open(address, mode, encoding='utf-8') as jsonl_file:
        jsonl_file.writelines(json.dumps(item) + '\n' for item in data)


def save_inference(data, address):
    """Overwrite ``address`` with ``data`` in JSON Lines format.

    Args:
        data: Iterable of JSON-serialisable records.
        address: Path of the file to (re)create.
    """
    _write_jsonl(data, address, 'w')


def add_inference(data, address):
    """Append ``data`` to ``address`` in JSON Lines format, creating the file if needed.

    Args:
        data: Iterable of JSON-serialisable records.
        address: Path of the file to append to.
    """
    # Plain append mode is enough: the file is never read back through this handle.
    _write_jsonl(data, address, 'a')

# Execute Inference

In [None]:
from tqdm.auto import tqdm

# Resume support. Records are appended to OUTPUT_DATA_PATH as soon as they are
# produced, so a previous (possibly interrupted) run may already have written
# some of them. Reload those and skip the matching rows; otherwise re-running
# this cell would append duplicates and the count printed below would not
# match the file contents.
results = []
if os.path.exists(OUTPUT_DATA_PATH):
    with open(OUTPUT_DATA_PATH) as previous_output:
        results = [json.loads(line) for line in previous_output if line.strip()]

already_done = len(results)
if already_done > len(inference_data):
    raise ValueError(
        f"{OUTPUT_DATA_PATH} holds {already_done} records but the dataset has only "
        f"{len(inference_data)} rows; remove the stale file and re-run."
    )

itr = tqdm(inference_data.iterrows(), total=len(inference_data), desc="Processing")

for position, (index, ds) in enumerate(itr):
    if position < already_done:
        # Sanity check: the stored record must belong to the same row.
        if str(results[position].get("question")) != str(ds['QUESTION']):
            raise ValueError(
                f"Record {position} in {OUTPUT_DATA_PATH} does not match the dataset; "
                "remove the stale file and re-run."
            )
        continue

    prompt = get_prompt(ds)
    result = llm.invoke(prompt)

    data = {
        "question": ds['QUESTION'],
        'option 1': ds['OPTION 1'],
        'option 2': ds['OPTION 2'],
        'option 3': ds['OPTION 3'],
        'option 4': ds['OPTION 4'],
        'zephyr': result
    }
    # Persist immediately so an interruption loses at most the current row.
    add_inference([data], OUTPUT_DATA_PATH)
    results.append(data)

# Write the complete result set once more to a separate file as a backup.
save_inference(results, OUTPUT_BACKUP_PATH)

print(f"Dumped {len(results)} records to {OUTPUT_DATA_PATH}")