In [None]:
!pip install -q gdown

In [None]:
# Working directory that holds every file this notebook reads or writes.
BASE_PATH = "./SE2024"

# Inputs: the riddles to answer and the per-riddle few-shot examples.
INFERENCE_DATASET_PATH = BASE_PATH + "/test_split_data.csv"
SHOTS_DATASET_PATH = BASE_PATH + "/shots_data.jsonl"

# Outputs: the incrementally appended results and the copy written at the end.
OUTPUT_DATA_PATH = BASE_PATH + "/inference_data.jsonl"
OUTPUT_BACKUP_PATH = BASE_PATH + "/inference_data_backup.jsonl"

In [None]:
import os
# Create the working directory up front; exist_ok=True makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(BASE_PATH, exist_ok=True)

# Prepare datasets

In [None]:
import json

def read_jsonl(address):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        address: Path of the file to read; each non-blank line must hold one
            JSON document.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(address, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Tolerate blank lines (e.g. a stray trailing newline) instead of
            # failing inside json.loads.
            if not line:
                continue
            json_list.append(json.loads(line))
    return json_list

In [None]:
import pandas as pd

def handle_missing_data(data_path, data_id):
    """Make sure `data_path` exists, downloading it with gdown when it does not.

    Args:
        data_path: Local path where the file is expected / will be stored.
        data_id: Identifier handed to the `gdown` command line tool.

    Raises:
        FileNotFoundError: If the `gdown` executable cannot be found.
        RuntimeError: If gdown exits with a non-zero status or the file is
            still missing afterwards.

    Note:
        An existing file is never re-downloaded, whatever `data_id` is.
    """
    # Local import keeps this cell self-contained.
    import subprocess

    if os.path.exists(data_path):
        print('Data file already exists')
        return

    print("Data doesn't exist, start download from the google drive...")
    # Pass an argv list instead of interpolating into a shell line, so paths
    # with spaces or shell metacharacters are handled correctly.
    completed = subprocess.run(
        ["gdown", str(data_id), "-O", str(data_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    # Echo gdown's output into the notebook cell.
    print(completed.stdout)
    # Fail here with a clear message rather than later when the file is read.
    if completed.returncode != 0 or not os.path.exists(data_path):
        raise RuntimeError(
            f"gdown failed to download {data_id!r} to {data_path!r}"
        )

In [None]:
# Fetch the riddle CSV if it is not on disk yet, then load it into a DataFrame.
handle_missing_data(
    data_path=INFERENCE_DATASET_PATH,
    data_id="1JcpBjTXv2OfaG6uYcIJO-Yk69nT9uN8i",
)
inference_data = pd.read_csv(filepath_or_buffer=INFERENCE_DATASET_PATH)

In [None]:
# Pick exactly one few-shot variant. All variants download to the same
# SHOTS_DATASET_PATH and handle_missing_data skips the download when that file
# already exists, so delete the file before switching to another variant.
handle_missing_data(
    data_path=SHOTS_DATASET_PATH,
    data_id="1Byb7jvUfdmIWt39Wkmr69ygmUlGrEHEJ",  # 11-simple-relation
)
# Alternative variants:
# handle_missing_data(SHOTS_DATASET_PATH, "1mxzecUyXIbE891E2m5mvotLC4k7VSaUH") # 12-ranker-relation
# handle_missing_data(SHOTS_DATASET_PATH, "1skMfKvlN7pTKdqIKr16IZV0tawDaTGz_") # 11-simple-summarize
# handle_missing_data(SHOTS_DATASET_PATH, "1pGVJKDQSjqHpwqsivHonWneE918jXt31") # 12-ranker-summarize
shots_data = read_jsonl(address=SHOTS_DATASET_PATH)

# Prompt Template

In [None]:
# Prompt skeleton filled in by get_prompt(). Placeholders: {examples} (the
# few-shot block), {riddle} and {option_1}..{option_3}. Option 4 is fixed text
# in the template and is not taken from the dataset.
prompt_template = """\
<|Instruction|>
Decode a riddle from four options using everyday logic and creativity. \
Riddles may involve misdirection, double meanings, metaphorical wordplay, exaggeration, common phrases, associations, irony, numerical puzzles, and elemental imagery. \
Avoid gender bias. \
Keep solutions within realistic imagination and avoid supernatural elements.
I would provide you similar Riddle-Answer pair and the logic behind them.
Lets think step by step for each of the options and at the end, \
provide the best option in the format 'Option 1,' 'Option 2,' 'Option 3,' or 'Option 4.'

<|Samples|>
{examples}

<|Problem|>
Riddle: "{riddle}"
Option 1: "{option_1}"
Option 2: "{option_2}"
Option 3: "{option_3}"
Option 4: "None of the above options are correct"

Let's think step by step about each option, then at the end, choose the best and the most logical option:
"""

def get_prompt(question, samples):
    """Render the full prompt for one riddle.

    Args:
        question: Mapping with the keys 'QUESTION', 'OPTION 1', 'OPTION 2'
            and 'OPTION 3'.
        samples: Mapping of few-shot entries; only values whose key contains
            "shot" are used, in the mapping's iteration order.

    Returns:
        prompt_template with the selected samples (one per line) and the
        riddle fields substituted in.
    """
    # Each selected sample is terminated by its own newline.
    shot_lines = [text + "\n" for key, text in samples.items() if "shot" in key]
    return prompt_template.format(
        examples="".join(shot_lines),
        riddle=question['QUESTION'],
        option_1=question['OPTION 1'],
        option_2=question['OPTION 2'],
        option_3=question['OPTION 3'],
    )

# Prepare LLM

In [None]:
!pip install -q bitsandbytes transformers accelerate torch
!pip install -q safetensors xformers langchain

In [None]:
# Hugging Face Hub identifier of the checkpoint loaded below for both the
# model and its tokenizer.
model_id = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# Tokenizer and weights both come from the checkpoint named by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16 and let device_map="auto" decide their placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Import the factory under its own alias. The result is still stored in
# `pipeline` (later cells use that name), but assigning it no longer destroys
# the factory, so this cell can be re-run without a kernel restart.
from transformers import pipeline as build_pipeline

# Text-generation pipeline around the already loaded model and tokenizer.
# NOTE(review): do_sample=True with top_k=5 makes generation stochastic and no
# seed is set anywhere in this notebook - confirm that is intended.
pipeline = build_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    use_cache=True,
    device_map="auto",
    max_length=4000,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The EOS token is reused as the padding token.
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
from langchain import HuggingFacePipeline

# Wrap the transformers pipeline in LangChain's HuggingFacePipeline; the
# inference loop calls it through llm.invoke(prompt).
# NOTE(review): temperature=0.0 is supplied via model_kwargs while the pipeline
# itself was built with do_sample=True and top_k=5 - confirm which of these
# settings actually governs decoding.
llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': 0.0})

# Inference Utils

In [None]:
def save_inference(data, address):
    """Write `data` to `address` as JSON Lines, replacing any existing content.

    Args:
        data: Iterable of JSON-serialisable objects; each becomes one line.
        address: Path of the file to (over)write.
    """
    with open(address, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)
            
def add_inference(data, address):
    """Append `data` to `address` as JSON Lines, creating the file if needed.

    Args:
        data: Iterable of JSON-serialisable objects; each becomes one line.
        address: Path of the file to append to.
    """
    with open(address, 'a+') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

# Execute Inference

In [None]:
from tqdm.notebook import tqdm

# Resume support: add_inference appends every finished record to
# OUTPUT_DATA_PATH, so records already in that file are reloaded and the
# corresponding leading rows are skipped. Re-running this cell (after a crash
# or a kernel restart) therefore neither repeats finished generations nor
# appends duplicate lines. This assumes the file was produced, in order, from
# the same inference/shots data; delete it to force a fresh run.
results = read_jsonl(OUTPUT_DATA_PATH) if os.path.exists(OUTPUT_DATA_PATH) else []
n_done = len(results)

# zip() stops at the shorter input; report a mismatch instead of hiding it and
# size the progress bar by what will really be processed.
n_questions, n_shots = len(inference_data), len(shots_data)
n_total = min(n_questions, n_shots)
if n_questions != n_shots:
    print(
        f"Warning: {n_questions} questions but {n_shots} shot records; "
        f"only the first {n_total} pairs will be processed"
    )

# Question i is paired with shot record i (purely positional pairing).
pending = zip(inference_data.iloc[n_done:].iterrows(), shots_data[n_done:])
itr = tqdm(pending, total=max(n_total - n_done, 0), desc="Processing")

for (index, ds), shot in itr:
    prompt = get_prompt(ds, shot)
    result = llm.invoke(prompt)

    data = {
        "question": ds['QUESTION'],
        'option 1': ds['OPTION 1'],
        'option 2': ds['OPTION 2'],
        'option 3': ds['OPTION 3'],
        'option 4': ds['OPTION 4'],
        'zephyr': result
    }
    # Persist each record immediately so an interruption loses at most the
    # generation that was in flight.
    add_inference([data], OUTPUT_DATA_PATH)
    results.append(data)

# Write the complete result set once more to the backup file.
save_inference(results, OUTPUT_BACKUP_PATH)

print(f"Dumped {len(results)} records to {OUTPUT_DATA_PATH}")