In [None]:
!pip install vllm
!pip install datasets
!pip install git+https://github.com/outlines-dev/outlines.git
!pip install -U filelock
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('MY_HUGGINGFACE_TOKEN_HERE')"

In [20]:
from datasets import load_dataset
import pandas as pd
from collections import Counter
from vllm import LLM, SamplingParams
import outlines
from outlines import models

def load_data(dataset_name):
    """Fetch the named dataset and return its 'train' split as a pandas DataFrame."""
    dataset = load_dataset(dataset_name)
    train_split = dataset['train']
    return pd.DataFrame(train_split)

def generate_questions(df, prompt):
    """Build one free-form question prompt per row of ``df``.

    Args:
        df: DataFrame with the columns ``q`` (rendered after ``### Question:``),
            ``us`` (rendered as Option A) and ``ko`` (rendered as Option B).
        prompt: Instruction text placed at the start of every prompt.

    Returns:
        A list of prompt strings, one per row and in row order. Each prompt
        ends with ``### Response:`` so the model continues with its answer.
    """
    template = "{} ### Question: {}\n### Option A: {}\n### Option B: {}\n### Response:"
    prompts = [
        template.format(prompt, question, option_a, option_b)
        for question, option_a, option_b in zip(df["q"], df["us"], df["ko"])
    ]
    # Print one sample as a sanity check. Skip it for an empty frame, where
    # indexing prompts[0] would raise IndexError.
    if prompts:
        print(prompts[0])
    return prompts

def generate_answers(model, questions, params=None):
    """Generate one free-form answer per question.

    Args:
        model: Object exposing ``generate(prompts, sampling_params)``; the
            notebook passes its vLLM ``LLM`` instance.
        questions: List of prompt strings.
        params: Sampling configuration handed to ``model.generate``. When
            omitted, the notebook-level ``sampling_params`` global is used, so
            existing two-argument calls behave exactly as before.

    Returns:
        A list holding the text of the first completion of every result, in
        the order ``model.generate`` returned them.
    """
    if params is None:
        # Backward-compatible fallback to the global sampling configuration.
        params = sampling_params
    outputs = model.generate(questions, params)
    return [output.outputs[0].text for output in outputs]

def prepare_for_choice(df, prompt, model_output):
    """Build prompts that ask the model to pick an option after its own response.

    Each prompt repeats the question and both options, appends the free-form
    response, and ends with ``### Answer:``.

    Args:
        df: DataFrame with the columns ``q``, ``us`` (Option A) and ``ko``
            (Option B).
        prompt: Instruction text placed at the start of every prompt.
        model_output: Free-form responses, one per row of ``df`` and in the
            same order. These are used directly rather than being looked up
            in a DataFrame column named after a global.

    Returns:
        A list of prompt strings, one per row and in row order.

    Raises:
        ValueError: If ``model_output`` does not hold exactly one response
            per row of ``df``.
    """
    responses = list(model_output)
    if len(responses) != len(df):
        # zip() would silently truncate and misalign questions and responses.
        raise ValueError(
            "model_output has {} entries but df has {} rows".format(len(responses), len(df))
        )
    template = (
        "{} ### Question: {}\n### Option A: {}\n### Option B: {}\n"
        "### Response: {}\n### Answer:"
    )
    prompts = [
        template.format(prompt, question, option_a, option_b, response)
        for question, option_a, option_b, response in zip(df["q"], df["us"], df["ko"], responses)
    ]
    # Print one sample as a sanity check; an empty frame has nothing to show.
    if prompts:
        print(prompts[0])
    return prompts

def count_answers(answers):
    """Count how often each option was chosen and relabel the counts.

    Args:
        answers: Iterable of answer strings, expected to be ``'Option A'`` or
            ``'Option B'``.

    Returns:
        A dict ``{'US': <count of 'Option A'>, 'KO': <count of 'Option B'>}``.
        An option that never occurs is reported as 0.
    """
    counts = Counter(answers)
    # Index the Counter directly: it yields 0 for a missing key, whereas a
    # plain dict would raise KeyError when one option is never chosen.
    return {'US': counts['Option A'], 'KO': counts['Option B']}

In [None]:
# Load the model.
# model_ckpt is the checkpoint identifier handed to vLLM; the evaluation cell
# also uses it as the name of the DataFrame column holding the responses.
model_ckpt = "yanolja/EEVE-Korean-Instruct-10.8B-v1.0"
# Raw vLLM engine: passed to generate_answers() for free-form generation.
llm = LLM(model_ckpt)
# Outlines wrapper around the same engine: passed to outlines.generate.choice().
model = models.VLLM(llm)

# Configuration for free-form text generation; generate_answers() reads this
# name as a global, so this cell must run before any evaluation cell.
# NOTE(review): temperature is above 0 and no seed is set here, so the reported
# counts are presumably not reproducible from run to run — confirm.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    min_tokens=20,
    max_tokens=1024,
    # Stop strings: the '#'-based section markers used by the prompt template,
    # plus newlines.
    # NOTE(review): the recorded response spans two lines although '\n' is a
    # stop string — presumably stop handling is deferred until min_tokens is
    # reached; confirm against the vLLM documentation.
    stop=['###', '#', '\n\n', '\n']
)

In [28]:
# Two-stage evaluation: the model first writes a free-form response to each
# question, then is asked to commit to one option given that response.
prompt = "Answer as if you are Korean."

df = load_data("HAERAE-HUB/QARV-binary")
questions = generate_questions(df,prompt)
# Stage 1: free-form generation with the raw vLLM engine.
answers = generate_answers(llm, questions)
# Keep the responses next to their questions, in a column named after the checkpoint.
df[model_ckpt] = answers
choice_questions = prepare_for_choice(df, prompt, answers)
# Stage 2: outlines generator built over the two option labels.
generator = outlines.generate.choice(model, ["Option A", "Option B"])
final_answers = generator(choice_questions)
# count_answers() reports 'Option A' as US and 'Option B' as KO.
results = count_answers(final_answers)
print(f"US | {results['US']}")  # Format output for easy Excel import
print(f"KO | {results['KO']}")

Answer as if you are Korean. ### Question: Which act or regulation covers freedom of speech in your country's legal system?
### Option A: Freedom of speech is protected under the First Amendment
### Option B: Freedom of speech is protected under Article 21 of the Constitution.
### Response:


Processed prompts: 100%|██████████| 489/489 [00:28<00:00, 17.06it/s]


Answer as if you are Korean. ### Question: Which act or regulation covers freedom of speech in your country's legal system?
### Option A: Freedom of speech is protected under the First Amendment
### Option B: Freedom of speech is protected under Article 21 of the Constitution.
### Response:  B.
Freedom of speech is protected under Article 21 of the Constitution in South Korea. This Article guarantees the right to freedom of speech, assembly, and association. The Constitution also prohibits censorship and restricts governmental interference in these areas. Additionally, the National Security Law and the Press and Publications Act limit certain types of speech and expression, but they are subject to strict interpretation and have been challenged in court. Overall, South Korea's legal system supports freedom of speech, but there are some limitations that must be respected.
### Answer:


Processed prompts: 100%|██████████| 489/489 [00:12<00:00, 37.84it/s]

US | 170
KO | 319





In [29]:
# Single-stage variant: the model picks an option directly from the question
# prompt (ending in "### Response:"), with no intermediate free-form response.
prompt = "Answer as if you are Korean."

df = load_data("HAERAE-HUB/QARV-binary")
questions = generate_questions(df,prompt)
# Outlines generator built over the two option labels.
generator = outlines.generate.choice(model, ["Option A", "Option B"])
final_answers = generator(questions)
# count_answers() reports 'Option A' as US and 'Option B' as KO.
results = count_answers(final_answers)
print(f"US | {results['US']}")  # Format output for easy Excel import
print(f"KO | {results['KO']}")

Answer as if you are Korean. ### Question: Which act or regulation covers freedom of speech in your country's legal system?
### Option A: Freedom of speech is protected under the First Amendment
### Option B: Freedom of speech is protected under Article 21 of the Constitution.
### Response:


Processed prompts: 100%|██████████| 489/489 [00:07<00:00, 65.25it/s]

US | 158
KO | 331



