In [2]:
import json
import random
import re
import os
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
from typing import Dict

  torch.utils._pytree._register_pytree_node(


In [29]:

LETTER_TO_INDEX = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

class Example:
    def __init__(self, question: str, choice1: str, choice2: str, choice3: str, choice4: str):
        self.question = question
        self.choice1 = choice1
        self.choice2 = choice2
        self.choice3 = choice3
        self.choice4 = choice4

# def _load_from_json(path: str) -> Dict:
#     """Load json from a file."""
#     with open(path, 'r') as f:
#         return json.load(f)

def _base_prompt(example: Example) -> str:
    """Creates a zero-shot prompt given a single example. Uses the prompt format from this paper on Scalable Oversight:
    https://arxiv.org/abs/2211.03540"""
    #prompt = f"What is the correct answer to this question: {example.question}"
    prompt = f"""You are a smart law assistant. You are given a question and four possible answers. Your task is to choose the most likely answer and respond with only the option letter (A, B, C, D): \nQuestion: {example.question}"""
    prompt += f"\n\nChoices:\n(A) {example.choice1}\n(B) {example.choice2}\n(C) {example.choice3}\n(D) {example.choice4}"
    return prompt

# def _generate_prompt_from_examples(json_data, with_explanations=True):
#     output = ""
#     for q in json_data["questions"]:
#         output += f'\nQuestion: {q["question"]}\nChoices:\n'
#         for choice, value in q["choices"].items():
#             output += f'({choice}) {value}\n'

#         if with_explanations:
#             output += f"Let's think step by step: \n{q['explanation']}\n"
#         output += f'The correct answer is ({q["correct_answer"]})\n'
#     return output

def zero_shot_chain_of_thought_prompt(example: Example, pipe) -> str: # no reasoning
    # """Creates a chain-of-thought prompt given a single example."""
    prompt = _base_prompt(example)
    prompt += "\nLet's think step by step: "
    cot_output = pipe(
                example.question,
                max_new_tokens=256,
                pad_token_id=pipe.tokenizer.eos_token_id,
                num_beams=5,
                early_stopping=True,
                eos_token_id=pipe.tokenizer.eos_token_id
            )
    cot_reasoning = cot_output[0]['generated_text']
    prompt += f"{cot_reasoning}\n\nBased on the above, what is the single, most likely answer choice? \
                Answer in the format \"The correct answer is (insert answer here)\"."
    return prompt

# def chain_of_thought_prompt(example: Example) -> str: # with reasoning 5 examples
#     """Creates a chain-of-thought prompt given a single example."""
#     prompt = f"Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.\n"
#     json_data = _load_from_json("chain_of_thought_examples.json")
#     prompt += _generate_prompt_from_examples(json_data, with_explanations=True)
#     prompt += "\nNow your turn.\n"
#     prompt += f"Question: {example.question}"
#     prompt += f"\nChoices:\n(A) {example.choice1}\n(B) {example.choice2}\n(C) {example.choice3}\n(D) {example.choice4}"
#     prompt += "\nGive step by step reasoning before you answer, and when you're ready to answer, please use the format \"The correct answer is (insert answer here)\":\n"
#     return prompt

# def five_shot_prompt(example: Example) -> str: #with 5 examples # withput reasoning
#     """Creates a chain-of-thought prompt given a single example."""
#     prompt = f"Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
#     json_data = _load_from_json("chain_of_thought_examples.json")
#     prompt += _generate_prompt_from_examples(json_data, with_explanations=False)

#     prompt += f"Question: {example.question}"
#     prompt += f"\nChoices:\n(A) {example.choice1}\n(B) {example.choice2}\n(C) {example.choice3}\n(D) {example.choice4}"
#     prompt += "\nWhen you're ready to answer, please use the format \"The correct answer is (insert answer here)."
#     return prompt

def zero_shot_prompt(example: Example) -> str: # just the question and the choices
    """Creates a zero-shot prompt given a single example."""
    prompt = _base_prompt(example)
    prompt += f"\n\nPlease reply only with the correct option, do not say anything else.\""
    return prompt

In [30]:
df = pd.read_csv("../dataset/cleaned_dataset.csv", encoding="utf-8")
df.head()

Unnamed: 0,Question,Answer
0,How do I apply for CPT?,You must get approval from your Designated Sch...
1,What documents are required for CPT?,Typically you need:\n- CPT Request Form from y...
2,How long does CPT processing take?,Processing usually takes 5-10 business days af...
3,Can I work before receiving CPT authorization?,No you must wait for approval and receive an u...
4,Do I need to pay any fees for CPT?,No CPT does not require a separate application...


In [31]:
# # delete empty rows
# df = df.dropna()

In [25]:
# # remove ',' from df[Answer]
# df['Answer'] = df['Answer'].str.replace(',', '')

In [26]:
# df.to_csv("../dataset/cleaned_dataset.csv", index=False, encoding="utf-8")

In [32]:
# Initialize the model pipeline
#pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')


def runner(n=1, prompt_type='zero_shot_prompt'):
   # model_name = "meta-llama/Llama-3.2-1B-Instruct"
    for i in tqdm(range(n), desc="Iterations"):
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
            try:
                question = row['Question'].strip()
                right_answer = row['Answer'].strip()
                # selecting distractions.
                option = []
                option.append(right_answer)
                while len(option) < 4:
                    distractor = df.sample(1)['Answer'].values[0].strip()
                    if distractor not in option and distractor != right_answer:
                        option.append(distractor)

                # print(f"Question: {question}")
                # print(f"Right Answer: {right_answer}")
                # print(f"Options: {option}")
                # print(len(option)) 
                # break

                # Create an example
                random.shuffle(option)
                example = Example(question, 
                                  option[0], 
                                  option[1], 
                                  option[2], 
                                  option[3]
                                )
                

                # Depending on prompt_type, generate the prompt using the integrated functions
                if prompt_type == 'zero_shot_prompt':
                    prompt = zero_shot_prompt(example)
                # elif prompt_type == 'chain_of_thought_prompt':
                #     prompt = chain_of_thought_prompt(example)
                # elif prompt_type == 'five_shot_prompt':
                #     prompt = five_shot_prompt(example)
                elif prompt_type == 'zero_shot_chain_of_thought':
                    pass
                   # prompt = zero_shot_chain_of_thought_prompt(example, pipe)
                else:
                    # Default behavior (original prompt)
                    prompt = zero_shot_prompt(example)
                    
                if index == 0:
                    print(prompt)
                    print("Correct Ans: ", right_answer)
                    break

                # Generate the model's response
                # response = pipe(
                #     prompt,
                #     max_new_tokens=256,
                #     pad_token_id=pipe.tokenizer.eos_token_id,
                #     num_beams=5,
                #     early_stopping=True,
                #     eos_token_id=pipe.tokenizer.eos_token_id
                # )



            except Exception as e:
                print(f"Error processing row {index}: {e}")
                continue    

In [33]:
runner(1, 'zero_shot_prompt')

Processing rows:   0%|          | 0/131 [00:00<?, ?it/s]
Iterations: 100%|██████████| 1/1 [00:00<00:00, 212.14it/s]

You are a smart law assistant. You are given a question and four possible answers. Your task is to choose the most likely answer and respond with only the option letter (A, B, C, D): 
Question: How do I apply for CPT?

Choices:
(A) No STEM OPT requires paid employment with an E-Verify employer.
(B) You must get approval from your Designated School Official (DSO) and submit a CPT request form along with an offer letter from your employer.
(C) Yes but all jobs must be related to your field of study.
(D) Processing usually takes 5-10 business days after submission to the DSO.

Please reply only with the correct option, do not say anything else."
Correct Ans:  You must get approval from your Designated School Official (DSO) and submit a CPT request form along with an offer letter from your employer.





In [12]:
df.head(1)

Unnamed: 0,Question,Answer
0,How do I apply for CPT?,You must get approval from your Designated Sch...
