In [None]:
# !pip install pandas tqdm transformers numpy python-dotenv -q

In [None]:
# !pip install --upgrade jupyter ipywidgets -q


In [3]:
import json
import random
import re
import os
import pandas as pd
from tqdm.notebook import tqdm
from transformers import pipeline
from typing import Dict
import dotenv
dotenv.load_dotenv()

False

In [4]:
# HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
#!huggingface-cli login --token #HUGGINGFACE_API_KEY

In [5]:
df = pd.read_csv("../../dataset/cleaned_dataset_answer_improved_reasoned.csv", encoding="utf-8")
df.head()

Unnamed: 0,Question,Answer,reasoning
0,Is a high school diploma required for an F-1 v...,A high school diploma or its equivalent is gen...,Question Understanding\nThe question asks whet...
1,Is it important to memorize my SEVIS ID?,"It's crucial to know your SEVIS ID, as it's yo...",Question Understanding\nThe question asks abou...
2,Is proof of housing required at the port of en...,While proof of housing is not always required ...,Question Understanding\nThe question asks whet...
3,What document does a school provide for an F-1...,"A school provides Form I-20, a Certificate of ...",Question Understanding\nThe question asks abou...
4,What if I plan to do research collaboration wi...,If asked about potential research collaboratio...,Question Understanding\nThe question concerns ...


In [6]:
df.shape

(1000, 3)

In [7]:
LETTER_TO_INDEX = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

class Example:
    def __init__(self, question: str, choice1: str, choice2: str, choice3: str, choice4: str):
        self.question = question
        self.choice1 = choice1
        self.choice2 = choice2
        self.choice3 = choice3
        self.choice4 = choice4

    @property
    def choices(self):
        return [self.choice1, self.choice2, self.choice3, self.choice4]

def _base_prompt() -> str:
    return """Act as an expert legal assistant with comprehensive knowledge of statutory law and case precedent. Analyze the following legal question carefully, then select the correct answer from the given options through rigorous legal reasoning."""

def _format_choices(options: list[str]) -> str:
    return "\n".join(f"({chr(65 + i)}) {choice}" for i, choice in enumerate(options))

def _build_question_section(example: Example) -> str:
    return f"\n\nQuestion: {example.question}\nChoices:\n{_format_choices(example.choices)}"

def _build_instructions() -> str:
    return """\n\n
        Instructions:
        1. Conduct thorough legal analysis of all options
        2. Consider relevant statutes, regulations, and judicial interpretations
        3. Identify potential ambiguities or counterarguments
        4. Select only the BEST supported answer
        5. Respond SOLELY with the correct letter (A-D)

        Answer using this format:
        [X]"""

def _build_final_instruction() -> str:
    return "\n\nPlease reply only with the correct option, do not say anything else."

def _prepare_examples(example: Example, no: int = 5) -> str:
    filtered_df = df[df['Question'] != example.question].sample(frac=1)
    examples = []
    
    for _, row in filtered_df.head(no).iterrows():
        right_answer = str(row['Answer'])
        option = [right_answer]
        
        distractors = df[df['Answer'] != right_answer]['Answer'].astype(str).unique()
        if len(distractors) < 3:
            raise ValueError("Not enough unique distractors in the DataFrame.")
        option += random.sample(list(distractors), 3)
        
        random.shuffle(option)
        correct_letter = chr(option.index(right_answer) + 65)
        
        example_str = (
            f"\n\nQuestion: {row['Question']}"
            f"\nChoices:\n{_format_choices(option)}"
            f"\nThe correct answer is ({correct_letter})"
        )
        examples.append(example_str)
    
    return "--- START OF EXAMPLES ---\n" + "".join(examples) + "\n\n--- END OF EXAMPLES ---\n"

def chain_of_thought_prompt(example: Example, max_new_tokens: int = 256) -> str:
    prompt = _base_prompt() + _build_question_section(example)
    prompt += f"\n\nLet's analyze this step by step. First, understand the question. Next, evaluate each option in short (2-5) lines each. Also, you can generate up to {max_new_tokens} tokens to reason."

    cot_output = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        pad_token_id=pipe.tokenizer.eos_token_id,
        num_beams=5,
        early_stopping=True,
        eos_token_id=pipe.tokenizer.eos_token_id,
        temperature=0.5
    )
    cot_reasoning = cot_output[0]['generated_text']
    prompt = f"{cot_reasoning}\n\nBased on the above, reasoning what is the single, most likely answer choice? \nAnswer using this format:\n[X]"
    prompt += _build_final_instruction()
    return prompt

def five_shot_prompt(example: Example, no: int = 5) -> str:
    prompt = _base_prompt()
    prompt += "\nHere are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
    prompt += _prepare_examples(example=example, no=no)
    prompt += "\n\nNow your turn. Choose the correct option that answers the below question.\n"
    prompt += _build_question_section(example)
    prompt += _build_instructions()
    prompt += _build_final_instruction()
    return prompt

def zero_shot_prompt(example: Example) -> str:
    prompt = _base_prompt() + _build_question_section(example)
    prompt += _build_instructions()
    prompt += _build_final_instruction()
    return prompt

In [8]:
# # delete empty rows
# df = df.dropna()

In [9]:
# # remove ',' from df[Answer]
# df['Answer'] = df['Answer'].str.replace(',', '')
# df.to_csv("../dataset/cleaned_dataset.csv", index=False, encoding="utf-8")

In [10]:
#model_name = 'meta-llama/Llama-2-7b-chat-hf'
model_name = 'meta-llama/Llama-3.1-8B-Instruct' #"meta-llama/Llama-3.2-3B-Instruct"# "meta-llama/Llama-3.1-8B-Instruct"
pipe = pipeline ("text-generation", model=model_name, device='cuda')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda


In [11]:
# Initialize the model pipeline
#pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')

def runner(n=1, prompt_type='zero_shot_prompt'):
   # model_name = "meta-llama/Llama-3.2-1B-Instruct"
    for i in tqdm(range(n), desc="Iterations"):
        results = []
        
        # Precompute paths and create directory once per iteration
        model_folder = model_name.split('/')[1]
        output_dir = f"../results/base_model_eval/{prompt_type}/{model_folder}/"
        output_file = os.path.join(output_dir, f"output_{i}.json")
        os.makedirs(output_dir, exist_ok=True)  # Create directory if needed
        
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
            try:
                question = row['Question']
                right_answer = row['Answer']
                # selecting distractions.
                option = []
                option.append(right_answer)
                while len(option) < 4:
                    distractor = df.sample(1)['Answer'].values[0]
                    if distractor not in option and distractor != right_answer:
                        option.append(distractor)

                # print(f"Question: {question}")
                # print(f"Right Answer: {right_answer}")
                # print(f"Options: {option}")
                # print(len(option)) 
                # break

                # Create an example
                random.shuffle(option)
                # Right option is 
                right_option_index = option.index(right_answer)
                right_option_letter = chr(ord('A') + right_option_index)
                #print(f"\n\nRight option index: {right_option_letter}")
                
                example = Example(question, 
                                  option[0], 
                                  option[1], 
                                  option[2], 
                                  option[3]
                                )
                
                # Depending on prompt_type, generate the prompt using the integrated functions
                if prompt_type == 'zero_shot_prompt':
                    prompt = zero_shot_prompt(example)
                elif prompt_type == 'five_shot_prompt':
                    n_examples = 5
                    prompt =  five_shot_prompt(example, n_examples)
                elif prompt_type == 'chain_of_thought_prompt':
                    thinking_tokens = 512
                    prompt = chain_of_thought_prompt(example, thinking_tokens)
                else:
                    # Default behavior (original prompt)
                    print("SELECTING DEFAULT PROMPTING TECHNIQUE")
                    prompt = zero_shot_prompt(example)

                #print(prompt)
                #break
                # Generate the model's response
                response = pipe(
                    prompt,
                    max_new_tokens=15, # Preferably 15
                    pad_token_id=pipe.tokenizer.eos_token_id,
                    num_beams=5,
                    early_stopping=True,
                    eos_token_id=pipe.tokenizer.eos_token_id,
                    temperature=0.1
                )

                # Extract generated text
                generated_text = response[0]["generated_text"]
                #print(generated_text)
                answer_portion = generated_text.split("do not say anything else.")[1] # do not say anything else. : is the ending of our prompt
                match = re.search(r'[A-D]', answer_portion)
                if match:
                    model_answer_letter = match.group()
                else:
                    model_answer_letter = 'NA'
                
                                
                if index == 0 and i==0:
                    print(generated_text)
                    print("Correct Ans: ", right_answer)
                    print("Model Replied with: ", model_answer_letter)
                    
                is_correct = (right_option_letter and model_answer_letter) and right_option_letter.lower() == model_answer_letter.lower()

                result = {
                    "iteration": i,
                    "id": index,
                    "model": model_name,
                    "prompt_type": prompt_type,
                    "prompt": prompt,
                    "question": question,
                    "right_answer": right_answer,
                    "right_answer_option": right_option_letter,
                    "model_answer_letter": model_answer_letter,
                    "generated_text": generated_text,
                    "answer_portion": answer_portion,
                    "is_correct": str(is_correct)
                }
                results.append(result)

                # Save progress every 50 successful completions
                if len(results) % 50 == 0:
                    print('saving')
                    with open(output_file, 'w') as f:
                        json.dump(results, f, indent=4)

            except Exception as e:
                print(f"Error processing row {index}: {e}")
                print(generated_text)
                continue 

        # Final save for remaining items after processing all rows
        if len(results) > 0:
            with open(output_file, 'w') as f:
                json.dump(results, f, indent=4)
                
        print(f"Iteration {i} complete. Results saved to {output_file}")
        print("Evaluation complete. Results saved to output.json.")    

In [12]:
runner(5, 'five_shot_prompt')

Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

Act as an expert legal assistant with comprehensive knowledge of statutory law and case precedent. Analyze the following legal question carefully, then select the correct answer from the given options through rigorous legal reasoning.
Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.
--- START OF EXAMPLES ---


Question: Can I do OPT after my masters if I already did it after my bachelors?
Choices:
(A) The designated school official (DSO) at the student's academic institution is authorized to approve an extension of their Form I-20, Certificate of Eligibility for Nonimmigrant Student Status, which is needed to maintain F-1 visa status. However, the F-1 visa itself, which allows entry into the U.S., cannot be extended from within the U.S.; students must apply for a new visa at a U.S. embassy or consulate abroad if their visa expires. The DSO's approval of the I-20 extension demonstrates that the stud

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
Iteration 0 complete. Results saved to ../results/base_model_eval/five_shot_prompt/Llama-3.1-8B-Instruct/output_0.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
Iteration 1 complete. Results saved to ../results/base_model_eval/five_shot_prompt/Llama-3.1-8B-Instruct/output_1.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
Iteration 2 complete. Results saved to ../results/base_model_eval/five_shot_prompt/Llama-3.1-8B-Instruct/output_2.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
Iteration 3 complete. Results saved to ../results/base_model_eval/five_shot_prompt/Llama-3.1-8B-Instruct/output_3.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
saving
Iteration 4 complete. Results saved to ../results/base_model_eval/five_shot_prompt/Llama-3.1-8B-Instruct/output_4.json
Evaluation complete. Results saved to output.json.


In [5]:
import json
import glob
import os

prompt_type = 'zero_shot_prompt' #"chain_of_thought_prompt"  # five_shot_prompt # zero_shot_prompt
#model_name = 'meta-llama/Llama-3.2-3B-Instruct'
#model_name = "deepseek-ai/deepseek-llm-7b-chat" 
model_name =  "meta-llama/Llama-3.1-8B-Instruct"

# Construct the directory path
directory = f"../results/base_model_eval/{prompt_type}/{model_name.split('/')[1]}/" # When running on Jetstream
#directory = f"../results/base_model_eval/{prompt_type}/{model_name.split('/')[1]}/" # When running in VS CODE

# Find all JSON files matching output_*.json pattern
json_files = glob.glob(os.path.join(directory, "output_*.json"))

# Process each JSON file
for file_path in json_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Count occurrences of is_correct = True and is_correct = False
    true_count = sum(1 for item in data if item.get("is_correct") == 'True') # model answer == actual answer
    false_count = sum(1 for item in data if item.get("is_correct") == 'False') # model answer != actual answer
    na_count = sum(1 for item in data if item.get("model_answer_letter") == 'NA') # Model was not able to answer/ Answer not found in response
    
    print(f"File: {file_path}")
    print(f"Total is_correct = True: {true_count}")
    print(f"Total is_correct = False: {false_count}")
    print(f"Total is_correct = NA: {na_count}")
    print("-" * 40) 


File: ../results/base_model_eval/zero_shot_prompt/Llama-3.1-8B-Instruct/output_1.json
Total is_correct = True: 890
Total is_correct = False: 110
Total is_correct = NA: 102
----------------------------------------
File: ../results/base_model_eval/zero_shot_prompt/Llama-3.1-8B-Instruct/output_0.json
Total is_correct = True: 874
Total is_correct = False: 126
Total is_correct = NA: 117
----------------------------------------
File: ../results/base_model_eval/zero_shot_prompt/Llama-3.1-8B-Instruct/output_4.json
Total is_correct = True: 904
Total is_correct = False: 93
Total is_correct = NA: 87
----------------------------------------
File: ../results/base_model_eval/zero_shot_prompt/Llama-3.1-8B-Instruct/output_3.json
Total is_correct = True: 885
Total is_correct = False: 113
Total is_correct = NA: 112
----------------------------------------
File: ../results/base_model_eval/zero_shot_prompt/Llama-3.1-8B-Instruct/output_2.json
Total is_correct = True: 891
Total is_correct = False: 107
Total

In [14]:
# Old code
'''
# Initialize the model pipeline
#pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')

def runner(n=1, prompt_type='zero_shot_prompt'):
   # model_name = "meta-llama/Llama-3.2-1B-Instruct"
    for i in tqdm(range(n), desc="Iterations"):
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
            try:
                question = row['Question']
                right_answer = row['Answer']
                # selecting distractions.
                option = []
                option.append(right_answer)
                while len(option) < 4:
                    distractor = df.sample(1)['Answer'].values[0]
                    if distractor not in option and distractor != right_answer:
                        option.append(distractor)

                # print(f"Question: {question}")
                # print(f"Right Answer: {right_answer}")
                # print(f"Options: {option}")
                # print(len(option)) 
                # break

                # Create an example
                random.shuffle(option)
                # Right option is 
                right_option_index = option.index(right_answer)
                right_option_letter = chr(ord('A') + right_option_index)
                #print(f"\n\nRight option index: {right_option_letter}")
                
                example = Example(question, 
                                  option[0], 
                                  option[1], 
                                  option[2], 
                                  option[3]
                                )
                
                # Depending on prompt_type, generate the prompt using the integrated functions
                if prompt_type == 'zero_shot_prompt':
                    prompt = zero_shot_prompt(example)
                elif prompt_type == 'five_shot_prompt':
                    n_examples = 5
                    prompt =  five_shot_prompt(example, n_examples)
                elif prompt_type == 'chain_of_thought_prompt':
                    thinking_tokens = 512
                    prompt = chain_of_thought_prompt(example, thinking_tokens)
                else:
                    # Default behavior (original prompt)
                    print("SELECTING DEFAULT PROMPTING TECHNIQUE")
                    prompt = zero_shot_prompt(example)

                #print(prompt)
                #break
                # Generate the model's response
                response = pipe(
                    prompt,
                    max_new_tokens=15, # Preferably 15
                    pad_token_id=pipe.tokenizer.eos_token_id,
                    num_beams=5,
                    early_stopping=True,
                    eos_token_id=pipe.tokenizer.eos_token_id,
                    temperature=0.1
                )

                # Extract generated text
                generated_text = response[0]["generated_text"]
                #print(generated_text)
                answer_portion = generated_text.split("else.")[1]
                match = re.search(r'[A-D]', answer_portion)
                if match:
                    model_answer_letter = match.group()
                else:
                    model_answer_letter = 'NA'
                
                                
                if index == 0 and i==0:
                    print(generated_text)
                    print("Correct Ans: ", right_answer)
                    print("Model Replied with: ", model_answer_letter)
                    
                is_correct = (right_option_letter and model_answer_letter) and right_option_letter.lower() == model_answer_letter.lower()

                result = {
                    "iteration": i,
                    "id": index,
                    "model": model_name,
                    "prompt_type": prompt_type,
                    "prompt": prompt,
                    "question": question,
                    "right_answer": right_answer,
                    "right_answer_option": right_option_letter,
                    "model_answer_letter": model_answer_letter,
                    "generated_text": generated_text,
                    "answer_portion": answer_portion,
                    "is_correct": str(is_correct)
                }
                results.append(result)

            except Exception as e:
                print(f"Error processing row {index}: {e}")
                print(generated_text)
                continue 

        # Save the results
        if not os.path.exists(f"results/base_model_eval/{prompt_type}/{model_name.split('/')[1]}"):
            os.makedirs(f"results/base_model_eval/{prompt_type}/{model_name.split('/')[1]}")

        with open(f"results/base_model_eval/{prompt_type}/{model_name.split('/')[1]}/output_{i}.json", "w") as f:
                json.dump(results, f, indent=4)
    
        print("Evaluation complete. Results saved to output.json.")    
               
'''

'\n# Initialize the model pipeline\n#pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct", device=\'cuda\')\n\ndef runner(n=1, prompt_type=\'zero_shot_prompt\'):\n   # model_name = "meta-llama/Llama-3.2-1B-Instruct"\n    for i in tqdm(range(n), desc="Iterations"):\n        results = []\n        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):\n            try:\n                question = row[\'Question\']\n                right_answer = row[\'Answer\']\n                # selecting distractions.\n                option = []\n                option.append(right_answer)\n                while len(option) < 4:\n                    distractor = df.sample(1)[\'Answer\'].values[0]\n                    if distractor not in option and distractor != right_answer:\n                        option.append(distractor)\n\n                # print(f"Question: {question}")\n                # print(f"Right Answer: {right_answer}")\n                # prin