In [1]:
import json
import random
import re
import os
import pandas as pd
from tqdm.notebook import tqdm
from transformers import pipeline
from unsloth import FastLanguageModel
from typing import Dict
import dotenv
dotenv.load_dotenv()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


False

In [2]:
# !pip install --upgrade jinja2

In [3]:
df = pd.read_csv("../../dataset/cleaned_dataset_answer_improved_reasoned.csv", encoding="utf-8")
df.head()

Unnamed: 0,Question,Answer,reasoning
0,Is a high school diploma required for an F-1 v...,A high school diploma or its equivalent is gen...,Question Understanding\nThe question asks whet...
1,Is it important to memorize my SEVIS ID?,"It's crucial to know your SEVIS ID, as it's yo...",Question Understanding\nThe question asks abou...
2,Is proof of housing required at the port of en...,While proof of housing is not always required ...,Question Understanding\nThe question asks whet...
3,What document does a school provide for an F-1...,"A school provides Form I-20, a Certificate of ...",Question Understanding\nThe question asks abou...
4,What if I plan to do research collaboration wi...,If asked about potential research collaboratio...,Question Understanding\nThe question concerns ...


In [4]:
df.shape

(1000, 3)

In [5]:
model_name = 'Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model'  # Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model

In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = 8192,
        dtype = None,
        load_in_4bit = True,
        )

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.381 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [7]:
LETTER_TO_INDEX = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

class Example:
    def __init__(self, question: str, choice1: str, choice2: str, choice3: str, choice4: str):
        self.question = question
        self.choice1 = choice1
        self.choice2 = choice2
        self.choice3 = choice3
        self.choice4 = choice4

    @property
    def choices(self):
        return [self.choice1, self.choice2, self.choice3, self.choice4]

def _base_prompt() -> str:
    return """Act as an expert legal assistant with comprehensive knowledge of statutory law and case precedent. Analyze the following legal question carefully, then select the correct answer from the given options through rigorous legal reasoning."""

def _format_choices(options: list[str]) -> str:
    return "\n".join(f"({chr(65 + i)}) {choice}" for i, choice in enumerate(options))

def _build_question_section(example: Example) -> str:
    return f"\n\nQuestion: {example.question}\nChoices:\n{_format_choices(example.choices)}"

def _build_instructions() -> str:
    return """\n\n
        Instructions:
        1. Conduct thorough legal analysis of all options
        2. Consider relevant statutes, regulations, and judicial interpretations
        3. Identify potential ambiguities or counterarguments
        4. Select only the BEST supported answer
        5. Respond SOLELY with the correct letter (A-D)

        Answer using this format:
        [X]"""

def _build_final_instruction() -> str:
    return "\n\nPlease reply only with the correct option, do not say anything else."

def _prepare_examples(example: Example, no: int = 5) -> str:
    filtered_df = df[df['Question'] != example.question].sample(frac=1)
    examples = []
    
    for _, row in filtered_df.head(no).iterrows():
        right_answer = str(row['Answer'])
        option = [right_answer]
        
        distractors = df[df['Answer'] != right_answer]['Answer'].astype(str).unique()
        if len(distractors) < 3:
            raise ValueError("Not enough unique distractors in the DataFrame.")
        option += random.sample(list(distractors), 3)
        
        random.shuffle(option)
        correct_letter = chr(option.index(right_answer) + 65)
        
        example_str = (
            f"\n\nQuestion: {row['Question']}"
            f"\nChoices:\n{_format_choices(option)}"
            f"\nThe correct answer is ({correct_letter})"
        )
        examples.append(example_str)
    
    return "--- START OF EXAMPLES ---\n" + "".join(examples) + "\n\n--- END OF EXAMPLES ---\n"

def chain_of_thought_prompt(example: Example, max_new_tokens: int = 256) -> str:
    prompt = _base_prompt() + _build_question_section(example)
    prompt += f"\n\nLet's analyze this step by step. First, understand the question. Next, evaluate each option in short (2-5) lines each. Also, you can generate up to {max_new_tokens} tokens to reason."

    messages = [{"role": "user", "content": prompt},]
    inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,  # Must add for generation
                return_tensors="pt",).to("cuda")
    outputs = model.generate(
                input_ids=inputs, 
                max_new_tokens=max_new_tokens, 
                use_cache=True, 
                temperature=1.5, 
                min_p=0.1)


    # Extract generated text
    cot_reasoning = tokenizer.decode(outputs[0], skip_special_tokens=True)
    prompt = f"{cot_reasoning}\n\nBased on the above, reasoning what is the single, most likely answer choice? \nAnswer using this format:\n[X]"
    prompt += _build_final_instruction()
    return prompt

def five_shot_prompt(example: Example, no: int = 5) -> str:
    prompt = _base_prompt()
    prompt += "\nHere are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
    prompt += _prepare_examples(example=example, no=no)
    prompt += "\n\nNow your turn. Choose the correct option that answers the below question.\n"
    prompt += _build_question_section(example)
    prompt += _build_instructions()
    prompt += _build_final_instruction()
    return prompt

def zero_shot_prompt(example: Example) -> str:
    prompt = _base_prompt() + _build_question_section(example)
    prompt += _build_instructions()
    prompt += _build_final_instruction()
    return prompt

In [8]:
def runner(n=1, prompt_type='zero_shot_prompt'):
    for i in tqdm(range(n), desc="Iterations"):
        results = []
        
        # Precompute paths and create directory once per iteration
        model_folder = model_name.split('/')[1]
        output_dir = f"../results/finetune_model_eval/{prompt_type}/{model_folder}/"
        output_file = os.path.join(output_dir, f"output_{i}.json")
        os.makedirs(output_dir, exist_ok=True)  # Create directory if needed
        
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
            try:
                question = row['Question']
                right_answer = row['Answer']
                # selecting distractions.
                option = []
                option.append(right_answer)
                while len(option) < 4:
                    distractor = df.sample(1)['Answer'].values[0]
                    if distractor not in option and distractor != right_answer:
                        option.append(distractor)

                # Create an example
                random.shuffle(option)
                right_option_index = option.index(right_answer)
                right_option_letter = chr(ord('A') + right_option_index)
                
                example = Example(question, 
                                  option[0], 
                                  option[1], 
                                  option[2], 
                                  option[3]
                                )
                
                # Depending on prompt_type, generate the prompt using the integrated functions
                if prompt_type == 'zero_shot_prompt':
                    prompt = zero_shot_prompt(example)
                elif prompt_type == 'five_shot_prompt':
                    n_examples = 5
                    prompt =  five_shot_prompt(example, n_examples)
                elif prompt_type == 'chain_of_thought_prompt':
                    thinking_tokens = 512
                    prompt = chain_of_thought_prompt(example, thinking_tokens)
                else:
                    # Default behavior (original prompt)
                    print("SELECTING DEFAULT PROMPTING TECHNIQUE")
                    prompt = zero_shot_prompt(example)

                # Prepare Inputs
                messages = [{"role": "user", "content": prompt},]
                inputs = tokenizer.apply_chat_template(
                            messages,
                            tokenize=True,
                            add_generation_prompt=True,  # Must add for generation
                            return_tensors="pt",).to("cuda")
                
                # Generate outputs
                outputs = model.generate(
                            input_ids=inputs, 
                            max_new_tokens=15, # Restrict to 15 tokens so that we dont get garbage
                            use_cache=True, 
                            temperature=1.5, 
                            min_p=0.1)


                # Decode output
                generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
                answer_portion = generated_text.split("do not say anything else.")[1] # do not say anything else. : is the ending of our prompt
                
                match = re.search(r'[A-D]', answer_portion)
                if match:
                    model_answer_letter = match.group()
                else:
                    model_answer_letter = 'NA'
                
                if index == 0 and i==0:
                    print(generated_text)
                    print("Correct Ans: ", right_answer)
                    print("Model Replied with: ", model_answer_letter)
                    
                is_correct = (right_option_letter and model_answer_letter) and right_option_letter.lower() == model_answer_letter.lower()
                result = {
                    "iteration": i,
                    "id": index,
                    "model": model_name,
                    "prompt_type": prompt_type,
                    "prompt": prompt,
                    "question": question,
                    "right_answer": right_answer,
                    "right_answer_option": right_option_letter,
                    "model_answer_letter": model_answer_letter,
                    "generated_text": generated_text,
                    "answer_portion": answer_portion,
                    "is_correct": str(is_correct)
                }
                results.append(result)

                # Save progress every 50 successful completions
                if len(results) % 50 == 0:
                    #print('saving')
                    with open(output_file, 'w') as f:
                        json.dump(results, f, indent=4)

            except Exception as e:
                print(f"Error processing row {index}: {e}")
                continue 

        # Final save for remaining items after processing all rows
        if len(results) > 0:
            with open(output_file, 'w') as f:
                json.dump(results, f, indent=4)
                
        print(f"Iteration {i} complete. Results saved to {output_file}")
        print("Evaluation complete. Results saved to output.json.")    

In [9]:
runner(5, 'chain_of_thought_prompt')

Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Act as an expert legal assistant with comprehensive knowledge of statutory law and case precedent. Analyze the following legal question carefully, then select the correct answer from the given options through rigorous legal reasoning.

Question: Is a high school diploma required for an F-1 visa?
Choices:
(A) Upon changing employers during your STEM OPT, you must report the change within 10 days to your Designated School Official (DSO). This report should include a new I-983 training plan completed with your new employer, who must be enrolled in E-Verify. The new position must also be a bonafide job offer and directly related to your STEM degree.
(B) Yes, you can hold multiple CPT positions, but each job must directly relate to your field of study and be authorized separately by your university's Designated School Official (DSO). You'l

Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

Iteration 1 complete. Results saved to ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_1.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

Iteration 2 complete. Results saved to ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_2.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

Iteration 3 complete. Results saved to ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_3.json
Evaluation complete. Results saved to output.json.


Processing rows:   0%|          | 0/1000 [00:00<?, ?it/s]

Iteration 4 complete. Results saved to ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_4.json
Evaluation complete. Results saved to output.json.


In [11]:
import json
import glob
import os

prompt_type = 'chain_of_thought_prompt' #"chain_of_thought_prompt"  # five_shot_prompt # zero_shot_prompt
model_name =  "Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model"

# Construct the directory path
directory = f"../results/finetune_model_eval/{prompt_type}/{model_name.split('/')[1]}/" # When running on Jetstream
#directory = f"../results/finetune_model_eval/{prompt_type}/{model_name.split('/')[1]}/" # When running in VS CODE

# Find all JSON files matching output_*.json pattern
json_files = glob.glob(os.path.join(directory, "output_*.json"))

# Process each JSON file
for file_path in json_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Count occurrences of is_correct = True and is_correct = False
    true_count = sum(1 for item in data if item.get("is_correct") == 'True') # model answer == actual answer
    false_count = sum(1 for item in data if item.get("is_correct") == 'False') # model answer != actual answer
    na_count = sum(1 for item in data if item.get("is_correct") == 'NA') # Model was not able to answer/ Answer not found in response
    
    print(f"File: {file_path}")
    print(f"Total is_correct = True: {true_count}")
    print(f"Total is_correct = False: {false_count}")
    print(f"Total is_correct = NA: {na_count}")
    print("-" * 40) 


File: ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_3.json
Total is_correct = True: 934
Total is_correct = False: 66
Total is_correct = NA: 0
----------------------------------------
File: ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_1.json
Total is_correct = True: 953
Total is_correct = False: 47
Total is_correct = NA: 0
----------------------------------------
File: ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_2.json
Total is_correct = True: 945
Total is_correct = False: 55
Total is_correct = NA: 0
----------------------------------------
File: ../results/finetune_model_eval/chain_of_thought_prompt/Meta-Llama-3.1-8B-Instruct-law-lora_model/output_4.json
Total is_correct = True: 941
Total is_correct = False: 59
Total is_correct = NA: 0
----------------------------------------
File: ../results/finetune_model_eval