# Math Question Answer Verification Competition

## Starter Code

Borrowed from [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR)

In [None]:
# %%capture
# This cell will take time
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth
  Downloading unsloth-2024.11.7-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.5-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Download

Found existing installation: unsloth 2024.11.7
Uninstalling unsloth-2024.11.7:
  Successfully uninstalled unsloth-2024.11.7
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-8_cwi0at/unsloth_72a244de82fb4fd8b4dd75ebef3139a0
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-8_cwi0at/unsloth_72a244de82fb4fd8b4dd75ebef3139a0
  Resolved https://github.com/unslothai/unsloth.git to commit f26d4e739ed507de7a9088da53d10fd02f58d160
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2024.11.7-py3-none-a

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Maximum sequence length handed to from_pretrained and SFTTrainer below
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [None]:
# True: continue from a model saved by an earlier run of this notebook.
# False: start from the base model (see the `if not SELF_MODEL` cells below).
SELF_MODEL = True


if SELF_MODEL :
  # Mount Google Drive so the saved model directory below is reachable
  from google.colab import drive
  drive.mount('/content/drive')

  # Alternatively, comment out the mount above and point model_name at a
  # locally downloaded copy of your saved model.

  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = "/content/drive/MyDrive/DeepLearning/lora_model", # Directory of the model saved after a previous training run
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
      )

In [None]:
# Fresh start: load the base model when no previously saved model is reused
if not SELF_MODEL :
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = "unsloth/Meta-Llama-3.1-8B", # Alternative checkpoint: unsloth/llama-3-8b-bnb-4bit
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
  )

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

## Load model and wrap with LoRA adapters

In [None]:
if not SELF_MODEL : # Only wrap a freshly loaded base model; a reloaded model is used as returned by from_pretrained
  model = FastLanguageModel.get_peft_model(
      model,
      r = 128, # Choose any number > 0 ! Suggested [8, 16, 32, 64, 128]
      target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
      lora_alpha = 128, # Choose any number > 0 ! Suggested [8, 16, 32, 64, 128]
      lora_dropout = 0, # Supports any, but = 0 is optimized
      bias = "none",    # Supports any, but = "none" is optimized
      # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
      use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
      random_state = 3407,
      use_rslora = False,  # Rank-stabilized LoRA is available but not enabled here
      loftq_config = None, # LoftQ is available but not enabled here
  )

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Competition dataset

In [None]:
# download and load competition dataset

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

### Prompt

In [None]:
# NOTE: the three assignments in this cell all bind the same name `prompt`.
# Each one overwrites the previous, so only the LAST template is in effect
# once the cell has run. Comment out the ones you do not want.
#
# Template 1: three `{}` fields (question, answer, output) - no solution text.
'''prompt not using Explanation (examples["solution"])'''
prompt = """You are a great mathematician and you are tasked with finding if an answer to a given maths question is correct or not. Yout response should be 'True' if correct, otherwise 'False'. Below is Question and Answer.



### Question:
{}

### Answer:
{}

### Explainaition

### Output:
{}"""

# Template 2: four `{}` fields (question, answer, explanation, output).
'''prompt using Explanation (examples["solution"])'''
prompt = """You are a great mathematician and you are tasked with finding if an answer to a given maths question is correct or not. Your job is to carefully review each step of the explanation and ensure the solution fully addresses all details in the question.

After reviewing, respond:
- only "True" if the answer and explanation are both correct and complete, with no mistakes.
- only "False" if you find any incorrect steps, missing information, or if the answer is incorrect.

Below is the Question, Answer, and Explanation for your analysis:

### Question:
{}

### Answer:
{}

### Explanation:
{}

### Output:
{}"""

# Template 3 (the one left bound to `prompt`): four `{}` fields
# (question, given answer, solution, output).
'''prompt using Explanation (examples["solution"]), modified to add Analytical Framework and Step-by-Step Validation'''
prompt = """You are a highly skilled mathematician tasked with evaluating a mathematical solution. Carefully analyze each step of the solution, and determine if the answer is correct.

Your response should:
1. Identify any incorrect steps in the solution.
2. Note any missing information from the question that could affect the answer.
3. Confirm the accuracy of the given answer.

After your analysis, respond in the output with:
- "True" if the answer and solution are correct and complete, with no mistakes.
- "False" if there are any errors in the steps, missing information, or if the final answer is incorrect.

Below is the Question, Answer, Solution, and Correctness Indicator for your analysis:

### Question:
{}

### Given Answer:
{}

### Solution:
{}

### Output:
{}"""


### Discarded Approach: Summarization Model Integration for Token Limit Control

To prevent questions or solutions from exceeding the maximum token limit, use a summarization model to condense the content of both. This will help avoid output errors, such as irrelevant phrases or incomplete code blocks, for example:
```
4,False
5,False
6,False
7,"False
\### Explanation:
I made a mistake in my previous answer. Let's solve this problem using sympy.
\<llm-code>
from sympy import Rational
n_marbles = 10
r_marbles = 6
k = 3
\# Calculate probability for drawing 3 green marbles and"
```

#### Discarded
This method was ultimately discarded as it did not successfully prevent these types of unwanted outputs.

In [None]:
'''
from transformers import pipeline

Check if a GPU is available
device = 0 if torch.cuda.is_available() else -1
summarization_model = pipeline("summarization", model="t5-small", device=device)

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def summarize_text(text):
    # Use a summarization model (e.g., Hugging Face's transformers)
    summarized = summarization_model(text, max_length=500, min_length=10, truncation=True)
    return summarized[0]["summary_text"]

def formatting_prompts_func(examples):
    question = examples["question"]
    ans = examples["answer"]
    explanation = examples["solution"]
    output = examples["is_correct"]
    texts = []
    for instruction, input, expl, output in zip(question, ans, explanation, output):
        # Get token length of the question and explanation
        question_tokens = len(tokenizer.tokenize(instruction))
        explanation_tokens = len(tokenizer.tokenize(expl))

        # Summarize if the question or explanation exceeds a specified token length
        if question_tokens > 500:
            instruction = summarize_text(instruction)
        if explanation_tokens > 500:
            expl = summarize_text(expl)
        # Format text
        text = prompt.format(instruction, input, expl, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
'''

'\nfrom transformers import pipeline\n\nCheck if a GPU is available\ndevice = 0 if torch.cuda.is_available() else -1\nsummarization_model = pipeline("summarization", model="t5-small", device=device)\n\nEOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN\ndef summarize_text(text):\n    # Use a summarization model (e.g., Hugging Face\'s transformers)\n    summarized = summarization_model(text, max_length=500, min_length=10, truncation=True)\n    return summarized[0]["summary_text"]\n\ndef formatting_prompts_func(examples):\n    question = examples["question"]\n    ans = examples["answer"]\n    explanation = examples["solution"]\n    output = examples["is_correct"]\n    texts = []\n    for instruction, input, expl, output in zip(question, ans, explanation, output):\n        # Get token length of the question and explanation\n        question_tokens = len(tokenizer.tokenize(instruction))\n        explanation_tokens = len(tokenizer.tokenize(expl))\n\n        # Summarize if the question or 

### Format Prompts for training or evaluation

In [None]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of examples into prompt strings.

    Reads the "question", "answer", "solution" and "is_correct" columns of
    ``examples`` and fills the module-level ``prompt`` template with one row
    at a time, in that field order. ``EOS_TOKEN`` is appended to every
    rendered string.

    Returns:
        A dict with a single "text" key holding the rendered strings, one per
        input row.
    """
    rows = zip(
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # EOS_TOKEN marks the end of each sample so generation learns to stop.
    return {"text": [prompt.format(*row) + EOS_TOKEN for row in rows]}


def formatting_prompts_func_without_solution(examples):
    """Render a batch of examples into prompt strings, leaving out the solution.

    Reads the "question", "answer" and "is_correct" columns of ``examples``
    and fills the module-level ``prompt`` template with those three values per
    row, then appends ``EOS_TOKEN``.

    ``prompt`` must be a template with three positional ``{}`` fields when
    this function is used; ``str.format`` raises ``IndexError`` if the
    template has more fields than the three values supplied here.

    Returns:
        A dict with a single "text" key holding the rendered strings, one per
        input row.
    """
    rows = zip(examples["question"], examples["answer"], examples["is_correct"])
    # EOS_TOKEN marks the end of each sample so generation learns to stop.
    texts = [prompt.format(*row) + EOS_TOKEN for row in rows]
    # Return the batch in the same {"text": [...]} shape as
    # formatting_prompts_func; without this the function yields None and a
    # batched Dataset.map call has no "text" column to add.
    return {"text": texts}

### Data Balance

In [None]:
from sklearn.model_selection import train_test_split

# Split the training set by label so the two classes can be sampled equally
true_data = dataset['train'].filter(lambda example: example["is_correct"] == True) # author's count: 400,000
false_data = dataset['train'].filter(lambda example: example["is_correct"] == False) # author's count: 600,000

# Take a 10,000-row window from each class. Exactly one of the three
# windows below should be active per training round so that successive
# rounds see different rows.
# First round: rows 0-9,999
# true_data_sample = true_data.select(range(10000))
# false_data_sample = false_data.select(range(10000))

# Second round: rows 10,000-19,999 (currently active)
true_data_sample = true_data.select(range(10000 , 20000))
false_data_sample = false_data.select(range(10000 , 20000))

# Third round: rows 20,000-29,999
# true_data_sample = true_data.select(range(20000 , 30000))
# false_data_sample = false_data.select(range(20000 , 30000))

# Verify the size of each sample
print(f"Size of True data sample: {len(true_data_sample)}")  # Should print 10,000
print(f"Size of False data sample: {len(false_data_sample)}")  # Should print 10,000


Size of True data sample: 10000
Size of False data sample: 10000


#### Training Dataset \(19800 samples\) and Evaluation Dataset \(200 samples\)

In [None]:
# Fraction of each class that is held out for evaluation; the rest is used
# for training.
subset_percentage = 0.01  # Adjust as needed

# Split each class exactly once and reuse both halves. Taking 'train' and
# 'test' from the same split guarantees they are complementary, instead of
# relying on two separate calls with the same seed to produce the same
# partition (and it halves the work).
true_split = true_data_sample.train_test_split(test_size=subset_percentage, seed=54)
false_split = false_data_sample.train_test_split(test_size=subset_percentage, seed=54)

# Training portions
true_subset = true_split['train']
false_subset = false_split['train']

# Held-out evaluation portions
true_eval_subset = true_split['test']
false_eval_subset = false_split['test']

In [None]:
from datasets import concatenate_datasets
# Training set: merge the two class subsets into one balanced dataset,
# shuffle so True and False rows are interleaved, then render the prompts.
train_subset_data = concatenate_datasets([true_subset, false_subset]).shuffle(seed=50)
train_dataset = train_subset_data.map(
    formatting_prompts_func,
    batched=True,
    batch_size=16,
)

# Evaluation set: same merge -> shuffle -> render pipeline on the held-out rows.
eval_data = concatenate_datasets([true_eval_subset, false_eval_subset]).shuffle(seed=50)
eval_dataset = eval_data.map(
    formatting_prompts_func,
    batched=True,
    batch_size=16,
)

In [None]:
# Print a sample training example
train_dataset['text'][0]

'You are a highly skilled mathematician tasked with evaluating a mathematical solution. Carefully analyze each step of the solution, and determine if the answer is correct.\n\nYour response should:\n1. Identify any incorrect steps in the solution.\n2. Note any missing information from the question that could affect the answer.\n3. Confirm the accuracy of the given answer.\n\nAfter your analysis, respond in the output with:\n- "True" if the answer and solution are correct and complete, with no mistakes.\n- "False" if there are any errors in the steps, missing information, or if the final answer is incorrect.\n\nBelow is the Question, Answer, Solution, and Correctness Indicator for your analysis:\n\n### Question:\nThere are 50 goldfish in the pond. Each goldfish eats 1.5 ounces of food per day. 20% of the goldfish need to eat special food that costs $3 an ounce. How much does it cost to feed these fish?\n\n### Given Answer:\n45\n\n### Solution:\nLet\'s solve this problem using Python cod

In [None]:
lengths = [len(tokenizer.encode(text)) for text in train_dataset['text'][:]]
print("Average length:", sum(lengths) / len(lengths))
print("Max length:", max(lengths))

Average length: 384.7808080808081
Max length: 1929


#### Sub-test set
Select 500 data points from the data that have not been used for training or evaluation as a pre-submission sub-test set to assess the model's quality and prediction accuracy in advance.

In [None]:
data_number = 500       # Rows taken from EACH class, so the sub-test set has 2 * data_number rows
index_selected = 50000  # Start offset within each class; lies outside the 0-29,999 windows used for training

pretest_true_data_sample = true_data.select(range(index_selected , index_selected + data_number))
pretest_false_data_sample = false_data.select(range(index_selected , index_selected + data_number))

# Merge both classes into one balanced sub-test set
pretest_data = concatenate_datasets([pretest_true_data_sample, pretest_false_data_sample])

# Shuffle the dataset to mix True and False examples randomly
pretest_data = pretest_data.shuffle(seed=50)

# Render the prompts; this adds a "text" column next to the original columns
pretest_dataset = pretest_data.map(formatting_prompts_func, batched=True, batch_size=16)

## Training

In [None]:
import wandb # Import the Weights & Biases (wandb) library for experiment tracking
wandb.login() # Log in to Weights & Biases to enable tracking and logging of experiments

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkz2643[0m ([33mkz2643-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

### SFT

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

project_name = "llama3_final"
entity = "wandb" # NOTE(review): not passed to wandb.init below, so it has no effect in this cell
# os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# Start a Weights & Biases run for this training session
wandb.init(project=project_name, name = "math problem test")

training_args = TrainingArguments(
        per_device_train_batch_size = 2, # Candidate values: [2, 4, 8]; raise if GPU memory allows
        gradient_accumulation_steps = 4, # Candidate values: [2, 4]
        warmup_steps = 10,  # Candidate values: [5, 10, 15, 25, 50]
        num_train_epochs = 1, # Candidate values: [1, 2, 3]. If max_steps is given, it overrides num_train_epochs
        # max_steps = 50,  # Uncomment for a short run (e.g. 10 is faster still)
        learning_rate = 1e-4,  # Candidate values: [1e-5, 1e-4, 2e-4]
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit", # Alternatives noted by the author: "adamw_torch", "sgd"
        weight_decay = 0.01, # Candidate values: [0.00, 0.01, 0.02, 0.05]
        lr_scheduler_type = "linear", # Alternative noted by the author: "cosine"
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb", # Use this for WandB, TensorBoard, etc
        eval_strategy = "steps", # Evaluate on a step schedule; the interval is eval_steps
        eval_steps = 10, # Evaluate every 10 steps
        save_strategy = "steps",
        save_steps = 200, # Save a checkpoint every 200 steps
    )

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text", # Column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # packing = True can make training faster for short sequences, but see the note below
    # Author's observation: packing = True may lead to many weird responses in the output CSV
    args = training_args
)
trainer_stats = trainer.train() # To resume instead: trainer.train(resume_from_checkpoint = True)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19,800 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,475
 "-____-"     Number of trainable parameters = 335,544,320


Step,Training Loss,Validation Loss
10,0.316,0.431493
20,0.3562,0.438394
30,0.3389,0.447083
40,0.3767,0.446301
50,0.3107,0.446954
60,0.3788,0.447613
70,0.361,0.447646
80,0.26,0.45018
90,0.2941,0.449555
100,0.3595,0.450505


## Inference

In [None]:
# Sample inference data point
test_dataset = dataset['test']
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)
sample_ques = test_dataset['question'][0]
sample_ans = test_dataset['answer'][0]
sample_exl = test_dataset['solution'][0]

In [None]:
# Run inference on a single test example
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
input_prompt = prompt.format(
        sample_ques, # question
        sample_ans, # given answer
        sample_exl, # given explanation
        "", # output - left blank so the model generates "True" or "False" here
    )

print("Input Promt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # Index 1 is the token dimension; index 0 is the batch dimension
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# To see the whole generated text (prompt included), uncomment the line below
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Drop the prompt tokens and decode only what the model generated after them
response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

Input Promt:
 You are a highly skilled mathematician tasked with evaluating a mathematical solution. Carefully analyze each step of the solution, and determine if the answer is correct.

Your response should:
1. Identify any incorrect steps in the solution.
2. Note any missing information from the question that could affect the answer.
3. Confirm the accuracy of the given answer.

After your analysis, respond in the output with:
- "True" if the answer and solution are correct and complete, with no mistakes.
- "False" if there are any errors in the steps, missing information, or if the final answer is incorrect.

Below is the Question, Answer, Solution, and Correctness Indicator for your analysis:

### Question:
The Parker family needs to leave the house by 5 pm for a dinner party. Mrs. Parker was waiting to get into the bathroom at 2:30 pm. Her oldest daughter used the bathroom for 45 minutes and her youngest daughter used the bathroom for another 30 minutes. Then her husband used it f

['False']

In [None]:
import csv
import re
from collections import Counter

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Dataset to predict on. Leave as test_dataset for the final submission;
# point it at pretest_dataset or eval_dataset to score a labelled split
# before submitting.
inference_dataset = test_dataset

# Initialize variables
predictions = 0  # Running count of rows predicted so far
results = []     # Rows buffered in memory until the next CSV flush
batch_size = 16  # Adjust according to memory capacity [16, 32, 64]
flush_every = 160  # Write buffered rows to the CSV once at least this many have accumulated
num_samples = len(inference_dataset) # Number of rows that will be predicted
num_samples_test = len(pretest_dataset) # Size of the pre-submission sub-test set
num_samples_eval = len(eval_dataset) # Size of the evaluation dataset

# Open a CSV file for writing
with open('results.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["ID", "is_correct"])
    writer.writeheader()  # Write header for the CSV file

    # Iterate over the dataset in batches
    for start_idx in range(0, num_samples, batch_size):
        # End index of the batch (the last batch may be shorter)
        end_idx = min(start_idx + batch_size, num_samples)
        batch_samples = [inference_dataset[i] for i in range(start_idx, end_idx)]

        # Build one prompt per sample with the output slot left empty
        input_prompts = [
            prompt.format(sample["question"], sample["answer"], sample["solution"], "")
            for sample in batch_samples
        ]

        # Tokenize the batch (padded to a common length) and move it to the GPU
        # NOTE(review): the slice below assumes every prompt ends at column
        # input_token_len, which holds only when padding is on the left -
        # confirm tokenizer.padding_side == "left" for batched generation.
        inputs = tokenizer(input_prompts, return_tensors="pt", padding=True).to("cuda")
        input_shape = inputs['input_ids'].shape
        input_token_len = input_shape[1]

        # Generate, then decode only the tokens produced after the prompt
        outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
        batch_responses = tokenizer.batch_decode(
            outputs[:, input_token_len:],
            skip_special_tokens=True
        )

        # Store the raw (stripped) response text; reducing it to True/False
        # is done afterwards by the cleaning cell.
        for offset, response in enumerate(batch_responses):
            results.append({"ID": start_idx + offset, "is_correct": response.strip()})
        predictions += len(batch_responses)

        # Flush on buffer size rather than on `predictions % 160 == 0`, which
        # fires only on exact multiples and so, for batch sizes that do not
        # divide 160, flushes rarely or only at the very end.
        if len(results) >= flush_every:
            writer.writerows(results)
            results = []  # Clear results list to free up memory
            print(f"Written {predictions} predictions to CSV.")

        # Alternative that was tried and left disabled: call model.generate
        # ten times per batch, collect the ten stripped responses for each
        # sample and keep the most frequent one
        # (Counter(responses).most_common(1)[0][0]) as the prediction.

    # Write any remaining results after the loop ends
    if results:
        writer.writerows(results)
        print(f"Written remaining {len(results)} predictions to CSV.")


Written 160 predictions to CSV.
Written 320 predictions to CSV.
Written 480 predictions to CSV.
Written 640 predictions to CSV.
Written 800 predictions to CSV.
Written 960 predictions to CSV.
Written 1120 predictions to CSV.
Written 1280 predictions to CSV.
Written 1440 predictions to CSV.
Written 1600 predictions to CSV.
Written 1760 predictions to CSV.
Written 1920 predictions to CSV.
Written 2080 predictions to CSV.
Written 2240 predictions to CSV.
Written 2400 predictions to CSV.
Written 2560 predictions to CSV.
Written 2720 predictions to CSV.
Written 2880 predictions to CSV.
Written 3040 predictions to CSV.
Written 3200 predictions to CSV.
Written 3360 predictions to CSV.
Written 3520 predictions to CSV.
Written 3680 predictions to CSV.
Written 3840 predictions to CSV.
Written 4000 predictions to CSV.
Written 4160 predictions to CSV.
Written 4320 predictions to CSV.
Written 4480 predictions to CSV.
Written 4640 predictions to CSV.
Written 4800 predictions to CSV.
Written 4960 pre

#### Clean the data: make sure the results contain only True|False



In [None]:
import pandas as pd
import re

# Load the raw predictions from results.csv (the file the inference cell writes)
results_df = pd.read_csv('results.csv')

def clean_is_correct(entry):
    """Pull the first standalone "True" or "False" out of a raw model response.

    ``entry`` is converted with ``str()`` and stripped before searching, so
    non-string cell values are accepted. The match is case-sensitive and
    must sit on word boundaries.

    Returns:
        The string "True" or "False", or None when neither word is present.
    """
    text = str(entry).strip()
    found = re.search(r'\b(True|False)\b', text)
    if found is None:
        return None
    return found.group(0)

# Reduce every raw response to the string 'True', the string 'False', or None
results_df['is_correct'] = results_df['is_correct'].apply(clean_is_correct)

# Convert the cleaned strings to booleans; responses in which neither word
# was found (None) are counted as False.
# NOTE(review): this relies on Series.map matching the None dictionary key for
# those rows - confirm on the pandas version in use that no NaN values remain.
results_df['is_correct'] = results_df['is_correct'].map({'True': True, 'False': False, None: False})

# Save the cleaned results to a new CSV file
results_df.to_csv('cleaned_results.csv', index=False)

#### Pre-test

In [None]:
import pandas as pd

# Load the cleaned predictions
results_df = pd.read_csv('cleaned_results.csv')
# results_df = pd.read_csv('results.csv')

# Predicted labels from the CSV and ground-truth labels of the sub-test set
results_is_correct = results_df['is_correct']
expected_is_correct = list(pretest_dataset['is_correct'])

# The comparison is positional (row i of the CSV against row i of
# pretest_dataset), so it is only meaningful when the predictions were
# generated from pretest_dataset. Fail with an actionable message instead of
# the generic pandas length-mismatch error.
if len(results_is_correct) != len(expected_is_correct):
    raise ValueError(
        f"cleaned_results.csv has {len(results_is_correct)} rows but pretest_dataset "
        f"has {len(expected_is_correct)}; run inference on pretest_dataset before scoring."
    )

# Element-wise comparison; yields a boolean Series
comparison = results_is_correct == expected_is_correct

# Calculate how many are correct and how many are not
correct_matches = comparison.sum()
total_count = len(comparison)
# Guard against an empty result set so the percentage is well defined
accuracy = correct_matches / total_count * 100 if total_count else 0.0

print(f"Number of matches: {correct_matches}/{total_count}")
print(f"Accuracy: {accuracy:.2f}%")

## Saving model

In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Maximum sequence length handed to from_pretrained
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Disabled by default: change `False` to `True` to reload the model from the
# local "lora_model" directory and switch it to inference mode.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # Directory written by save_pretrained("lora_model")
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
