## Import Dependencies

In [None]:
%%capture
# `%%capture` hides the (long) pip output produced by this cell.
# First install the released Unsloth together with a pinned xformers build.
!pip install unsloth "xformers==0.0.28.post2"
!pip install --upgrade bitsandbytes peft
# Also get the latest nightly Unsloth!
# (this removes the release installed above and reinstalls from the GitHub repository)
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git
# Install Flash Attention 2 for softcapping support
# Only attempted when the first element of the GPU's reported compute
# capability is at least 8; otherwise the flash-attn install is skipped.
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [None]:
from unsloth import FastLanguageModel
from IPython.display import Markdown, display
import torch
import pandas as pd
# Settings passed to FastLanguageModel.from_pretrained when the base model is loaded.
max_seq_length = 1024  # sequence-length limit handed to the loader
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Load the Base model

In [None]:
# Collect the loader arguments in one place, then load the pre-quantized
# Gemma-2 9B checkpoint and its tokenizer through Unsloth.
load_kwargs = dict(
    model_name="unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

==((====))==  Unsloth 2024.11.7: Fast Gemma2 patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


### Load Evaluation data


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the fine-tuning dataset from the Hugging Face Hub.
dataset = load_dataset("AI-MO/NuminaMath-CoT", split="train")

# NOTE: this sklearn helper is not used in this cell; the split below is the
# `datasets.Dataset.train_test_split` method that happens to share its name.
from sklearn.model_selection import train_test_split

# Reproducible 80/20 split (fixed seed) into training and held-out rows.
train_valid_data = dataset.train_test_split(test_size=0.2, seed=42)
test_data = train_valid_data['test']
train_data = train_valid_data['train']

# Select the subset used to evaluate the models.
# It is drawn from the held-out split: rows taken from `train_data` may have
# been seen during fine-tuning, which would inflate the fine-tuned scores.
# The size is capped so the selection cannot run past the end of the split.
eval_rows = min(5000, len(test_data))
test_subset = test_data.select(range(eval_rows))

In [None]:
# Materialize the selected evaluation rows as a pandas DataFrame and
# show its (rows, columns) shape as the cell output.
eval_data = pd.DataFrame(data=test_subset)
eval_data.shape

(5000, 4)

### Evaluate the base model

In [None]:
# Prompt used for every generation in this notebook. It has two placeholders:
# `{problem}` for the question text and `{response}` for the answer (left
# empty at inference time so the model writes the continuation).
# The template starts with a newline, and the "Instruction" line is followed
# by a blank line before "Response:".
prompt_template = (
    "\n"
    "You are a math assistant. Answer the following math problem with a detailed, step-by-step solution. Be clear and concise in each step. If there are multiple approaches, select the most efficient method. Include any formulas or key concepts used, and provide the final answer at the end.\n"
    "\n"
    "Instruction: {problem} \n"
    "\n"
    "Response: {response}\n"
)

In [None]:
# Put the Unsloth-loaded base model into inference mode before generating.
FastLanguageModel.for_inference(model)

# Work on an explicit copy: `eval_data[:15]` is a slice of `eval_data`, and
# writing the new `generated` column into a slice raises pandas'
# SettingWithCopyWarning.
test_set = eval_data[:15].copy()

for index, row in test_set.iterrows():
    # Leave the response slot empty so the model produces the answer.
    prompt = prompt_template.format(problem=row['problem'], response='')
    inputs = tokenizer([prompt], return_tensors='pt').to('cuda')

    generated = model.generate(**inputs, max_new_tokens=1024, use_cache=True)

    # The generated sequence starts with the prompt tokens. Decode only the
    # tokens after the prompt instead of splitting the decoded text on
    # "Response:": if the model emits another "Response:" marker itself, the
    # text after the last marker is the wrong chunk (possibly empty).
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(
        generated[0][prompt_length:], skip_special_tokens=True
    ).strip()

    test_set.at[index, 'generated'] = response

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.at[index ,'generated'] = response


In [None]:
# Display the base model's generated answers.
test_set["generated"]

## Cosine similarity evaluation

In [None]:
from sentence_transformers import SentenceTransformer, util

# Pre-trained sentence-embedding model used to compare answers.
# NOTE: this rebinds `model`, so the name no longer refers to the base LLM
# loaded earlier in the notebook.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the generated answers and the reference solutions, row for row.
generated_texts = list(test_set.generated)
reference_texts = list(test_set.solution)
embeddings_gen = model.encode(generated_texts, convert_to_tensor=True)
embeddings_act = model.encode(reference_texts, convert_to_tensor=True)

# All-pairs cosine similarity; entry [i][i] pairs answer i with its own solution.
cosine_scores = util.cos_sim(embeddings_gen, embeddings_act)

# Count the rows whose generated answer is more than 0.7 similar to its solution.
correct = sum(
    1 for i in range(len(cosine_scores)) if cosine_scores[i][i] > 0.7
)

accuracy = correct / len(test_set)
print(f'Semantic Similarity Accuracy: {accuracy:.2f}')


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Semantic Similarity Accuracy: 0.40


The base model scores 40% on the cosine similarity evaluation: 6 of its 15 generated answers exceed the 0.7 similarity threshold.

In [None]:
# The base model matched 6 of the 15 ground-truth answers.
(correct, len(test_set))

(6, 15)

## Rouge Score Evaluation

In [None]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=False)

# ROUGE-L F-measure of each generated answer against its reference solution.
# RougeScorer.score takes (target, prediction): the reference solution is the
# target and the model output is the prediction. Passing them the other way
# round swaps precision and recall (the F-measure itself is symmetric).
rouge_l_scores = [
    scorer.score(row['solution'], row['generated'])['rougeL'].fmeasure
    for idx, row in test_set.iterrows()
]

# Compute the average ROUGE-L score
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
print(f'Average ROUGE-L Score: {average_rouge_l:.2f}')

Average ROUGE-L Score: 0.26


The base model scores an average ROUGE-L F-measure of 0.26 (26%) on the ROUGE evaluation.

I am a bit skeptical of this evaluation technique, since it only checks the number of overlapping n-grams between the generated answer and the reference solution.

## Load finetuned model

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face repository that holds the LoRA adapter and its tokenizer.
adapter_repo = "Koomemartin/unsloth-gemma2-9b-version3-100k"

# Load the adapter configuration, the 4-bit base model, and then attach the
# LoRA adapter weights on top of the base model.
config = PeftConfig.from_pretrained(adapter_repo)
base_model = AutoModelForCausalLM.from_pretrained("unsloth/gemma-2-9b-bnb-4bit")

peftmodel = PeftModel.from_pretrained(base_model, adapter_repo)
# NOTE: rebinds `tokenizer`, replacing the one returned with the base model.
tokenizer = AutoTokenizer.from_pretrained(adapter_repo)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


## Evaluate the Mathlearn Model

In [None]:
from transformers import TextStreamer

# One hand-picked problem to eyeball the fine-tuned model's output.
sample_problem = "Sean adds up all the odd integers from 1 to 499, inclusive. Julie adds up all the integers from 1 to 300, inclusive. What is Sean's sum divided by Julie's sum?"
sample_prompt = prompt_template.format(problem=sample_problem, response="")

inputs = tokenizer([sample_prompt], return_tensors="pt").to("cuda")

# Stream the output to the cell as it is generated; the returned token ids
# are not needed here, hence the throwaway `_`.
text_streamer = TextStreamer(tokenizer)
_ = peftmodel.generate(**inputs, streamer=text_streamer, max_new_tokens=1024)

<bos>
You are a math assistant. Answer the following math problem with a detailed, step-by-step solution. Be clear and concise in each step. If there are multiple approaches, select the most efficient method. Include any formulas or key concepts used, and provide the final answer at the end.

Instruction: Sean adds up all the odd integers from 1 to 499, inclusive. Julie adds up all the integers from 1 to 300, inclusive. What is Sean's sum divided by Julie's sum? 

Response: 
Sean's sum is the sum of all odd integers from 1 to 499, inclusive. We can use the formula for the sum of an arithmetic series to find Sean's sum:

Sum = (n/2) * (first term + last term)

In this case, n = 250 (since there are 250 odd integers from 1 to 499), the first term is 1, and the last term is 499. Plugging these values into the formula, we get:

Sean's sum = (250/2) * (1 + 499) = 125 * 500 = 62500

Julie's sum is the sum of all integers from 1 to 300, inclusive. We can use the same formula for the sum of an

In [None]:
# Work on an explicit copy: `eval_data[:15]` is a slice of `eval_data`, and
# writing the new `generated` column into a slice raises pandas'
# SettingWithCopyWarning.
test_set_finetuned = eval_data[:15].copy()

for index, row in test_set_finetuned.iterrows():
    # Leave the response slot empty so the model produces the answer.
    prompt = prompt_template.format(problem=row['problem'], response='')
    inputs = tokenizer([prompt], return_tensors='pt').to('cuda')

    generated = peftmodel.generate(**inputs, max_new_tokens=1024, use_cache=True)

    # The generated sequence starts with the prompt tokens. Decode only the
    # tokens after the prompt instead of splitting the decoded text on
    # "Response:": if the model emits another "Response:" marker itself, the
    # text after the last marker is the wrong chunk (possibly empty).
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(
        generated[0][prompt_length:], skip_special_tokens=True
    ).strip()

    test_set_finetuned.at[index, 'generated'] = response

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set_finetuned.at[index ,'generated'] = response


In [None]:
# Display the fine-tuned model's generated answers.
test_set_finetuned["generated"]

Unnamed: 0,generated
0,The function $f\left( x \right)=2\sin x-\cos x...
1,To show that \( 7p + 3^p - 4 \) is not a squar...
2,
3,The angle that has the same terminal side as $...
4,The given equation is $x^2 + c^2 = (a - x)^3$....
5,
6,The smallest angle $ t $ such that when $ r = ...
7,(A) \( 72 \% \)\n(B) \( 40 \% \)\n(C) \( 32 \%...
8,The total cost of the items purchased by Thoma...
9,"To define a new operation $\star$, we have $\f..."


## Cosine similarity Evaluation

In [None]:
from sentence_transformers import SentenceTransformer, util

# Pre-trained sentence-embedding model used to compare answers
# (this rebinds the notebook-level name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the fine-tuned model's answers and the reference solutions, row for row.
generated_texts = list(test_set_finetuned.generated)
reference_texts = list(test_set_finetuned.solution)
embeddings_gen = model.encode(generated_texts, convert_to_tensor=True)
embeddings_act = model.encode(reference_texts, convert_to_tensor=True)

# All-pairs cosine similarity; entry [i][i] pairs answer i with its own solution.
cosine_scores = util.cos_sim(embeddings_gen, embeddings_act)

# Count the rows whose generated answer is more than 0.7 similar to its solution.
correct = sum(
    1 for i in range(len(cosine_scores)) if cosine_scores[i][i] > 0.7
)

accuracy = correct / len(test_set_finetuned)
print(f'Semantic Similarity Accuracy: {accuracy:.2f}')


Semantic Similarity Accuracy: 0.60


In [None]:
# Number of matched answers alongside the number of evaluated problems.
(correct, len(test_set_finetuned))

(9, 15)

The Mathlearn model improves on the base model by 20 percentage points, scoring 60% on the cosine similarity evaluation. It matches 9 out of the 15 problems provided.

## Rouge Score Evaluation

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=False)

# ROUGE-L F-measure of each generated answer against its reference solution.
# RougeScorer.score takes (target, prediction): the reference solution is the
# target and the model output is the prediction. Passing them the other way
# round swaps precision and recall (the F-measure itself is symmetric).
rouge_l_scores = [
    scorer.score(row['solution'], row['generated'])['rougeL'].fmeasure
    for idx, row in test_set_finetuned.iterrows()
]

# Compute and print the average ROUGE-L score
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
print(f'Average ROUGE-L Score: {average_rouge_l:.2f}')

Average ROUGE-L Score: 0.27


Mathlearn scores an average ROUGE-L F-measure of 0.27 (27%) on the ROUGE evaluation, about the same as the base model.