In [5]:
%load_ext autoreload
%autoreload 2

# Make the local 'attribution' directory importable from this notebook.
import os
import sys

# When the kernel starts inside a directory named 'notebooks', move up one
# level so the relative paths used in this notebook resolve from the parent.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.pardir)

# Guarded append: re-running this cell must not pile up duplicate entries
# on sys.path.
if 'attribution' not in sys.path:
    sys.path.append('attribution')

from torch.utils.data import DataLoader

import pandas as pd
from constants import ModelNames, DatasetNames, LANGUAGE_MAPPING
from model_utils import Model
from dataset_utils import GSMDataset, PaddingCollator, is_correct_gsm, extract_answer_gsm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Show the directory the kernel is currently working in
current_dir = os.getcwd()
print(current_dir)

c:\Dev Projects\RUG MSC AI 2024\Advanced Topics in NLP\IKNLP-Attribution


In [13]:
# Instantiate the model wrapper for the entry selected by ModelNames.QwenInstruct
model_name = ModelNames.QwenInstruct
model = Model(model_name)

Device set to use cuda:0


In [8]:
# Build the training dataset from MGSM (config 'en') using the model's tokenizer
train_dataset = GSMDataset(
    DatasetNames.MGSM,
    model.tokenizer,
    config='en',
)
print(f"Training dataset size: {len(train_dataset)}")

Training dataset size: 8


In [9]:
# Take the first item of the training dataset and display its question field
first_index = 0
sample = train_dataset[first_index]
print(f"{sample['question']}")

Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?


In [10]:
# Display the instruction text that the training dataset generated
instructions_text = train_dataset.instructions
print(instructions_text)

Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
Step-by-Step Answer:
- Roger started with 5 balls.
- 2 cans of 3 tennis balls each is 6 tennis balls.
- 5 + 6 = 11.
The answer is 11.

Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
Step-by-Step Answer:
- There are 4 days from monday to thursday.
- 5 computers were added each day.
- That means in total 4 * 5 = 20 computers were added.
- There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers.
The answer is 29.

Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
Step-by-Step Answer:
- Leah had 32 chocolates and Leah’s sister had 42.
- That means there were originally 32 + 42 = 74 chocolates.
- 35 have been eaten.
- So in total they stil

In [11]:
# Build the test split, passing in the instructions generated by the
# training dataset so both datasets share the same instruction text
test_dataset = GSMDataset(
    DatasetNames.MGSM,
    model.tokenizer,
    instructions=train_dataset.instructions,
    split='test',
    config='en',
)
print(f"Test dataset size: {len(test_dataset)}")

Test dataset size: 250


In [14]:
# Time generation on a single example to estimate decoding throughput.

# Collator passed to the DataLoader as collate_fn; the test DataLoader in a
# later cell reuses this same object.
padding_collator = PaddingCollator(model.tokenizer)

import time
import numpy as np

sample = train_dataset[0]
num_runs = 1  # number of timed repetitions that are averaged

# One-item loader so the single example is collated by padding_collator
sample_loader = DataLoader([sample], batch_size=1, collate_fn=padding_collator)

# Run multiple times to get average performance.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed time, whereas the wall
# clock can be coarse and may be adjusted while the cell is running.
times = []
for _ in range(num_runs):
    start = time.perf_counter()
    output = model.generate_responses(sample_loader)
    times.append(time.perf_counter() - start)

# Token count of the first response produced by the last run
num_tokens = len(model.tokenizer.tokenize(output[0]))
mean_time = np.mean(times)
tokens_per_second = num_tokens / mean_time

print(f"Mean inference time over {num_runs} runs: {mean_time:.4f} seconds")
print(f"Number of tokens in output: {num_tokens}")
print(f"Tokens per second: {tokens_per_second:.2f}")

print(sample['question'])
print(output)

Generating batches: 100%|██████████| 1/1 [00:07<00:00,  7.65s/it]

Mean inference time over 1 runs: 7.6519 seconds
Number of tokens in output: 79
Tokens per second: 10.32
Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
['Step-by-step answer:\n- Roger starts with 5 tennis balls.\n- He buys 2 more cans of tennis balls, each containing 3 tennis balls.\n- Therefore, he gets an additional \\(2 \\times 3 = 6\\) tennis balls.\n- Now, Roger has \\(5 + 6 = 11\\) tennis balls in total.\nThe answer is 11.<']





In [12]:
# Batched loader over the test set, in dataset order.
# Be careful with the batch size.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=padding_collator,
)

# Run the model over every batch and preview the first five responses
generations = model.generate_responses(test_dataloader)
preview = generations[:5]
print(preview)

Generating batches: 100%|██████████| 42/42 [19:49<00:00, 28.33s/it]

[" Janet's ducks lay 16 eggs per day.\nShe eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left.\nShe bakes muffins with 4 eggs, so she has 13 - 4 = 9 eggs left.\nShe sells the remaining eggs at $2 per egg, so she makes 9 * $2 = $18 every day.\nThe answer is $\\boxed{18}$.", ' It takes 2 bolts of blue fiber.\nAnd it takes half that amount of white fiber, so it takes 2/2 = 1 bolt of white fiber.\nTo find the total number of bolts needed, we add the number of bolts of blue fiber and the number of bolts of white fiber together.\nSo, the total number of bolts is 2 (blue) + 1 (white) = 3 bolts.\nThe answer is $\\boxed{3}$.', ' The value of the house after repairs is $80,000 + $50,000 = $130,000.\nThe increase in value is 150% of the original price, so it is 150/100 * $80,000 = $120,000.\nAdding this increase to the original price gives us a final price of $80,000 + $120,000 = $200,000.\nTo find the profit, we subtract the cost from the final price, so the profit is $200,000 - $80,000




In [46]:
# Persist the generated responses as a single-column CSV (no index column)
generations_csv = 'results/gsm8k_generations.csv'
df = pd.DataFrame(
    generations,
    columns=['response'],
)
df.to_csv(generations_csv, index=False)

In [54]:
# Load the generations from the CSV file.
# keep_default_na=False together with an explicit str dtype keeps every
# response as text: by default read_csv converts empty fields and strings such
# as "NA" or "null" into float NaN, and would parse a purely numeric response
# as a number instead of a string.
df = pd.read_csv(
    'results/gsm8k_generations.csv',
    dtype={'response': str},
    keep_default_na=False,
)
generations = df['response'].tolist()

In [57]:
# Evaluate model accuracy on GSM8k.
# is_correct_gsm and extract_answer_gsm come from the notebook's first import cell.

# Get ground truth answers
gt_answers = [sample['answer'] for sample in test_dataset]

# zip() stops at the shorter input, so a length mismatch (e.g. generations
# loaded from a run on a different dataset or split) would silently yield a
# wrong accuracy. Fail loudly instead.
if len(generations) != len(gt_answers):
    raise ValueError(
        f"Got {len(generations)} generations for {len(gt_answers)} test examples; "
        "regenerate or reload results that match the current test dataset."
    )

# Calculate correct predictions
correct = sum(
    1 for pred, gt in zip(generations, gt_answers) if is_correct_gsm(pred, gt)
)

# Calculate accuracy (guarded so an empty test set does not divide by zero)
num_examples = len(test_dataset)
accuracy = correct / num_examples if num_examples else 0.0
print(f"GSM8k Accuracy: {accuracy:.4f} ({correct}/{len(test_dataset)})")

# Show some example predictions (at most the first 3; fewer if the set is smaller)
print("\nExample predictions:")
for i in range(min(3, len(generations))):
    print(f"\nQuestion: {test_dataset[i]['question']}")
    print(f"Generated answer: {generations[i]}")
    print(f"Extracted generated answer: {extract_answer_gsm(generations[i])}")
    print(f"Extracted ground truth: {extract_answer_gsm(test_dataset[i]['answer'])}")

GSM8k Accuracy: 0.5739 (757/1319)

Example predictions:

Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Generated answer:  Janet's ducks lay 16 eggs per day.
She eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left.
She bakes muffins with 4 eggs, so she has 13 - 4 = 9 eggs left.
She sells the remaining eggs at $2 per egg, so she makes 9 * $2 = $18 every day.
The answer is $\boxed{18}$.
Extracted generated answer: 18.0
Extracted ground truth: 18.0

Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
Generated answer:  It takes 2 bolts of blue fiber.
And it takes half that amount of white fiber, so it takes 2/2 = 1 bolt of white fiber.
To find the total number of bolts needed, w

In [2]:
# Load the MGSM test split (config 'en') with empty instructions.
# Uses GSMDataset, DatasetNames, model and pd from earlier cells, so those
# cells must have been run first.
mgsm_test = GSMDataset(
    DatasetNames.MGSM,
    model.tokenizer,
    instructions='',
    split='test',
    config='en',
)

# Load generations from mgsm_en_Qwen2-1.5B-Instruct_results.
# keep_default_na=False together with an explicit str dtype keeps every
# response as text: by default read_csv converts empty fields and strings such
# as "NA" or "null" into float NaN, and would parse a purely numeric response
# as a number instead of a string.
df = pd.read_csv(
    'results/mgsm_en_Qwen2-1.5B-Instruct_results.csv',
    dtype={'response': str},
    keep_default_na=False,
)
mgsm_generations = df['response'].tolist()

NameError: name 'GSMDataset' is not defined

In [13]:
# Score the stored MGSM generations against the dataset's numeric answers.

# Get ground truth answers
mgsm_gt_answers = list(mgsm_test.dataset['answer_number'])

# zip() stops at the shorter input, so a length mismatch between the loaded
# generations and the dataset would silently yield a wrong accuracy.
if len(mgsm_generations) != len(mgsm_gt_answers):
    raise ValueError(
        f"Got {len(mgsm_generations)} generations for {len(mgsm_gt_answers)} "
        "MGSM answers; load results that match the current test split."
    )

# Calculate correct predictions: the answer extracted from each generation
# must equal the ground-truth number
correct = sum(
    1
    for pred, gt in zip(mgsm_generations, mgsm_gt_answers)
    if extract_answer_gsm(pred) == gt
)

# Calculate accuracy (guarded so an empty test set does not divide by zero)
num_examples = len(mgsm_test)
accuracy = correct / num_examples if num_examples else 0.0
print(f"MGSM Accuracy: {accuracy:.4f} ({correct}/{len(mgsm_test)})")

MGSM Accuracy: 0.5760 (144/250)


In [8]:
# Print one sampled 'answer' field from the MGSM train split of every
# language config listed in LANGUAGE_MAPPING.
import datasets
import random

# Dedicated, seeded generator: the same examples are drawn on every run and
# the global `random` state is left untouched.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

answers = {}

for key in LANGUAGE_MAPPING.keys():
    mgsm_multilingual = datasets.load_dataset('juletxara/mgsm', key)
    train_split = mgsm_multilingual['train']
    # random index into the train split
    random_index = rng.randrange(len(train_split))
    sampled_answer = train_split[random_index]['answer']
    answers[key] = sampled_answer

# Print the answers (one line per language config, in insertion order)
for answer in answers.values():
    print(f"{answer}")

ধাপে ধাপে উত্তর: সোমবার থেকে বৃহস্পতিবার 4দিন হয়। প্রতিদিন 5টি করে কম্পিউটার যোগ করা হয়েছে। যার অর্থ মোট 4 * 5 = 20টি কম্পিউটার যোগ করা হয়েছে। শুরুতে 9টি কম্পিউটার ছিল, তাই এখন 9 + 20 = 29টি কম্পিউটার রয়েছে। উত্তর হল 29।
Schritt-für-Schritt-Antwort: Michael hatte anfangs 58 Golfbälle und hat 23 verloren, sodass er 58 - 23 = 35 hat. Nachdem er 2 weitere verloren hat, hat er jetzt 35 - 2 = 33 Bälle. Die Antwort lautet 33.
Step-by-Step Answer: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.
Respuesta paso a paso: Tiene 5 juguetes. Recibió 2 de la mamá, por lo que después de eso tiene 5 + 2 = 7 juguetes. Luego, recibió 2 más del papá, así que en total tiene 7 + 2 = 9 juguetes. La respuesta es 9.
Réponse étape par étape : 5 bagels à 3 $ chacun coûtent 5 x 3 = 15 dollars. Olivia avait 23 dollars au départ, il lui reste donc 23 - 15 = 8 dollars. La réponse est 8.
ステップごとの答え：月曜から木曜まで4日あります。毎日5台のコンピューターが追加されます。つまり、全部で4*5=20台のコンピューターが追加され