In [1]:
%load_ext autoreload
%autoreload 2

# Add the path to the parent directory to sys
import sys, os

# If current directory is called 'notebooks', chdir to the parent so that the
# project-relative paths used in this notebook ('attribution', 'results/...')
# resolve from the parent directory.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

# Put the 'attribution' directory on the import path (presumably where
# constants / model_utils / dataset_utils live -- verify against the repo layout).
# An absolute path keeps imports working even if the working directory changes
# later, and the membership check stops re-runs of this cell from appending
# the same entry again.
attribution_dir = os.path.abspath('attribution')
if attribution_dir not in sys.path:
    sys.path.append(attribution_dir)

from torch.utils.data import DataLoader

import pandas as pd
from constants import ModelNames
from model_utils import Model
from dataset_utils import GSM8kDataset, PaddingCollator

In [2]:
# Print the current working directory to confirm which directory the
# relative paths used in this notebook will resolve against
print(os.getcwd())

c:\Dev Projects\RUG MSC AI 2024\Advanced Topics in NLP\IKNLP-Attribution


In [3]:
# Create a model instance (its tokenizer and generate_responses() are used by the cells below)
model = Model(ModelNames.QwenInstruct)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [4]:
# Create a training dataset from the 'train' split, passing in the model's tokenizer,
# and report how many samples it contains
train_dataset = GSM8kDataset(model.tokenizer, split='train')
print(f"Training dataset size: {len(train_dataset)}")

Training dataset size: 7473


In [5]:
# Get a single example; samples are accessed by key (only the 'question' field is shown here)
sample = train_dataset[0]
print(f"Question: {sample['question']}")

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?


In [6]:
# View the generated instructions held by the training dataset
# (the same instructions are passed to the test dataset further down)
print(train_dataset.instructions)

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

Question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Answer: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10

Question: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
Answer: In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.
Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.
This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50

In [7]:
# Build the test split, handing it the instructions generated for the
# training split so both datasets use the same instructions
test_dataset = GSM8kDataset(
    model.tokenizer,
    split='test',
    instructions=train_dataset.instructions,
)
print(f"Test dataset size: {len(test_dataset)}")

Test dataset size: 1319


In [8]:
# Create the padding collator used by every DataLoader in this notebook
padding_collator = PaddingCollator(model.tokenizer)

import time
import numpy as np

# Benchmark settings: a single training example, generated num_runs times
test_batch_size = 1
sample = train_dataset[0]
num_runs = 5

# Wire the configured batch size into the loader instead of repeating the literal
sample_loader = DataLoader([sample], batch_size=test_batch_size, collate_fn=padding_collator)

# Run multiple times to get average performance.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
# NOTE(review): the first run may include one-off warm-up cost, which inflates the mean.
times = []
for _ in range(num_runs):
    start = time.perf_counter()
    output = model.generate_responses(sample_loader)
    times.append(time.perf_counter() - start)

# Token count comes from re-tokenizing the decoded text of the last run,
# so it approximates (rather than exactly equals) the number of generated tokens.
num_tokens = len(model.tokenizer.tokenize(output[0]))
mean_time = np.mean(times)
tokens_per_second = num_tokens / mean_time

print(f"Mean inference time over {num_runs} runs: {mean_time:.4f} seconds")
print(f"Number of tokens in output: {num_tokens}")
print(f"Tokens per second: {tokens_per_second:.2f}")
print(output)

Generating batches: 100%|██████████| 1/1 [00:03<00:00,  3.44s/it]
Generating batches: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
Generating batches: 100%|██████████| 1/1 [00:02<00:00,  2.37s/it]
Generating batches: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]
Generating batches: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it]

Mean inference time over 5 runs: 2.6326 seconds
Number of tokens in output: 59
Tokens per second: 22.41
['Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72']





In [12]:
# Batch the whole test split in its original order (no shuffling).
# Be careful with the batch size -- presumably it is limited by available memory.
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=padding_collator,
    shuffle=False,
    batch_size=32,
)

# Run generation over every batch, then preview the first five responses
generations = model.generate_responses(test_dataloader)
print(generations[:5])

Generating batches: 100%|██████████| 42/42 [19:49<00:00, 28.33s/it]

[" Janet's ducks lay 16 eggs per day.\nShe eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left.\nShe bakes muffins with 4 eggs, so she has 13 - 4 = 9 eggs left.\nShe sells the remaining eggs at $2 per egg, so she makes 9 * $2 = $18 every day.\nThe answer is $\\boxed{18}$.", ' It takes 2 bolts of blue fiber.\nAnd it takes half that amount of white fiber, so it takes 2/2 = 1 bolt of white fiber.\nTo find the total number of bolts needed, we add the number of bolts of blue fiber and the number of bolts of white fiber together.\nSo, the total number of bolts is 2 (blue) + 1 (white) = 3 bolts.\nThe answer is $\\boxed{3}$.', ' The value of the house after repairs is $80,000 + $50,000 = $130,000.\nThe increase in value is 150% of the original price, so it is 150/100 * $80,000 = $120,000.\nAdding this increase to the original price gives us a final price of $80,000 + $120,000 = $200,000.\nTo find the profit, we subtract the cost from the final price, so the profit is $200,000 - $80,000




In [46]:
# Save the generations to a CSV file (one row per response, single 'response' column).
# Create the output directory first: to_csv() does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs('results', exist_ok=True)
df = pd.DataFrame(generations, columns=['response'])
df.to_csv('results/gsm8k_generations.csv', index=False)

In [35]:
# Load the generations from the CSV file.
# dtype=str keeps every response a string even if it looks numeric, and
# keep_default_na=False stops pandas from turning empty responses (or literal
# text such as "NA" / "null") into float NaN, so the list is always list[str].
df = pd.read_csv(
    'results/gsm8k_generations.csv',
    dtype={'response': str},
    keep_default_na=False,
)
generations = df['response'].tolist()

In [40]:
# Evaluate model accuracy on GSM8k
from dataset_utils import is_correct_gsm8k, extract_answer_gsm8k

# Get ground truth answers
gt_answers = [sample['answer'] for sample in test_dataset]
num_samples = len(gt_answers)

# zip() silently stops at the shorter sequence, so a stale or partial set of
# generations (e.g. an outdated CSV) would otherwise yield a wrong accuracy
# without any warning. Fail loudly instead.
if len(generations) != num_samples:
    raise ValueError(
        f"Expected one generation per test sample, got {len(generations)} "
        f"generations for {num_samples} samples"
    )

# Count correct predictions; is_correct_gsm8k receives the prediction and a
# dict carrying the ground truth under the 'answer' key
correct = sum(
    1
    for pred, gt in zip(generations, gt_answers)
    if is_correct_gsm8k(pred, {'answer': gt})
)

# Calculate accuracy (guarding against an empty dataset)
accuracy = correct / num_samples if num_samples else 0.0
print(f"GSM8k Accuracy: {accuracy:.4f} ({correct}/{num_samples})")

# Show some example predictions
print("\nExample predictions:")
for i in range(min(3, num_samples)):  # Show up to the first 3 examples
    example = test_dataset[i]  # index once instead of once per field
    print(f"\nQuestion: {example['question']}")
    print(f"Generated answer: {generations[i]}")
    print(f"Extracted generated answer: {extract_answer_gsm8k(generations[i])}")
    print(f"Extracted ground truth: {extract_answer_gsm8k(example['answer'])}")

GSM8k Accuracy: 0.5739 (757/1319)

Example predictions:

Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Generated answer:  Janet's ducks lay 16 eggs per day.
She eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left.
She bakes muffins with 4 eggs, so she has 13 - 4 = 9 eggs left.
She sells the remaining eggs at $2 per egg, so she makes 9 * $2 = $18 every day.
The answer is $\boxed{18}$.
Extracted generated answer: 18.0
Extracted ground truth: 18.0

Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
Generated answer:  It takes 2 bolts of blue fiber.
And it takes half that amount of white fiber, so it takes 2/2 = 1 bolt of white fiber.
To find the total number of bolts needed, w