In [10]:
%load_ext autoreload
%autoreload 2

# Add the path to the parent directory to sys
import sys, os

# The notebook may be started either from the repository root or from a
# directory named 'notebooks'; in the latter case move up one level so the
# relative paths used in this notebook ('attribution', 'results/...') resolve.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

# Put the 'attribution' directory on the import path for the imports that follow.
# The absolute form stays valid if the working directory changes later, and the
# membership check keeps re-runs of this cell from stacking duplicate entries.
attribution_dir = os.path.abspath('attribution')
if attribution_dir not in sys.path:
    sys.path.append(attribution_dir)

from torch.utils.data import DataLoader

import pandas as pd
from constants import ModelNames
from model_utils import Model
from dataset_utils import GSM8kDataset, PaddingCollator

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Print the current working directory to confirm the notebook runs from the expected folder
print(os.getcwd())

c:\Dev Projects\RUG MSC AI 2024\Advanced Topics in NLP\IKNLP-Attribution


In [12]:
# Create a model instance, selected by the ModelNames.QwenInstruct constant
model = Model(ModelNames.QwenInstruct)

In [13]:
# Build the GSM8k training split, handing the dataset the model's tokenizer,
# then report how many examples it holds
train_dataset = GSM8kDataset(
    model.tokenizer,
    split='train',
)
print(f"Training dataset size: {len(train_dataset)}")

Training dataset size: 7473


In [14]:
# Get a single example (index 0 of the training dataset) and print its 'question' field
sample = train_dataset[0]
print(f"Question: {sample['question']}")

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?


In [15]:
# View the generated instructions stored on the training dataset
# (the same attribute is passed to the test dataset when it is created)
print(train_dataset.instructions)

Answer the following questions and think step by step.
List out your answer followed by #### on a new line.
Here are some examples:
Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

Question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Answer: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10

Question: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
Answer: In the beginning, Betty has only 100 

In [16]:
# Build the test split, reusing the instructions held by the training dataset,
# then report how many examples it holds
test_dataset = GSM8kDataset(
    model.tokenizer,
    instructions=train_dataset.instructions,
    split='test',
)
print(f"Test dataset size: {len(test_dataset)}")

Test dataset size: 1319


In [17]:
# Micro-benchmark: time response generation for a single training example.
import time
import numpy as np

# Collator passed to the DataLoaders in this notebook (the test-set loader reuses it)
padding_collator = PaddingCollator(model.tokenizer)

test_batch_size = 1
num_runs = 5
sample = train_dataset[0]

# Wrap the one example in a list so DataLoader can iterate over it like a dataset
sample_loader = DataLoader([sample], batch_size=test_batch_size, collate_fn=padding_collator)

# Run multiple times to get average performance.
# perf_counter() is the clock intended for measuring short intervals; time.time()
# follows the system wall clock and can jump if that clock is adjusted.
times = []
for _ in range(num_runs):
    start = time.perf_counter()
    output = model.generate_responses(sample_loader)
    times.append(time.perf_counter() - start)

# Token count is taken by re-tokenizing the first response of the last run
num_tokens = len(model.tokenizer.tokenize(output[0]))
mean_time = np.mean(times)
tokens_per_second = num_tokens / mean_time

print(f"Mean inference time over {num_runs} runs: {mean_time:.4f} seconds")
print(f"Number of tokens in output: {num_tokens}")
print(f"Tokens per second: {tokens_per_second:.2f}")
print(output)

Generating batches:   0%|          | 0/1 [00:02<?, ?it/s]
Generating batches:   0%|          | 0/1 [00:02<?, ?it/s]
Generating batches:   0%|          | 0/1 [00:02<?, ?it/s]
Generating batches:   0%|          | 0/1 [00:04<?, ?it/s]
Generating batches:   0%|          | 0/1 [00:02<?, ?it/s]

Mean inference time over 5 runs: 2.7498 seconds
Number of tokens in output: 59
Tokens per second: 21.46
['Natalia sold 48/2=<<48/2=24>>24 clips in May.\nNatalia sold 48+24=<<48+24=72>>72 clips altogether in April and May.\n#### 72']





In [18]:
# Batched generation over the test split, in dataset order (no shuffling).
# Be careful with the batch size
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=padding_collator,
)

# Run generation over every batch, then preview the first five responses
generations = model.generate_responses(test_dataloader)
print(generations[:5])

Generating batches:   0%|          | 0/42 [00:52<?, ?it/s]

["She starts with 16 because she gets 16 eggs a day.\n\nShe keeps 3 for herself every morning so she has 16 - 3 = 13 eggs left.\n\nShe uses 4 muffin recipes from them so she has 13 - 4 = 9 eggs left to sell.\n\nAt $2 a dozen (which is 12), she makes 2 * 9 = $18 making a profit of $18 every day at the farmers' market.\n#### 18", 'To find out how many bolts of fiber the robe takes, we first calculate the amount of white fiber needed based on the information given.\n\nIt takes 2 bolts of blue fiber, and this corresponds to half the amount of white fiber required. So, if 2 bolts of blue fiber represent half the amount of white fiber, then:\n\n\\[ \\text{Amount of white fiber} = 2 \\times 2 = 4 \\text{ bolts} \\]\n\nNow, to find the total number of bolts used for both colors, we add the blue fibers together with the white fibers:\n\n\\[ \\text{Total bolts} = 2 (\\text{blue}) + 4 (\\text{white}) = 6 \\text{ bolts} \\]\n\nTherefore, the robe takes 6 bolts of fiber in total.', 'He spent 50,000




In [None]:
# Save the generations to a CSV file with a single 'response' column
output_path = 'results/gsm8k_generations.csv'

# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df = pd.DataFrame(generations, columns=['response'])
df.to_csv(output_path, index=False)

In [14]:
# Evaluate model accuracy on GSM8k
# NOTE(review): the saved output of this cell carries an earlier execution count
# than the generation cell, so it may come from a different run; re-run to confirm.
from dataset_utils import is_correct_gsm8k, extract_answer_gsm8k

# Get ground truth answers
gt_answers = [example['answer'] for example in test_dataset]

# zip() stops at the shorter sequence, so a mismatch would silently score only
# part of the data while the denominator still counted the full dataset.
if len(generations) != len(gt_answers):
    raise ValueError(
        f"Got {len(generations)} generations for {len(gt_answers)} test examples; "
        "re-run the generation cell before evaluating."
    )

# Count correct predictions; each ground truth is wrapped in a dict with an
# 'answer' key, which is the form passed to is_correct_gsm8k here
correct = sum(
    1
    for pred, gt in zip(generations, gt_answers)
    if is_correct_gsm8k(pred, {'answer': gt})
)

# Calculate accuracy (reported as 0.0 for an empty dataset instead of dividing by zero)
accuracy = correct / len(gt_answers) if gt_answers else 0.0
print(f"GSM8k Accuracy: {accuracy:.4f} ({correct}/{len(test_dataset)})")

# Show some example predictions (at most three, fewer if the run is shorter)
print("\nExample predictions:")
for i in range(min(3, len(generations))):
    print(f"\nQuestion: {test_dataset[i]['question']}")
    print(f"Generated answer: {generations[i]}")
    print(f"Extracted generated answer: {extract_answer_gsm8k(generations[i])}")
    print(f"Extracted ground truth: {extract_answer_gsm8k(test_dataset[i]['answer'])}")

GSM8k Accuracy: 0.1850 (244/1319)

Example predictions:

Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Generated answer:  for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
assistant
 Janet's ducks lay 16 eggs per day.
She eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left.
She bakes muffins with 4 eggs, so she has 13 - 4 = 9 eggs left.
She sells the remaining eggs at $2 per dozen (12 eggs), so she makes 9 / 12 * $2 = $1.50 from selling the eggs.
Therefore, she makes $1.50 every day at the farmers' market.
#### 1.50
Extracted generated answer: 1.50
Extracted ground truth: 18

Question: A robe takes 2 bolt