In [1]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import os
from datasets import load_from_disk
os.environ['TRANSFORMERS_CACHE'] = '/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache'



In [2]:
data_feedback = load_from_disk("/scratch/workspace/wenlongzhao_umass_edu-analyze/datasets/gsm8k/feedback")
data_feedback

Dataset({
    features: ['question', 'answer'],
    num_rows: 5082
})

In [3]:
data_train = load_from_disk("/scratch/workspace/wenlongzhao_umass_edu-analyze/datasets/gsm8k/train")
data_train

Dataset({
    features: ['question', 'answer'],
    num_rows: 896
})

In [4]:
hf_token = "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"
model_name = "meta-llama/Llama-3.2-3B-Instruct"
config = AutoConfig.from_pretrained(model_name, token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, config=config,cache_dir='/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache')
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, config=config,cache_dir='/scratch/workspace/wenlongzhao_umass_edu-analyze/transformers_cache')
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'
# model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

device(type='cuda')

In [5]:
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [6]:
data_train["question"][0]

'Bobby wanted pancakes for breakfast.  The recipe on the box makes 21 pancakes.  While he ate 5 pancakes, his dog jumped up and was able to eat 7 before being caught.  How many pancakes does Bobby have left?'

In [27]:
oneShotPrompt=f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnswer the below question:\n\nQ: {data_train["question"][0]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'

In [28]:
tokenized_inputs = tokenizer(oneShotPrompt, return_tensors="pt", padding=True, truncation=True)
tokenized_inputs.to(device)

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,   2675,    527,    264,
          11190,  18328, 128009, 128006,    882, 128007,    271,  16533,    279,
           3770,   3488,   1473,     48,     25,  38481,   4934,  80960,    369,
          17954,     13,    220,    578,  11363,    389,    279,   3830,   3727,
            220,   1691,  80960,     13,    220,   6104,    568,  30912,    220,
             20,  80960,     11,    813,   5679,  27096,    709,    323,    574,
           3025,    311,   8343,    220,     22,   1603,   1694,  10791,     13,
            220,   2650,   1690,  80960,   1587,  38481,    617,   2163,     30,
         128009, 128006,  78191, 128007]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1

In [29]:
with torch.no_grad():
    output = model.generate(**tokenized_inputs, max_new_tokens=256, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id, do_sample=True, temperature=0.1, top_p=0.95)

In [32]:
output=tokenizer.decode(output[0], skip_special_tokens=True)
print(output)

system

You are a helpful assistantuser

Answer the below question:

Q: Bobby wanted pancakes for breakfast.  The recipe on the box makes 21 pancakes.  While he ate 5 pancakes, his dog jumped up and was able to eat 7 before being caught.  How many pancakes does Bobby have left?assistant

To find out how many pancakes Bobby has left, we need to subtract the number of pancakes he ate and the number his dog ate from the total number of pancakes.

Total pancakes = 21
Pancakes eaten by Bobby = 5
Pancakes eaten by dog = 7

Total pancakes eaten = 5 + 7 = 12

Now, subtract the total pancakes eaten from the total number of pancakes:

21 - 12 = 9

Bobby has 9 pancakes left.


In [70]:
fewShotPrompt=oneShotPrompt=f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nBelow are few example question and answer pairs\n\n'

In [71]:
for i in range(9):
    fewShotPrompt+=f'Q: {data_train["question"][i]}\n\nA: {data_train["answer"][i]}\n\n'

In [72]:
fewShotPrompt+=f'Now, Follow the same format for reasoning and stating your final answer as above examples and Answer the below question\n\nQ: {data_train["question"][12]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'

In [73]:
tokenized_inputs = tokenizer(fewShotPrompt, return_tensors="pt", padding=True, truncation=True)
tokenized_inputs.to(device)
with torch.no_grad():
    output = model.generate(**tokenized_inputs, max_new_tokens=256, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id, do_sample=True, temperature=0.1, top_p=0.95)

In [74]:
output=tokenizer.decode(output[0], skip_special_tokens=True)
print(output)

system

You are a helpful assistantuser

Below are few example question and answer pairs

Q: Bobby wanted pancakes for breakfast.  The recipe on the box makes 21 pancakes.  While he ate 5 pancakes, his dog jumped up and was able to eat 7 before being caught.  How many pancakes does Bobby have left?

A: Bobby ate 5 pancakes and his dog ate 7 so 5+7 = <<5+7=12>>12
The recipe makes 21 pancakes and 12 were eaten so 21-12 = <<21-12=9>>9 pancakes were left
#### 9

Q: A store gives a 10% discount for the amount of the sell that was over $1000.  John buys 7 items that each cost $200.  What does his order cost after the discount?

A: His order came out to 7*200=$<<7*200=1400>>1400
So there was 1400-1000=$<<1400-1000=400>>400 that qualified for the discount
So his discount saved 400*.1=$<<400*.1=40>>40
So his purchase came out to 1400-40=$<<1400-40=1360>>1360
#### 1360

Q: An agricultural cooperative must ship 6500 kg of potatoes. During transport by truck, 150 kg are damaged and therefore canno

In [75]:
data_train["answer"][12]

'From the baby spoons, Lisa holds 4 children * 3 spoons each = <<4*3=12>>12 baby spoons.\nAdding this to the decorative spoons means Lisa has 12 baby spoons + 2 decorative spoons = <<12+2=14>>14 old spoons.\nIn the new set of cutlery, there is a total of 10 large spoons + 15 teaspoons = <<10+15=25>>25 new spoons.\nSo in total, Lisa has 14 old spoons + 25 new spoons = <<14+25=39>>39 spoons.\n#### 39'

/home/kchimmad_umass_edu


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [32]:
import pandas as pd

In [121]:
path = '/scratch/workspace/wenlongzhao_umass_edu-analyze/outputs/gsm8k/LLaMA1B/generated_outputs.json'
df = pd.read_json(path)

In [100]:
df.head()

Unnamed: 0,input,output
0,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...
1,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...
2,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...
3,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...
4,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...


In [101]:
def get_answer(row):
    # print(row)
    return row['output'].split('####')[-1].strip()

In [102]:
df['model_answer']=df.apply(get_answer,axis=1)

In [103]:
df.head()

Unnamed: 0,input,output,model_answer
0,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,108
1,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,7
2,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,25%
3,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,3
4,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,162


In [104]:
len(data_feedback["answer"])

5082

In [105]:
# !pip install evaluate

In [106]:
from evaluate import load
exact_match_metric = load("exact_match")

In [45]:
results = exact_match_metric.compute(predictions=[df.iloc[3]['model_answer']], references=[data_feedback["answer"][3].split("####")[-1]])

In [53]:
df.shape

(5082, 3)

In [51]:
results

{'exact_match': np.float64(1.0)}

In [82]:
feedback = pd.DataFrame(data_feedback)
feedback.head()

Unnamed: 0,question,answer
0,Jordan read 120 French novels last holiday. Hi...,Alexandre read 120 * 1/10 = <<120*1/10=12>>12 ...
1,Samantha’s last name has three fewer letters t...,"There are 4 letters in Jamie’s last name, so B..."
2,Anna is making gingerbread cookies. She gives ...,The first step to find the total number of gin...
3,Frank has three less than half the number of c...,Mike has 4 * 3 = <<4*3=12>>12 cookies.\nFrank ...
4,"Mr. Smith takes his wife, his parents, and his...",The buffet cost for Mr. Smith and his wife is ...


In [83]:
feedback['GT_Answer']=feedback.apply(get_answer,axis=1)

In [84]:
feedback.head(10)

Unnamed: 0,question,answer,GT_Answer
0,Jordan read 120 French novels last holiday. Hi...,Alexandre read 120 * 1/10 = <<120*1/10=12>>12 ...,108
1,Samantha’s last name has three fewer letters t...,"There are 4 letters in Jamie’s last name, so B...",7
2,Anna is making gingerbread cookies. She gives ...,The first step to find the total number of gin...,50
3,Frank has three less than half the number of c...,Mike has 4 * 3 = <<4*3=12>>12 cookies.\nFrank ...,3
4,"Mr. Smith takes his wife, his parents, and his...",The buffet cost for Mr. Smith and his wife is ...,159
5,Jill is going to resod her front yard. The plo...,The area of the whole front yard is 200 feet b...,9474
6,Janice opened an art book that she had found i...,"If the first page had 5 drawings, and the numb...",75
7,Bob started out the week with $80. On Monday a...,"On Monday, he spent half of $80 leaving him wi...",20
8,Will's breakfast supplied him 900 calories of ...,Will jogged for 60/2 = <<60/2=30>>30 minutes.\...,600
9,Lisa has 36 candies. On Mondays and Wednesdays...,"On Monday and Tuesday, she eats 2 * 2 = <<2*2=...",4


In [67]:
score= exact_match_metric.compute(predictions=df['model_answer'].tolist(), references=feedback["GT_Answer"].tolist())

In [107]:
df['score'] = (df['model_answer'] == feedback['GT_Answer']).astype(int)

In [88]:
from tqdm import tqdm
for i in tqdm(range(10)):
    

100%|██████████| 10/10 [00:00<00:00, 50.81it/s]


In [108]:
df.head(10)

Unnamed: 0,input,output,model_answer,score
0,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,108,1
1,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,7,1
2,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,25%,0
3,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,3,1
4,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,162,0
5,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,9474,1
6,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,75,1
7,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,28,0
8,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,600,1
9,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,4,1


In [109]:
df.drop(columns=['model_answer'],inplace=True)

In [110]:
df.head()

Unnamed: 0,input,output,score
0,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,1
1,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,1
2,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,0
3,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,1
4,<|begin_of_text|><|start_header_id|>system<|en...,system\n\nYou are a helpful assistantuser\n\nB...,0


In [111]:
df.to_json("/scratch/workspace/wenlongzhao_umass_edu-analyze/outputs/gsm8k/LLaMA3B/generated_outputs1.json", orient='records', lines=True)

In [115]:
records = df.to_dict(orient='records')

In [116]:
import json
formatted_json = json.dumps(records, indent=4)


In [118]:
with open("/scratch/workspace/wenlongzhao_umass_edu-analyze/outputs/gsm8k/LLaMA3B/generated_outputs1.json", "w") as f:
    f.write(formatted_json)

In [122]:
df['score'].sum()

np.int64(1798)