In [1]:
import os

import torch
from datasets import load_from_disk
from tqdm.auto import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
# Point the Hugging Face cache at a project-local directory.
# NOTE(review): this assignment runs after `transformers` has already been
# imported above - confirm the library still honours the variable when it is
# set this late.
os.environ.update({'TRANSFORMERS_CACHE': '../transformers_cache/'})


In [2]:
# Load the GSM8K test split that was previously saved to disk.
test_split_dir = "../datasets/gsm8k/test/"
data_test = load_from_disk(test_split_dir)
data_test  # bare expression: the notebook renders the dataset summary

Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})

In [3]:
# Load the GSM8K train split that was previously saved to disk; it supplies
# the few-shot demonstrations used in the prompt.
train_split_dir = "../datasets/gsm8k/train/"
data_train = load_from_disk(train_split_dir)
data_train  # bare expression: the notebook renders the dataset summary

Dataset({
    features: ['question', 'answer'],
    num_rows: 896
})

In [4]:
# Load the config, tokenizer and causal-LM weights for the chosen checkpoint
# and pick the device that generation will run on.

# Access token is read from the environment so it never lands in the notebook;
# os.getenv returns None when the variable is unset.
hf_token = os.getenv("hf_token")
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Shared by all three from_pretrained calls so the config is cached in the same
# directory as the tokenizer and the weights (the config call previously
# omitted cache_dir).
hub_kwargs = {"token": hf_token, "cache_dir": '../transformers_cache'}

config = AutoConfig.from_pretrained(model_name, **hub_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_name, config=config, **hub_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, config=config, **hub_kwargs)

# Reuse EOS as the padding token so batched tokenization can pad. No new token
# is added to the vocabulary, so the embedding matrix does not need resizing.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Pad on the left so every prompt in a batch ends at the last position, which
# is where generation continues from.
tokenizer.padding_side = 'left'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

device(type='cuda')

In [None]:
# Move the model onto the selected device. The call's return value is the
# cell's last expression, so the notebook displays it.
model.to(device=device)

In [None]:
# 8-shot prompt: chat header (system turn + opening of the user turn) followed
# by the task instructions. The demonstrations and the test question are
# appended to this string by later cells.
#
# NOTE(review): this cell used to carry a bare string literal
# ('Below are few example question and answer pairs', wrapped in blank lines)
# on the line right after the assignment. It was a separate expression
# statement, never concatenated, so it never reached the prompt. It is removed
# here rather than joined in, which keeps the effective prompt text unchanged.
fewShotPrompt = (
    '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n'
    'You are a helpful assistant.<|eot_id|>'
    '<|start_header_id|>user<|end_header_id|>\n\n'
    "Here your job is to answer a math question. "
    "Your question will appear after 8 demonstrations of similar math tasks. "
    "As in those demonstrations, you must generate both a step-by-step reasoning and a final answer. "
    "Perform a simple action, e.g., a single mathematical operation, at each step of your reasoning. "
    "Your final answer must contain only a number and no additional text. "
    "State your final answer after ####. "
)

print(fewShotPrompt)

In [None]:
# Append the worked demonstrations (first rows of the train split) to the
# prompt as "Q: ... / A: ..." pairs.

# Must match the "8 demonstrations" stated in the instruction text of the
# prompt; the loop previously ran over range(9) and appended nine.
NUM_SHOTS = 8

# Slicing the dataset returns a dict of columns holding only the selected
# rows, so the full columns are not rebuilt once per example.
demo_rows = data_train[:NUM_SHOTS]
demonstrations = "".join(
    f'Q: {question}\nA: {answer}\n\n'
    for question, answer in zip(demo_rows["question"], demo_rows["answer"])
)

# Guard keeps the cell idempotent: re-running it does not append the
# demonstrations a second time.
if not fewShotPrompt.endswith(demonstrations):
    fewShotPrompt += demonstrations

In [None]:
# Generate an answer for every test question, one batch at a time, and collect
# {"input": prompt, "output": decoded text} records in generated_outputs.
model.eval()
generated_outputs = []

# Adjust batch size according to your GPU memory capacity
# With vram=80G, 1B - 64, 3B - 64, 8B - 32
batch_size = 64

# Sampling is enabled below; seed torch's RNG so a rerun starts from the same
# random state.
torch.manual_seed(0)

# Read the question column once instead of once per batch.
test_questions = list(data_test["question"])

for start in tqdm(range(0, len(test_questions), batch_size), desc="Processing questions"):
    batch_questions = test_questions[start:start + batch_size]

    # Close the user turn and open the assistant turn. The assistant header is
    # followed by a blank line, like the system and user headers in the prompt.
    inputs = [
        fewShotPrompt
        + "Now, solve the below question following the instructions given above. \n\nQ: "
        + question
        + "\nA: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        for question in batch_questions
    ]

    # The prompt text already starts with <|begin_of_text|>, so the tokenizer
    # must not prepend its own special tokens (that would duplicate the BOS).
    tokenized_inputs = tokenizer(
        inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=False,
    ).to(device)

    with torch.no_grad():
        # NOTE(review): max_length counts prompt tokens as well as generated
        # ones, so the room left for the answer shrinks as the prompt grows;
        # consider max_new_tokens if answers get cut short.
        output = model.generate(
            **tokenized_inputs,
            max_length=2000,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.1,
            top_p=0.95,
        )

    # One sequence is returned per prompt, so decoded texts line up with inputs.
    # Each returned sequence is decoded in full (not sliced to the new tokens).
    decoded_texts = tokenizer.batch_decode(output, skip_special_tokens=True)
    for prompt, generated_text in zip(inputs, decoded_texts):
        generated_outputs.append({"input": prompt, "output": generated_text})