In [1]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import os
from datasets import load_from_disk
from vllm.inputs import TokensPrompt
from vllm import LLM, SamplingParams
from tqdm import tqdm
from small_llm_reasoning.evaluation.gsm8k import get_score, eight_shot_messages
# from small_llm_reasoning.generation.vllm_generation import llama_forward


cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache/'
os.environ['TRANSFORMERS_CACHE'] = cache_dir


  from .autonotebook import tqdm as notebook_tqdm
2025-03-05 21:47:17,832	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Loading data
data_path= "../datasets/gsm8k"
data = load_from_disk(f'{data_path}/train/')
data

Dataset({
    features: ['question', 'answer'],
    num_rows: 5473
})

In [3]:
# Loading model
hf_token = os.getenv("hf_token")

# model_name= "meta-llama/Llama-3.2-1B-Instruct"
model_name= "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, cache_dir=cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id



In [4]:
def get_prompt(ex, few_shot):
    
    prompt = [
        {
            'role': 'user',
            'content': f'Given the following problem, reason and give a final answer to the problem.\nProblem: {ex}\nYour response should end with "The final answer is [answer]" where [answer] is the response to the problem.\n'
        }
    ]
    if few_shot:
        prompt = eight_shot_messages + prompt
    return prompt

def tokenize_function(example,few_shot):
    prompt= get_prompt(example['question'], few_shot)
    prompt= tokenizer.apply_chat_template(prompt,  tokenize= False, add_generation_prompt=True)
    return {'input_ids': {'prompt_token_ids':tokenizer(prompt, add_special_tokens=False)['input_ids']}}


In [5]:
# tokenized_dataset = data.map(tokenize_function, batched=False)
tokenized_dataset = data.map(lambda x: tokenize_function(x,few_shot=False), batched=False)


Map: 100%|██████████| 5473/5473 [00:03<00:00, 1443.30 examples/s]


In [6]:
len(tokenized_dataset['input_ids'])

5473

In [7]:
tokenized_dataset.save_to_disk(f"{data_path}/tokenized/LLaMA3B/train/zero-shot/")

Saving the dataset (1/1 shards): 100%|██████████| 5473/5473 [00:00<00:00, 147333.39 examples/s]


In [4]:
data= load_from_disk(f"{data_path}/LLaMA3B/test/zero-shot/")

FileNotFoundError: Directory ../datasets/gsm8k/LLaMA3B/test/zero-shot/ not found

In [13]:
type(data['input_ids'])

list

In [6]:
# tokenized_prompts=[]
# for i in tqdm(range(len(data)), desc='preparing.....'):
#     tokenized_prompt = TokensPrompt(prompt_token_ids=data['input_ids'][i])
#     tokenized_prompts.append(tokenized_prompt)
# # tokenized_prompts
    

In [7]:
# stop_strings =  ['<|eot_id|>', '<|start_header_id|>user<|end_header_id|>', 'Q:', '</s>', '<|im_end|>']

sampling_params = SamplingParams(n=1,
                                 temperature=0,
                                 max_tokens=500,
                                 # stop=stop_strings,
                                 seed=1)

model = LLM(
        model=model_name, 
        # tokenizer=model_name, 
        tensor_parallel_size=1) 



INFO 02-24 17:04:15 config.py:999] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 02-24 17:04:15 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.46it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00,  2.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00,  2.08it/s]


INFO 02-24 17:04:18 model_runner.py:926] Loading model weights took 6.0160 GB





INFO 02-24 17:04:18 gpu_executor.py:122] # GPU blocks: 19082, # CPU blocks: 2340
INFO 02-24 17:04:21 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-24 17:04:21 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-24 17:04:30 model_runner.py:1335] Graph capturing finished in 10 secs.


In [14]:
outputs = model.generate(data['input_ids'], sampling_params)
# outputs = model.generate(['what is the capital of India','Where is Delhi'], sampling_params)

Processed prompts: 100%|██████████| 1319/1319 [00:39<00:00, 33.55it/s, est. speed input: 4519.24 toks/s, output: 6232.58 toks/s]


In [15]:
generated_outputs=[]
for out in outputs:
    generated_outputs.append({
            "input": tokenizer.decode(out.prompt_token_ids, skip_special_tokens=False), 
            "output": [
                ith_output.text for ith_output in out.outputs
            ]    
        })
    
    break

In [16]:
generated_outputs

[{'input': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 24 Feb 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven the following problem, reason and give a final answer to the problem.\nProblem: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers\' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers\' market?\nYour response should end with "The final answer is [answer]" where [answer] is the response to the problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
  'output': ["To find out how much Janet makes every day at the farmers' market, we need to first calculate the total number of eggs she lays and then subtract the eggs she eats and bakes.\n\nJanet's ducks lay 16 eggs per day. She eats 3 eggs for breakfast, so she h

In [None]:
generated_outputs1

In [60]:
inp=tokenizer('prompt', add_special_tokens=True)['input_ids']
inp

[128000, 41681]

In [61]:
tokenizer.decode(inp, skip_special_tokens=False)

'<|begin_of_text|>prompt'

In [25]:
for ex_outputs in all_outputs:
    print(ex_outputs.prompt)
    print(ex_outputs.outputs[0].text)

None
Janet has 16 eggs per day. She eats 3 for breakfast, so she has 16 - 3 = 13 eggs left. She bakes muffins for 4, so she has 13 - 4 = 9 eggs left. She sells 9 eggs at $2 each, so she makes 9 x 2 = 18 dollars per day. The final answer is 18


In [1]:
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

# Initialize the LLM
llm = LLM(model="facebook/opt-125m")

# Get the tokenizer
tokenizer = llm.get_tokenizer()

# Define your input strings
input_strings = [
    "Translate the following English text to French: 'Hello, how are you?'",
    "What is the capital of France?",
    "Explain the concept of machine learning in simple terms."
]

# Tokenize the input strings and create TokensPrompt instances
tokenized_prompts = [
    TokensPrompt(prompt_token_ids=tokenizer.encode(input_string))
    for input_string in input_strings
]

# Define sampling parameters
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)

# Generate the outputs for the batch
outputs = llm.generate(tokenized_prompts, sampling_params)

# Print the results
for input_string, output in zip(input_strings, outputs):
    generated_text = output.outputs[0].text
    print(f"Input: {input_string}")
    print(f"Generated text: {generated_text}")
    print("---")


  from .autonotebook import tqdm as notebook_tqdm
2025-02-24 16:02:37,552	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 02-24 16:02:38 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=facebook/opt-125m, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_caching=False, use_async_output_proc=True)
INFO 02-24 1

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
  state = torch.load(bin_file, map_location="cpu")
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.05it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.01it/s]



INFO 02-24 16:02:40 model_runner.py:926] Loading model weights took 0.2389 GB
INFO 02-24 16:02:40 gpu_executor.py:122] # GPU blocks: 71292, # CPU blocks: 7281
INFO 02-24 16:02:42 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-24 16:02:42 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-24 16:02:49 model_runner.py:1335] Graph capturing finished in 7 secs.


Processed prompts: 100%|██████████| 3/3 [00:00<00:00, 21.75it/s, est. speed input: 268.33 toks/s, output: 1087.78 toks/s]

Input: Translate the following English text to French: 'Hello, how are you?'
Generated text: 

It is important to note that we speak French with two genders: on the one hand you have the masculine (Latin for "fellow", and the feminine "other") and on the other you have the feminine "speaker". Therefore,
---
Input: What is the capital of France?
Generated text: 
The capital of France. It's the second largest country after Belgium and is the second biggest economy in the world. It's also the third largest nation in the world and it's the third largest country in the world.
> it's the
---
Input: Explain the concept of machine learning in simple terms.
Generated text: 

In the past, a scientific exploration of machine learning used to be regarded as an antiquated notion. It was not the first time that a researcher had to explain how an algorithm could be used to generate an algorithm.

However, in
---





In [2]:
tokenized_prompts

[{'prompt_token_ids': [2,
   19163,
   19593,
   5,
   511,
   2370,
   2788,
   7,
   1515,
   35,
   128,
   31414,
   6,
   141,
   32,
   47,
   6600]},
 {'prompt_token_ids': [2, 2264, 16, 5, 812, 9, 1470, 116]},
 {'prompt_token_ids': [2,
   43043,
   1851,
   5,
   4286,
   9,
   3563,
   2239,
   11,
   2007,
   1110,
   4]}]

In [8]:
import os
import re

def natural_sort_key(checkpoint_path):
    """Extracts numerical part from checkpoint names for correct sorting."""
    match = re.search(r"checkpoint-(\d+)", checkpoint_path)
    return int(match.group(1)) if match else float('inf')  # Send non-matching to the end


In [11]:
model_path = '../outputs/exp-1.11/checkpoints/'

# Iterate over all checkpoints inside model_path
# checkpoints = sorted([os.path.join(model_path, d) for d in os.listdir(model_path) if os.path.isdir(os.path.join(model_path, d))])
checkpoints = [os.path.join(model_path, d) for d in os.listdir(model_path) if os.path.isdir(os.path.join(model_path, d))]

# Sort numerically by extracting the number in "checkpoint-XXXX"
checkpoints = sorted(checkpoints, key=natural_sort_key)

if not checkpoints:
    print(f"No checkpoints found in {model_path}")
    

for checkpoint in checkpoints:
    checkpoint_name = os.path.basename(checkpoint) 
    print(os.path.join(model_path, checkpoint_name))

../outputs/exp-1.11/checkpoints/checkpoint-85
../outputs/exp-1.11/checkpoints/checkpoint-170
../outputs/exp-1.11/checkpoints/checkpoint-255
../outputs/exp-1.11/checkpoints/checkpoint-340
../outputs/exp-1.11/checkpoints/checkpoint-425


In [5]:
for d in os.listdir(model_path):
    print(d)

checkpoint-1372
checkpoint-1029
checkpoint-1710
checkpoint-686
checkpoint-343
