In [24]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import os
from datasets import load_from_disk
from vllm.inputs import TokensPrompt
from vllm import LLM, SamplingParams
from tqdm import tqdm
from small_llm_reasoning.evaluation.gsm8k import get_score, eight_shot_messages
# from small_llm_reasoning.generation.vllm_generation import llama_forward


cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/Bidirectional-Teacher-Student-Communication-for-LLM-Alignment/transformers_cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir


In [27]:
# Loading data
data = load_from_disk("../datasets/gsm8k/test/")
data

Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})

In [3]:
# Loading model
hf_token = os.getenv("hf_token")

model_name= "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, cache_dir=cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id



In [26]:
def get_prompt(ex, few_shot):
    prompt = [
        {
            'role': 'user',
            'content': f'Given the following problem, reason and give a final answer to the problem.\nProblem: {ex["question"][i]}\nYour response should end with "The final answer is [answer]" where [answer] is the response to the problem.\n'
        }
    ]
    if few_shot:

def tokenize_function(examples):
    
    return tokenizer(examples["question"], add_special_tokens=False)

tokenized_dataset = data.map(tokenize_function, batched=False)


100%|██████████| 10/10 [00:00<00:00, 75437.12it/s]


In [20]:
tokenized_output = tokenizer(['what is the capital of India?','Where is Delhi'],  return_tensors='pt', add_special_tokens=False, padding=True)
tokenized_output
# tokenized_output['input_ids'].tolist()

{'input_ids': tensor([[ 12840,    374,    279,   6864,    315,   6890,     30],
        [  9241,    374,  22767, 128009, 128009, 128009, 128009]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0, 0]])}

In [21]:
# Create a TokensPrompt instance
tokenized_prompts=[]
for input_ids in tokenized_output['input_ids']:
    tokenized_prompt = TokensPrompt(prompt_token_ids=input_ids.tolist())
    tokenized_prompts.append(tokenized_prompt)
tokenized_prompts

[{'prompt_token_ids': [12840, 374, 279, 6864, 315, 6890, 30]},
 {'prompt_token_ids': [9241, 374, 22767, 128009, 128009, 128009, 128009]}]

In [6]:
# stop_strings =  ['<|eot_id|>', '<|start_header_id|>user<|end_header_id|>', 'Q:', '</s>', '<|im_end|>']

sampling_params = SamplingParams(n=1,
                                 temperature=0,
                                 max_tokens=500,
                                 # stop=stop_strings,
                                 seed=1)

model = LLM(
        model=model_name, 
        # tokenizer=model_name, 
        tensor_parallel_size=1) 



INFO 02-24 16:05:54 config.py:999] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 02-24 16:05:54 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='meta-llama/Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  3.29it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  3.27it/s]


INFO 02-24 16:05:57 model_runner.py:926] Loading model weights took 2.3185 GB





INFO 02-24 16:05:58 gpu_executor.py:122] # GPU blocks: 74384, # CPU blocks: 8192
INFO 02-24 16:05:59 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-24 16:05:59 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-24 16:06:08 model_runner.py:1335] Graph capturing finished in 9 secs.


In [22]:
outputs = model.generate(tokenized_prompts, sampling_params)
# outputs = model.generate(['what is the capital of India','Where is Delhi'], sampling_params)

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it, est. speed input: 6.21 toks/s, output: 222.10 toks/s]


In [30]:
for out in outputs:
    print(out)

RequestOutput(request_id=6, prompt=None, prompt_token_ids=[12840, 374, 279, 6864, 315, 6890, 30], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' New Delhi. The capital of India is New Delhi. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New Delhi is the capital of India. New D

In [11]:
# all_outputs = llama_forward(
#         prompts=tokenized_output, 
#         model_path=model_name, 
#         max_tokens=256, 
#         temperature=0, 
#         n_samples=1,
#         n_gpus=1
#     )

NameError: name 'llama_forward' is not defined

In [25]:
for ex_outputs in all_outputs:
    print(ex_outputs.prompt)
    print(ex_outputs.outputs[0].text)

None
Janet has 16 eggs per day. She eats 3 for breakfast, so she has 16 - 3 = 13 eggs left. She bakes muffins for 4, so she has 13 - 4 = 9 eggs left. She sells 9 eggs at $2 each, so she makes 9 x 2 = 18 dollars per day. The final answer is 18


In [1]:
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

# Initialize the LLM
llm = LLM(model="facebook/opt-125m")

# Get the tokenizer
tokenizer = llm.get_tokenizer()

# Define your input strings
input_strings = [
    "Translate the following English text to French: 'Hello, how are you?'",
    "What is the capital of France?",
    "Explain the concept of machine learning in simple terms."
]

# Tokenize the input strings and create TokensPrompt instances
tokenized_prompts = [
    TokensPrompt(prompt_token_ids=tokenizer.encode(input_string))
    for input_string in input_strings
]

# Define sampling parameters
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)

# Generate the outputs for the batch
outputs = llm.generate(tokenized_prompts, sampling_params)

# Print the results
for input_string, output in zip(input_strings, outputs):
    generated_text = output.outputs[0].text
    print(f"Input: {input_string}")
    print(f"Generated text: {generated_text}")
    print("---")


  from .autonotebook import tqdm as notebook_tqdm
2025-02-24 16:02:37,552	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 02-24 16:02:38 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=facebook/opt-125m, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_caching=False, use_async_output_proc=True)
INFO 02-24 1

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
  state = torch.load(bin_file, map_location="cpu")
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.05it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.01it/s]



INFO 02-24 16:02:40 model_runner.py:926] Loading model weights took 0.2389 GB
INFO 02-24 16:02:40 gpu_executor.py:122] # GPU blocks: 71292, # CPU blocks: 7281
INFO 02-24 16:02:42 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-24 16:02:42 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-24 16:02:49 model_runner.py:1335] Graph capturing finished in 7 secs.


Processed prompts: 100%|██████████| 3/3 [00:00<00:00, 21.75it/s, est. speed input: 268.33 toks/s, output: 1087.78 toks/s]

Input: Translate the following English text to French: 'Hello, how are you?'
Generated text: 

It is important to note that we speak French with two genders: on the one hand you have the masculine (Latin for "fellow", and the feminine "other") and on the other you have the feminine "speaker". Therefore,
---
Input: What is the capital of France?
Generated text: 
The capital of France. It's the second largest country after Belgium and is the second biggest economy in the world. It's also the third largest nation in the world and it's the third largest country in the world.
> it's the
---
Input: Explain the concept of machine learning in simple terms.
Generated text: 

In the past, a scientific exploration of machine learning used to be regarded as an antiquated notion. It was not the first time that a researcher had to explain how an algorithm could be used to generate an algorithm.

However, in
---





In [2]:
tokenized_prompts

[{'prompt_token_ids': [2,
   19163,
   19593,
   5,
   511,
   2370,
   2788,
   7,
   1515,
   35,
   128,
   31414,
   6,
   141,
   32,
   47,
   6600]},
 {'prompt_token_ids': [2, 2264, 16, 5, 812, 9, 1470, 116]},
 {'prompt_token_ids': [2,
   43043,
   1851,
   5,
   4286,
   9,
   3563,
   2239,
   11,
   2007,
   1110,
   4]}]