In [None]:
import asyncio

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM


async def call_vllm_api(
    engine: AsyncLLM,
    prompt: str,
    request_id: str,
    verbose: bool = True,
) -> str | None:
    """Stream a completion for *prompt* from *engine* and return the full text.

    This is a coroutine: it must be awaited (or driven with ``asyncio.run``)
    because it consumes the async generator returned by ``engine.generate``.

    Args:
        engine: Engine whose ``generate`` async generator is consumed.
        prompt: Prompt text forwarded to ``engine.generate``.
        request_id: Identifier forwarded to ``engine.generate``.
        verbose: When true (the default), echo each new text chunk to stdout
            as soon as it arrives.

    Returns:
        Every text chunk received, concatenated in arrival order (an empty
        string if the engine produced no text).

    Raises:
        Exception: Any error raised while streaming is reported on stdout and
            then re-raised unchanged.
    """
    sampling_params = SamplingParams(
        temperature=0.1,              # randomness of the sampling
        min_p=0.05,
        top_p=0.9,
        skip_special_tokens=True,     # whether to skip special tokens in the output
        max_tokens=32768,
        # Request only the newly generated text on each iteration; without
        # this, appending/printing `completion.text` per iteration would
        # repeat text that was already emitted.
        output_kind=RequestOutputKind.DELTA,
    )

    # NOTE(review): chunks from every completion in an output are appended to
    # the same list; no `n` is set here, so presumably there is a single
    # completion per request - confirm before requesting several.
    chunks: list[str] = []
    try:
        async for output in engine.generate(
            request_id=request_id, prompt=prompt, sampling_params=sampling_params
        ):
            for completion in output.outputs:
                # In DELTA mode this is only the text generated since the
                # previous iteration.
                new_text = completion.text
                if not new_text:
                    continue
                chunks.append(new_text)
                if verbose:
                    print(new_text, end="", flush=True)
    except Exception as e:
        print(f"\n❌ Error during streaming: {e}")
        raise

    return "".join(chunks)
    
