## Install vLLM
https://github.com/vllm-project/vllm

vLLM is known for PagedAttention, its technique for managing the attention KV cache efficiently.

In [1]:
!pip install vllm



In [2]:
from vllm import LLM, SamplingParams

In [3]:
# Four short prompts for the model to complete.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Sampling configuration handed to the generation call together with the prompts.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
)

In [4]:
# Load Llama-2-7B with a 128-token context window. This matches the engine log
# recorded for this cell (max_seq_len=128, 24 GPU KV-cache blocks) and the
# `--max-model-len 128` used when starting the API server later in the notebook.
# NOTE(review): with so few KV-cache blocks available, a much larger window
# (e.g. 4096) presumably does not fit on this GPU — confirm on your hardware.
llm = LLM(model="meta-llama/Llama-2-7b-hf", max_model_len=128)

INFO 03-05 22:46:57 llm_engine.py:87] Initializing an LLM engine with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=128, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-05 22:47:04 weight_utils.py:163] Using model weights format ['*.safetensors']
INFO 03-05 22:48:01 llm_engine.py:357] # GPU blocks: 24, # CPU blocks: 512
INFO 03-05 22:48:04 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-05 22:48:04 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider d

In [5]:
# Generate completions for every prompt in a single call.
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the text of its first returned completion.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(f"Prompt: {request_output.prompt!r}, Generated text: {first_completion.text!r}")

Processed prompts: 100%|██████████| 4/4 [00:01<00:00,  3.79it/s]

Prompt: 'Hello, my name is', Generated text: ' Dustin Nelson and I’m going to be your tutor!\n'
Prompt: 'The president of the United States is', Generated text: ' a member of an organization that has as one of its primary goals the overthrow'
Prompt: 'The capital of France is', Generated text: ' famous for its cafes and restaurants, and more than 20'
Prompt: 'The future of AI is', Generated text: ' a future where humans and robots work together.\nFrom playing chess and'





# Run the server
1. Restart the runtime to free the GPU memory held by the previous cells.
2. Provide your Hugging Face access token through the `HF_TOKEN` environment variable.
3. Set the maximum model length with `--max-model-len`.

In [None]:
%env HF_TOKEN=
!python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-hf --max-model-len 128

env: HF_TOKEN=hf_*****
INFO 03-05 23:00:52 api_server.py:228] args: Namespace(host=None, port=8000, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, served_model_name=None, lora_modules=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, root_path=None, middleware=[], model='meta-llama/Llama-2-7b-hf', tokenizer=None, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', dtype='auto', kv_cache_dtype='auto', max_model_len=128, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, block_size=16, seed=0, swap_space=4, gpu_memory_utilization=0.9, max_num_batched_tokens=None, max_num_seqs=256, max_paddings=256, disable_log_stats=False, quantization=None, enforce_eager=False, max_context_len_to_capture=8192, disable_custom_all_reduce=False, enable_lora=F

# Run this command in a terminal to call our vLLM server

In [None]:
%%shell
# Send a completion request to the local server's /v1/completions endpoint.
# The JSON payload names the served model, the prompt and the decoding limits.
curl http://localhost:8000/v1/completions \
  --header "Content-Type: application/json" \
  --data '{
"model": "meta-llama/Llama-2-7b-hf",
"prompt": "San Francisco is a",
"max_tokens": 100,
"temperature": 0
}'