<a href="https://colab.research.google.com/github/Midhilesh4890/vLLM-Vs-Standard-Transformers/blob/main/vLLM_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install vllm transformers accelerate sentencepiece



In [None]:
import os
import time
import json
import textwrap
import torch
from typing import List, Dict, Any
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer, AutoModelForCausalLM

# Global configuration
# Model identifier used as the default by both the vLLM and Transformers loaders.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Prompt prefix: a system message followed by the opening tag of the user turn.
# format_prompt() appends the user's text and the assistant tag after it.
SYSTEM_PROMPT = "<|system|>\nYou are a helpful AI assistant.</s>\n<|user|>\n"

def format_prompt(human_prompt: str) -> str:
    """Wrap a user message in the chat markup expected by the TinyLlama model.

    The result is the shared system prefix, the user's text, an end-of-turn
    marker, and the tag that opens the assistant's reply.
    """
    template = "{}{}</s>\n<|assistant|>\n"
    return template.format(SYSTEM_PROMPT, human_prompt)

def setup_vllm_model(model_name: str = MODEL_NAME):
    """Build the vLLM engine together with the sampling settings for generation.

    Args:
        model_name: Model identifier handed to ``LLM``.

    Returns:
        An ``(llm, sampling_params)`` tuple.
    """
    print(f"Loading vLLM model: {model_name}")
    engine = LLM(model=model_name)

    # Sampling settings mirror the ones used for the Transformers baseline.
    sampling_options = {
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
        "top_k": 50,
    }
    return engine, SamplingParams(**sampling_options)

INFO 08-25 21:51:55 [__init__.py:241] Automatically detected platform cuda.


In [None]:
def setup_transformers_model(model_name: str = MODEL_NAME):
    """Load the Hugging Face causal-LM and its tokenizer.

    Args:
        model_name: Model identifier handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"Loading Transformers model: {model_name}")

    tok = AutoTokenizer.from_pretrained(model_name)
    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
    causal_lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    # Padding is requested at tokenization time, so make sure a pad token
    # exists; reuse EOS when the tokenizer does not define one.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    return causal_lm, tok

In [None]:
def generate_vllm(llm, sampling_params, prompt: str) -> str:
    """Run a single prompt through vLLM and return the generated text.

    The prompt is wrapped with ``format_prompt`` and submitted as a batch of
    one; the text of the first completion of that request is returned.
    """
    batch = [format_prompt(prompt)]
    first_request = llm.generate(batch, sampling_params)[0]
    return first_request.outputs[0].text

In [None]:
def generate_transformers(model, tokenizer, prompt: str) -> str:
    """Generate a reply for ``prompt`` with a Hugging Face causal-LM.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must have a pad token set.
        prompt: Raw user text; it is wrapped with ``format_prompt`` first.

    Returns:
        Only the newly generated text, decoded with special tokens removed.
    """
    formatted_prompt = format_prompt(prompt)

    # The tokenizer returns CPU tensors; move them to the device the model
    # lives on so generate() does not receive inputs on a mismatched device.
    inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True).to(model.device)
    prompt_length = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            top_k=50,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt + continuation; drop the prompt tokens.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [None]:
def benchmark_model(generate_func, prompts: List[str], model_name: str) -> Dict[str, Any]:
    """Time ``generate_func`` on each prompt and print a per-prompt report.

    Args:
        generate_func: Callable taking one prompt string and returning the
            generated text as a string.
        prompts: Prompts to run, in order. May be empty.
        model_name: Label used in the printed report and in the result.

    Returns:
        A dict with keys ``model_name``, ``results`` (one dict per prompt with
        ``prompt``, ``response`` and ``time`` in seconds), ``total_time``
        (seconds for the whole loop, including printing) and ``avg_time``
        (mean per-prompt generation time; ``0.0`` when no prompts were given).
    """
    print(f"\n{'='*50}")
    print(f"Benchmarking {model_name}")
    print(f"{'='*50}")

    results = []
    # perf_counter is monotonic and high-resolution, so intervals cannot be
    # distorted by wall-clock adjustments the way time.time() differences can.
    total_start_time = time.perf_counter()

    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")
        print("-" * 50)

        start_time = time.perf_counter()
        response = generate_func(prompt)
        generation_time = time.perf_counter() - start_time

        wrapped_response = textwrap.fill(response.strip(), width=80)
        print(f"Response: {wrapped_response}")
        print(f"Generation time: {generation_time:.2f} seconds")

        results.append({
            'prompt': prompt,
            'response': response,
            'time': generation_time,
        })

    total_time = time.perf_counter() - total_start_time
    # Guard the mean so an empty prompt list does not raise ZeroDivisionError.
    avg_time = sum(r['time'] for r in results) / len(results) if results else 0.0

    print(f"\nSummary for {model_name}:")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Average time per prompt: {avg_time:.2f} seconds")

    return {
        'model_name': model_name,
        'results': results,
        'total_time': total_time,
        'avg_time': avg_time,
    }

In [None]:
def _speedup_ratio(baseline_seconds: float, candidate_seconds: float) -> float:
    """Return baseline/candidate, or ``inf`` when the candidate time is not positive."""
    if candidate_seconds <= 0:
        return float("inf")
    return baseline_seconds / candidate_seconds


def compare_models():
    """Benchmark vLLM against Transformers on a fixed prompt set and print a report.

    Both backends are loaded with their default model, run over the same
    prompts through ``benchmark_model``, and compared on average, total and
    per-prompt generation time.

    Returns:
        ``(vllm_results, transformers_results)`` as returned by
        ``benchmark_model``, or ``(None, None)`` if any step raised.
    """

    test_prompts = [
        "What are the differences between alpacas, vicunas and llamas?",
        "What is the capital of England?",
        "Tell me about Homer from The Simpsons",
        "Answer step by step: If a cafeteria had 23 apples, used 20 for lunch, and bought 6 more, how many apples do they have?",
        "Write a short plan for a 3-day trip to London"
    ]

    try:
        # Setup and benchmark vLLM
        print("Setting up vLLM...")
        llm, sampling_params = setup_vllm_model()

        def vllm_generate(prompt):
            return generate_vllm(llm, sampling_params, prompt)

        vllm_results = benchmark_model(vllm_generate, test_prompts, "vLLM")

        # Setup and benchmark Transformers
        # NOTE(review): the vLLM engine is still referenced while the
        # Transformers model loads, so both share the GPU — confirm this does
        # not skew the Transformers timings on small devices.
        print("\nSetting up Transformers...")
        hf_model, tokenizer = setup_transformers_model()

        def transformers_generate(prompt):
            return generate_transformers(hf_model, tokenizer, prompt)

        transformers_results = benchmark_model(transformers_generate, test_prompts, "Transformers")

        # Performance comparison
        print(f"\n{'='*60}")
        print("PERFORMANCE COMPARISON")
        print(f"{'='*60}")

        # Guarded division: a zero timing must not discard finished benchmarks.
        speedup = _speedup_ratio(transformers_results['avg_time'], vllm_results['avg_time'])

        print(f"vLLM average time:         {vllm_results['avg_time']:.2f} seconds")
        print(f"Transformers average time: {transformers_results['avg_time']:.2f} seconds")
        print(f"vLLM speedup:              {speedup:.2f}x faster")

        print(f"\nvLLM total time:           {vllm_results['total_time']:.2f} seconds")
        print(f"Transformers total time:   {transformers_results['total_time']:.2f} seconds")

        # Detailed comparison per prompt
        print(f"\nDETAILED COMPARISON:")
        print(f"{'Prompt':<50} {'vLLM (s)':<10} {'HF (s)':<10} {'Speedup':<10}")
        print("-" * 80)

        per_prompt = zip(test_prompts, vllm_results['results'], transformers_results['results'])
        for prompt, vllm_entry, hf_entry in per_prompt:
            vllm_time = vllm_entry['time']
            hf_time = hf_entry['time']
            # Attach the "x" before padding so it stays next to the number.
            speedup_cell = f"{_speedup_ratio(hf_time, vllm_time):.2f}x"

            short_prompt = prompt[:47] + "..." if len(prompt) > 50 else prompt
            print(f"{short_prompt:<50} {vllm_time:<10.2f} {hf_time:<10.2f} {speedup_cell:<10}")

        return vllm_results, transformers_results

    except Exception as e:
        # Best-effort boundary for notebook use: report the failure with its
        # traceback instead of raising, and signal it with a (None, None) result.
        import traceback

        print(f"Error during comparison: {e}")
        traceback.print_exc()
        return None, None

def quick_vllm_test():
    """Smoke-test vLLM alone: load the model and time three short prompts.

    Prints each prompt, its wrapped response and the generation time; returns
    nothing.
    """
    print("Quick vLLM Test")
    print("="*30)

    llm, sampling_params = setup_vllm_model()

    sample_prompts = (
        "Hello, how are you?",
        "What is the capital of France?",
        "Explain quantum computing in simple terms",
    )

    for prompt in sample_prompts:
        print(f"\nPrompt: {prompt}")
        print("-" * 40)

        # Time only the generation call itself, not the printing around it.
        started = time.time()
        response = generate_vllm(llm, sampling_params, prompt)
        elapsed = time.time() - started

        print(f"Response: {textwrap.fill(response.strip(), width=80)}")
        print(f"Time: {elapsed:.2f} seconds")



In [None]:

# Run full comparison (comment out if you only want vLLM)
# compare_models() returns (None, None) when a step fails, so check the
# results for None before using them in later cells.
vllm_results, hf_results = compare_models()

# Or run quick vLLM test only
# quick_vllm_test()

Setting up vLLM...
Loading vLLM model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
INFO 08-25 21:51:56 [utils.py:326] non-default args: {'model': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'disable_log_stats': True}
INFO 08-25 21:52:13 [__init__.py:711] Resolved architecture: LlamaForCausalLM
INFO 08-25 21:52:13 [__init__.py:1750] Using max model len 2048
INFO 08-25 21:52:14 [llm_engine.py:222] Initializing a V0 LLM engine (v0.10.1.1) with config: model='TinyLlama/TinyLlama-1.1B-Chat-v1.0', speculative_config=None, tokenizer='TinyLlama/TinyLlama-1.1B-Chat-v1.0', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=F

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-25 21:52:22 [default_loader.py:262] Loading weights took 3.04 seconds
INFO 08-25 21:52:22 [model_runner.py:1112] Model loading took 2.0513 GiB and 4.245603 seconds
INFO 08-25 21:52:24 [worker.py:295] Memory profiling takes 0.96 seconds
INFO 08-25 21:52:24 [worker.py:295] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 08-25 21:52:24 [worker.py:295] model weights take 2.05GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 0.31GiB; the rest of the memory reserved for KV Cache is 10.86GiB.
INFO 08-25 21:52:24 [executor_base.py:114] # cuda blocks: 32357, # CPU blocks: 11915
INFO 08-25 21:52:24 [executor_base.py:119] Maximum concurrency for 2048 tokens per request: 252.79x
INFO 08-25 21:52:27 [model_runner.py:1383] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' i

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 08-25 21:53:00 [model_runner.py:1535] Graph capturing finished in 33 secs, took 0.13 GiB
INFO 08-25 21:53:00 [llm_engine.py:417] init engine (profile, create kv cache, warmup model) took 37.61 seconds
INFO 08-25 21:53:01 [llm.py:298] Supported_tasks: ['generate']

Benchmarking vLLM

Prompt 1: What are the differences between alpacas, vicunas and llamas?
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Response: Alpacas and vicunas are two types of domesticated animals commonly used for
textile production, while llamas are a species of wild camelid that is not
commonly used for textile production. Here are some differences between alpacas,
vicunas, and llamas:  1. Breeding: Alpacas and vicunas are both bred by humans,
but the two species differ in their breeding practices. Alpacas are primarily
bred for their wool, while vicunas are bred for their meat and wool.  2. Feed:
Alpacas and vicunas both eat a grain-based diet, but their feeding habits
differ. Alpacas feed on a variety of grains, including barley, corn, and wheat,
while vicunas feed on a mixture of grasses, herbs, and other plant materials.
3. Growth rate: Alpacas and vicunas grow at different rates. Alpacas typically
grow more slowly than vicunas, taking around 20 months to reach full maturity.
4. Colors: Alpacas and vicunas can come in a range of colors, including white,
brown, black, and gray. Vicunas are not typically wh

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Response: The capital of England is London.
Generation time: 0.17 seconds

Prompt 3: Tell me about Homer from The Simpsons
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Response: Homer Simpson is a fictional character in the animated sitcom The Simpsons
created by Matt Groening. He is the patriarch of the Simpson family, who is
married to Marge. Homer is a loving and carefree person who is always looking
for a fun and exciting adventure. He is known for his quick wit, sarcastic
humor, and love for food. Homer's most famous quotes include "You're a good man,
Homer," "I love my wife and I'm not ashamed to say I love her," and "I like to
think I'm a good person, but then I think about my stomach, and I change my
mind." Homer is also a fan of the Simpsons show, which he watches religiously
every week.
Generation time: 2.78 seconds

Prompt 4: Answer step by step: If a cafeteria had 23 apples, used 20 for lunch, and bought 6 more, how many apples do they have?
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Response: To answer this question, you need to first find the number of apples that were
used in the lunch:  - 20 apples were used for lunch - 6 apples were used for 6
more apples, resulting in 12 apples  So, the total number of apples in the
cafeteria after using 20 for lunch and buying 6 more is:  - 20 apples - 12
apples  Therefore, the total number of apples in the cafeteria is:  - 23 apples
The final answer is: 23 apples.
Generation time: 2.13 seconds

Prompt 5: Write a short plan for a 3-day trip to London
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Response: Day 1: Arrival in London  Welcome to London, the capital city of England. After
landing at the Heathrow Airport, you will be picked up by a chauffeur who will
take you to your hotel for the night.  Day 2: Exploring London  After breakfast,
you will check into your hotel and spend the day exploring London. You can take
a stroll through the famous streets of the city, visit iconic landmarks like the
London Eye, Buckingham Palace, and the Tower of London, and explore the local
markets and shops. You may also like to take a tour of the city's famous
landmarks like the Tower Bridge, Big Ben, or the London Eye.  Day 3: Shopping in
London  After a full day of sightseeing, you will be ready to shop in the city.
You can visit some of the popular shopping areas, such as Oxford Street, Regent
Street, or Carnaby Street. You can also try local specialties like fish and
chips, poutine, or sushi.  Day 4: Afternoon tea at the iconic Harrods  If you
still have some energy left, you can spend 




Benchmarking Transformers

Prompt 1: What are the differences between alpacas, vicunas and llamas?
--------------------------------------------------
Response: Alpacas, vicunas, and llamas are all types of llamas that come from different
regions and breeds. Here are some differences between them:  1. Breeds: Alpacas
are the oldest and most widely known variety of llamas. They are known for their
gentle and soft coat, which is made up of fleece that is soft, warm, and very
light. Vicunas are a breed of llama that originates from the Andes Mountains of
Peru. They have a thick, coarse, and woolly coat that is a combination of brown,
black, and white. Llamas are also known for their warm, soft coats, but they
come in different colors, including white, brown, black, and gray.  2. Type:
Alpacas are known for their long, soft, and fluffy coats. They are used for
their fiber, which is used to make yarn, fabric, and blankets. Vicunas, on the
other hand, are used primarily for their wool, which