# Cloud LLM Inference Benchmark - Results Visualization

**Course:** MSML 650 - Cloud Computing  
**Project:** Cloud Deployment of Model Serving Platforms: Benchmarking vLLM and SGLang

This notebook visualizes and analyzes the benchmark results comparing vLLM and SGLang inference frameworks.


## 1. Setup and Imports


In [None]:
import json
import os
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: the base style is applied first, then the seaborn palette,
# because both touch the color cycle and the later call wins.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# pandas display: no cap on the number of columns shown, no fixed output width
pd.options.display.max_columns = None
pd.options.display.width = None

print("Libraries imported successfully!")


## 2. Load Benchmark Results


In [None]:
# Define results directory
RESULTS_DIR = Path("../results")

def load_results(framework: str) -> list:
    """Load all benchmark results for a framework.

    Reads every ``*.json`` file directly inside ``RESULTS_DIR / framework``.
    Each file is expected to hold a single JSON object; the file name is
    stored on the loaded dict under the ``'source_file'`` key.

    Loading is best-effort: a file that cannot be read, is not valid JSON,
    or does not contain a JSON object is reported with ``print`` and skipped.

    Args:
        framework: Name of the sub-directory of ``RESULTS_DIR`` to read.

    Returns:
        A list of dicts, one per successfully loaded file, ordered by file
        path. Empty when the directory is missing or has no usable files.
    """
    results = []
    framework_dir = RESULTS_DIR / framework

    if not framework_dir.is_dir():
        return results

    # Sorted so the load order (and every table built from it) does not depend
    # on the order in which the filesystem happens to enumerate the files.
    for result_file in sorted(framework_dir.glob("*.json")):
        try:
            # Explicit encoding: the default one is platform-dependent.
            with open(result_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print(f"Error loading {result_file}: {e}")
            continue

        # A top-level list/number/string cannot carry the 'source_file' key
        # and has no metric fields to read later, so skip it explicitly.
        if not isinstance(data, dict):
            print(f"Error loading {result_file}: expected a JSON object, got {type(data).__name__}")
            continue

        data['source_file'] = result_file.name
        results.append(data)

    return results

# Read the saved benchmark runs of both frameworks from disk
vllm_results, sglang_results = load_results('vllm'), load_results('sglang')

# Report how many result files were picked up for each framework
print(
    f"Loaded {len(vllm_results)} vLLM results",
    f"Loaded {len(sglang_results)} SGLang results",
    sep="\n",
)


In [None]:
# Convert to DataFrames
def results_to_dataframe(results: list, framework: str) -> pd.DataFrame:
    """Flatten a list of result dicts into one DataFrame row per result.

    Args:
        results: Result dicts as loaded from the benchmark JSON files.
        framework: Label written into the ``framework`` column of every row.

    Returns:
        A DataFrame with a fixed set of columns. ``workload`` is read from
        the ``workload_name`` key (``'unknown'`` when absent), metric fields
        missing from a result become ``0`` and a missing ``timestamp``
        becomes ``''``. An empty, column-less DataFrame is returned when
        ``results`` is empty.
    """
    if not results:
        return pd.DataFrame()

    # Fields copied verbatim from each result, falling back to 0 when missing.
    # The tuple order is the column order of the resulting frame.
    metric_fields = (
        'throughput_tps', 'throughput_rps',
        'avg_latency', 'min_latency', 'max_latency',
        'p50_latency', 'p90_latency', 'p95_latency', 'p99_latency',
        'total_requests', 'successful_requests', 'failed_requests',
        'total_time', 'total_output_tokens',
    )

    rows = []
    for entry in results:
        row = {
            'framework': framework,
            'workload': entry.get('workload_name', 'unknown'),
        }
        row.update({field: entry.get(field, 0) for field in metric_fields})
        row['timestamp'] = entry.get('timestamp', '')
        rows.append(row)

    return pd.DataFrame(rows)

# Create DataFrames
df_vllm = results_to_dataframe(vllm_results, 'vLLM')
df_sglang = results_to_dataframe(sglang_results, 'SGLang')

# Combine. Frameworks without any results are left out of the concat:
# passing empty frames to pd.concat is deprecated on recent pandas versions
# (it can emit a FutureWarning), and pd.concat([]) itself raises, hence the
# explicit fallback to an empty frame when nothing was loaded at all.
non_empty_frames = [frame for frame in (df_vllm, df_sglang) if not frame.empty]
df_all = pd.concat(non_empty_frames, ignore_index=True) if non_empty_frames else pd.DataFrame()

# Calculate success rate
if not df_all.empty:
    # Mask non-positive totals first: dividing by zero would otherwise yield
    # inf (which fillna does not catch) for rows with no recorded requests.
    total_requests = df_all['total_requests'].where(df_all['total_requests'] > 0)
    df_all['success_rate'] = (df_all['successful_requests'] / total_requests * 100).fillna(0)

print("Data loaded and processed!")
# Trailing expression: shows the preview as the cell output when there is
# data, otherwise prints a hint (print returns None, so nothing is rendered).
df_all.head() if not df_all.empty else print("No data available yet - run benchmarks first!")


In [None]:
# Bar chart comparing throughput
if not df_all.empty:
    # The figure is created only when there is something to draw; creating it
    # before this check would leave a blank two-panel figure in the cell
    # output whenever no benchmark data has been loaded.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # One entry per panel, left to right: (column, title, y-axis label)
    panels = [
        ('throughput_tps', 'Average Throughput (Tokens per Second)', 'Tokens/Second'),
        ('throughput_rps', 'Average Throughput (Requests per Second)', 'Requests/Second'),
    ]

    for ax, (column, title, ylabel) in zip(axes, panels):
        sns.barplot(data=df_all, x='framework', y=column, ax=ax, palette='Set2')
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Framework')
        ax.set_ylabel(ylabel)

        # Add value labels centered on top of each bar
        for p in ax.patches:
            ax.annotate(f'{p.get_height():.2f}',
                        (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='bottom', fontsize=12)

    plt.tight_layout()
    plt.savefig('../results/throughput_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Throughput comparison saved to results/throughput_comparison.png")
else:
    print("No data available - run benchmarks first!")


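## 4. Latency Distribution

The latency percentile comparison chart is generated by the latency cell further down in this notebook, after the Summary Report section.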


## 5. Cost Efficiency Analysis


In [None]:
# Hourly price in USD for each GPU instance type, keyed by instance name
GPU_COSTS = {
    'g4dn.xlarge': 0.526,   # NVIDIA T4
    'g5.xlarge': 1.006,     # NVIDIA A10G
    'g5.2xlarge': 1.212,    # NVIDIA A10G
    'p3.2xlarge': 3.06,     # NVIDIA V100
    'p4d.24xlarge': 32.77,  # NVIDIA A100
}

# Cost of generating one million tokens at a given throughput
def calculate_cost_per_million_tokens(throughput_tps, hourly_cost):
    """Return the cost of generating one million tokens.

    Args:
        throughput_tps: Sustained throughput in tokens per second.
        hourly_cost: Price of the instance per hour.

    Returns:
        ``hourly_cost`` divided by the tokens produced in one hour, scaled
        to one million tokens. ``float('inf')`` when the throughput is zero
        or negative, since no tokens would ever be produced.
    """
    seconds_per_hour = 3600
    return (
        float('inf')
        if throughput_tps <= 0
        else hourly_cost / (throughput_tps * seconds_per_hour) * 1_000_000
    )

# Calculate for g5.xlarge (common choice)
instance_type = 'g5.xlarge'
hourly_cost = GPU_COSTS[instance_type]

if not df_all.empty:
    # Mean throughput per framework (averaged over all loaded runs), then the
    # cost of one million tokens at that throughput on the chosen instance.
    cost_data = df_all.groupby('framework')['throughput_tps'].mean().reset_index()
    cost_data['cost_per_million_tokens'] = cost_data['throughput_tps'].apply(
        lambda x: calculate_cost_per_million_tokens(x, hourly_cost)
    )
    cost_data['instance_type'] = instance_type
    cost_data['hourly_cost'] = hourly_cost

    print(f"\nCost Analysis ({instance_type} @ ${hourly_cost}/hr):")
    print(cost_data[['framework', 'throughput_tps', 'cost_per_million_tokens']].to_string(index=False))

    # Visualize
    # Colors are looked up by framework name rather than by bar position:
    # groupby sorts 'SGLang' before 'vLLM', so positional colors would give
    # each framework the opposite color from the one it has in the latency
    # chart (vLLM green, SGLang blue).
    framework_colors = {'vLLM': '#2ecc71', 'SGLang': '#3498db'}
    fallback_color = '#95a5a6'

    # An infinite cost (zero throughput) or NaN cannot be drawn as a bar, so
    # only finite values are plotted; the printed table above still lists them.
    plot_data = cost_data[np.isfinite(cost_data['cost_per_million_tokens'])]

    fig, ax = plt.subplots(figsize=(8, 5))
    bars = ax.bar(
        plot_data['framework'],
        plot_data['cost_per_million_tokens'],
        color=[framework_colors.get(name, fallback_color) for name in plot_data['framework']],
    )
    ax.set_title(f'Cost per Million Tokens ({instance_type})', fontsize=14, fontweight='bold')
    ax.set_xlabel('Framework')
    ax.set_ylabel('Cost (USD)')

    # Value label just above each bar
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'${height:.2f}',
                   xy=(bar.get_x() + bar.get_width() / 2, height),
                   xytext=(0, 3), textcoords="offset points",
                   ha='center', va='bottom', fontsize=12)

    plt.tight_layout()
    plt.savefig('../results/cost_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No data available - run benchmarks first!")


## 6. Summary Report


In [None]:
# Generate summary report
if not df_all.empty:
    print("="*60)
    print("BENCHMARK SUMMARY REPORT")
    print("="*60)

    # Per-framework averages over all loaded benchmark runs
    for framework in df_all['framework'].unique():
        fw_data = df_all[df_all['framework'] == framework]

        print(f"\n{framework}")
        print("-"*40)
        print(f"  Number of benchmarks: {len(fw_data)}")
        print(f"  Avg Throughput (TPS): {fw_data['throughput_tps'].mean():.2f}")
        print(f"  Avg Latency: {fw_data['avg_latency'].mean():.3f}s")
        print(f"  P95 Latency: {fw_data['p95_latency'].mean():.3f}s")
        print(f"  Avg Success Rate: {fw_data['success_rate'].mean():.1f}%")
        print(f"  Total Tokens Generated: {fw_data['total_output_tokens'].sum():,}")

    # Determine winner (only meaningful when both frameworks have results)
    if set(df_all['framework'].unique()) == {'vLLM', 'SGLang'}:
        print("\n" + "="*60)
        print("CONCLUSION")
        print("="*60)

        vllm_tps = df_all[df_all['framework'] == 'vLLM']['throughput_tps'].mean()
        sglang_tps = df_all[df_all['framework'] == 'SGLang']['throughput_tps'].mean()

        if vllm_tps == sglang_tps:
            # A tie must not be reported as a win for either side
            print("\n✓ vLLM and SGLang have the same average throughput")
        else:
            if vllm_tps > sglang_tps:
                winner, loser = 'vLLM', 'SGLang'
                winner_tps, loser_tps = vllm_tps, sglang_tps
            else:
                winner, loser = 'SGLang', 'vLLM'
                winner_tps, loser_tps = sglang_tps, vllm_tps

            if loser_tps > 0:
                diff = ((winner_tps - loser_tps) / loser_tps) * 100
                print(f"\n✓ {winner} outperforms {loser} by {diff:.1f}% in throughput")
            else:
                # No relative margin exists against a zero baseline; printing
                # "by 0.0%" here would misstate the result.
                print(f"\n✓ {winner} outperforms {loser} in throughput "
                      f"({loser} recorded no throughput, so no percentage is given)")

    # Save to CSV
    df_all.to_csv('../results/benchmark_results_combined.csv', index=False)
    print("\nAll results saved to results/benchmark_results_combined.csv")
else:
    print("No benchmark data available yet. Run benchmarks first!")


In [None]:
# Latency percentiles comparison
if df_all.empty:
    print("No data available - run benchmarks first!")
else:
    latency_cols = ['avg_latency', 'p50_latency', 'p90_latency', 'p95_latency', 'p99_latency']
    # Rows: latency metrics, columns: frameworks (mean over all loaded runs)
    latency_data = df_all.groupby('framework')[latency_cols].mean().T

    fig, ax = plt.subplots(figsize=(10, 6))

    positions = np.arange(len(latency_cols))
    bar_width = 0.35

    # One group of bars per framework, drawn side by side around each tick:
    # (framework column, horizontal shift, bar color)
    bar_series = (
        ('vLLM', -bar_width / 2, '#2ecc71'),
        ('SGLang', bar_width / 2, '#3498db'),
    )
    for series_name, shift, series_color in bar_series:
        # A framework without results has no column and is simply left out
        if series_name in latency_data.columns:
            ax.bar(positions + shift, latency_data[series_name], bar_width,
                   label=series_name, color=series_color)

    ax.set_xlabel('Latency Metric')
    ax.set_ylabel('Latency (seconds)')
    ax.set_title('Latency Distribution Comparison', fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(['Average', 'P50', 'P90', 'P95', 'P99'])
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.savefig('../results/latency_distribution.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Latency distribution saved to results/latency_distribution.png")
