# LLM OptiBench: MLflow Analysis

This notebook reads run data from the local MLflow tracking store (`../mlruns`) to analyze and visualize the trade-offs between different LLM optimization techniques (Baseline, Quantization, Pruning).

In [1]:
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as seaborn

# Plot appearance: apply matplotlib's ggplot style sheet, then seaborn's
# whitegrid theme (the order of these two calls is kept as-is).
plt.style.use('ggplot')
seaborn.set_theme(style="whitegrid")

# Location MLflow should read runs from; kept in one place so it is easy to change.
TRACKING_URI = "../mlruns"
mlflow.set_tracking_uri(TRACKING_URI)

# Echo the URI MLflow actually registered as a quick sanity check.
print(f"Connected to MLflow tracking URI: {mlflow.get_tracking_uri()}")

Connected to MLflow tracking URI: ../mlruns


## 1. Fetch Experiment Data

In [2]:
experiment_name = "LLM_OptiBench"
experiment = mlflow.get_experiment_by_name(experiment_name)

# Run names that are kept even when the run was not logged as a nested (child) run.
method_run_names = ["Baseline_FP16", "Quantized_4bit_NF4", "Pruned_Unstructured"]

# Raw MLflow column name -> label used by the table and plots in this notebook.
column_labels = {
    "tags.mlflow.runName": "Method",
    "metrics.f1_score": "F1 Score",
    "metrics.exact_match": "Exact Match",
    # NOTE(review): the metric key says "latency" but the label's unit is tok/s
    # (a throughput unit) — confirm what main.py actually logs here.
    "metrics.avg_latency": "Latency (tok/s)",
    "metrics.peak_vram_gb": "Peak VRAM (GB)",
    "metrics.model_size_gb": "Model Size (GB)",
    "tags.mlflow.parentRunId": "ParentRun",
}
summary_columns = ["Method", "F1 Score", "Exact Match", "Latency (tok/s)", "Peak VRAM (GB)", "Model Size (GB)"]

if experiment is None:
    print(f"Experiment '{experiment_name}' not found. Please run the main.py pipeline first.")
    # Rebind df to an empty frame so a stale df left in the kernel by an earlier
    # execution is never plotted; the `not df.empty` guards downstream skip it.
    df = pd.DataFrame(columns=summary_columns)
else:
    # Fetch every run of the experiment; the mask below narrows the selection.
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

    # A tags.*/metrics.* column may be absent when no run logged that key (or when
    # the experiment has no runs at all). Add the missing ones as NaN so the
    # filtering and column selection below cannot raise KeyError.
    missing_columns = [col for col in column_labels if col not in runs.columns]
    if missing_columns:
        runs = runs.assign(**{col: float("nan") for col in missing_columns})

    # Keep nested runs plus any run whose name is one of the known methods.
    is_nested = runs["tags.mlflow.parentRunId"].notna()
    is_known_method = runs["tags.mlflow.runName"].isin(method_run_names)
    df = runs[is_nested | is_known_method].copy()

    # Clean column names for easier access
    df = df.rename(columns=column_labels)

    if df.empty:
        print(f"Experiment '{experiment_name}' has no matching runs to analyze yet.")

    display(df[summary_columns].head())

Experiment 'LLM_OptiBench' not found. Please run the main.py pipeline first.


  return FileStore(store_uri, store_uri)


## 2. Model Performance (F1 & Exact Match)
Comparing the baseline model against the optimized versions to see how much accuracy is lost.

In [None]:
# Grouped bar chart of the two accuracy metrics per method; skipped entirely
# when the data-fetch cell did not produce a non-empty df.
if 'df' in locals() and not df.empty:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Long format: one row per (Method, Metric) pair so seaborn can group by hue.
    melted_df = pd.melt(
        df,
        id_vars=["Method"],
        value_vars=["F1 Score", "Exact Match"],
        var_name="Metric",
        value_name="Score",
    )

    seaborn.barplot(
        data=melted_df,
        x="Method",
        y="Score",
        hue="Metric",
        palette="viridis",
        ax=ax,
    )
    ax.set_title("Accuracy Metrics by Optimization Method")
    ax.set_ylabel("Score (Percentage)")
    ax.set_xlabel("")
    plt.xticks(rotation=45)

    # Print the numeric value above every bar of every hue group.
    for bar_group in ax.containers:
        ax.bar_label(bar_group, fmt='%.1f', padding=3)

    plt.tight_layout()
    plt.show()

## 3. Resource Usage (Peak VRAM & Latency)
Visualizing the computational benefits of each technique.

In [None]:
# Side-by-side bar charts of peak VRAM and latency per method; skipped when the
# data-fetch cell did not produce a non-empty df.
if 'df' in locals() and not df.empty:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # One entry per panel: (axes, df column, palette, title, bar-label format).
    # The latency label uses "tok/s" so it matches the unit in the column name.
    # NOTE(review): "Latency ... Higher is Better" only makes sense if the metric
    # is really a tokens-per-second throughput — confirm against main.py.
    panels = [
        (ax1, "Peak VRAM (GB)", "magma", "Peak VRAM Usage (Lower is Better)", '%.2f GB'),
        (ax2, "Latency (tok/s)", "crest", "Inference Latency (Higher is Better)", '%.1f tok/s'),
    ]

    for panel_ax, column, palette, title, label_fmt in panels:
        seaborn.barplot(data=df, x="Method", y=column, ax=panel_ax, palette=palette)
        panel_ax.set_title(title)
        # Rotate via tick_params rather than re-setting the tick label text.
        panel_ax.tick_params(axis="x", labelrotation=45)
        for bar_group in panel_ax.containers:
            panel_ax.bar_label(bar_group, fmt=label_fmt, padding=3)

    plt.tight_layout()
    plt.show()

## 4. Trade-off Analysis (Scatter Plot)
The ultimate goal is finding the Pareto frontier. We want high F1, low VRAM, and high generation speed (the `Latency (tok/s)` column is labelled in tokens per second, so higher is better).

In [None]:
# Bubble chart of F1 score against latency, with bubble size encoding peak VRAM;
# skipped when the data-fetch cell did not produce a non-empty df.
if 'df' in locals() and not df.empty:
    plt.figure(figsize=(10, 7))

    # Scatter Plot: X=Latency, Y=F1 Score, Size=VRAM
    scatter = seaborn.scatterplot(
        data=df,
        x="Latency (tok/s)",
        y="F1 Score",
        hue="Method",
        size="Peak VRAM (GB)",
        sizes=(100, 1000),
        alpha=0.7,
        palette="Set1",
    )

    plt.title("Trade-off: F1 Score vs Latency (Bubble Size = VRAM)")

    # Park the legend to the right of the axes so it does not cover the bubbles.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Label each point that has both coordinates; rows missing either are skipped.
    plottable = df.dropna(subset=["F1 Score", "Latency (tok/s)"])
    for method, latency, f1 in zip(
        plottable["Method"], plottable["Latency (tok/s)"], plottable["F1 Score"]
    ):
        plt.annotate(
            method,
            (latency, f1),
            xytext=(10, -10),
            textcoords='offset points',
        )

    plt.grid(True)
    plt.tight_layout()
    plt.show()