In [17]:
from pathlib import Path

import pandas as pd
from argparse import Namespace
import matplotlib.pyplot as plt

# Collect the experiment outputs found directly under root_dir:
# every '*.latency.csv' file and every '*.log' file, each list sorted by path.
# Alternative input directory (disabled):
# root_dir = Path("fig11-abalation-log")
root_dir = Path("result")
latency_file_paths = sorted(root_dir.glob("*.latency.csv"))
experiment_log_paths = sorted(root_dir.glob("*.log"))
# NOTE(review): `columns` is not referenced by any cell visible in this notebook.
columns = ['backend', 'rate', 'target', 'attainment', 'latency']

In [18]:
# Load every run: one evaluated experiment-argument object and one latency table per file pair.
dfs = []
namespaces = []
# NOTE(review): the two path lists are globbed and sorted independently and are paired
# purely by position; zip() stops at the shorter list, so a missing or extra file would
# silently drop or misalign runs -- confirm the directory always holds matching pairs.
for latency_file_path, experiment_log_path in zip(latency_file_paths, experiment_log_paths):
    # Read the whole log file and evaluate its text as a Python expression.
    # NOTE(review): eval() executes arbitrary code from the file; only run this on
    # trusted logs. Presumably the log holds a `Namespace(...)` repr (hence the
    # argparse.Namespace import at the top) -- confirm.
    with open(experiment_log_path, 'r') as f:
        exp_args = f.read()
        exp_args = eval(exp_args)
        namespaces.append(exp_args)

    # Each latency CSV becomes one DataFrame; the list is concatenated in the next cell.
    df = pd.read_csv(latency_file_path)
    dfs.append(df)

In [19]:
# Stack all per-run latency tables into one frame with a fresh 0..n-1 index.
big_df = pd.concat(dfs, ignore_index=True)

# GPUs used by a configuration: tp * pp of the prefill side plus tp * pp of the decode side.
# Rows whose decode columns are both 0 (as the vllm rows in the output below) therefore
# count only the prefill product.
big_df['ngpu'] = big_df['tp_prefill'] * big_df['pp_prefill'] + big_df['tp_decode'] * big_df['pp_decode']
big_df['per_gpu_rate'] = big_df['rate'] / big_df['ngpu']

# goodput@90: the per-GPU rate when attainment is at least 90, otherwise 0.
# Computed column-wise instead of with a row-by-row apply(); the result is the same
# rate / ngpu value and the column is always float, even when no row reaches 90.
big_df['goodput@90'] = big_df['per_gpu_rate'].where(big_df['attainment'] >= 90, 0.0)

In [20]:
# Display the combined frame, including the derived ngpu / per_gpu_rate / goodput@90 columns.
big_df

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90
0,distserve,OPT-13B,prefill,10.0,200.0,31.9,1,1,1,1,2,5.0,0.0
1,distserve,OPT-13B,decode,10.0,100.0,31.6,1,1,1,1,2,5.0,0.0
2,distserve,OPT-13B,both,10.0,"(200.0, 100.0)",10.3,1,1,1,1,2,5.0,0.0
3,distserve,OPT-13B,prefill,10.0,80.0,9.4,1,1,1,1,2,5.0,0.0
4,distserve,OPT-13B,decode,10.0,40.0,5.5,1,1,1,1,2,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19370,vllm,OPT-13B,decode,96.0,281.2603233009907,95.0,4,8,0,0,32,3.0,3.0
19371,vllm,OPT-13B,prefill,96.0,289.6116463515349,98.0,4,8,0,0,32,3.0,3.0
19372,vllm,OPT-13B,decode,96.0,311.0419295211507,98.0,4,8,0,0,32,3.0,3.0
19373,vllm,OPT-13B,prefill,96.0,309.9117983314648,99.0,4,8,0,0,32,3.0,3.0


In [21]:
# Limits used by the placement check below: the largest common pipeline factor that is
# tried, and the number of GPUs one segment may occupy.
max_machine = 4
max_gpu_per_node = 8


def can_fit_low_affinity_distserve(x):
    """Return True if some common pipeline split lets one segment fit inside a node.

    x: mapping/row providing 'tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode'.

    For every candidate factor pp_common in 1..max_machine, both pipeline degrees are
    divided by pp_common. Candidates that leave either quotient non-integral are
    ignored. A candidate succeeds when
    tp_prefill * (pp_prefill / pp_common) + tp_decode * (pp_decode / pp_common)
    does not exceed max_gpu_per_node. Returns False when no candidate succeeds.
    """
    prefill_tp, prefill_pp = x['tp_prefill'], x['pp_prefill']
    decode_tp, decode_pp = x['tp_decode'], x['pp_decode']

    def segment_fits(pp_common):
        prefill_stages = prefill_pp / pp_common
        decode_stages = decode_pp / pp_common
        # Both pipeline degrees must split evenly by the common factor.
        splits_evenly = int(prefill_stages) == prefill_stages and int(decode_stages) == decode_stages
        # GPUs needed by one segment must fit inside a single node.
        return splits_evenly and prefill_tp * prefill_stages + decode_tp * decode_stages <= max_gpu_per_node

    return any(segment_fits(pp_common) for pp_common in range(1, max_machine + 1))


def can_fit_low_affinity(x):
    """Row-level placement flag.

    x: mapping/row providing 'backend' (and, for distserve rows, the four
    parallelism columns read by can_fit_low_affinity_distserve).

    Rows whose backend is not 'distserve' are always True; distserve rows get the
    result of can_fit_low_affinity_distserve.
    """
    if x['backend'] != 'distserve':
        return True
    return can_fit_low_affinity_distserve(x)


# Add a boolean 'low_affin' column by evaluating can_fit_low_affinity on every row.
big_df['low_affin'] = big_df.apply(can_fit_low_affinity, axis=1)

In [22]:
# Display a sorted view for inspection; the result is not assigned, so big_df keeps its row order.
big_df.sort_values(by=['backend', 'per_gpu_rate', 'tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode'])

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
4375,distserve,OPT-13B,prefill,2.0,200.0,91.6,1,1,1,1,2,1.0,1.0,True
4376,distserve,OPT-13B,decode,2.0,100.0,100.0,1,1,1,1,2,1.0,1.0,True
4377,distserve,OPT-13B,both,2.0,"(200.0, 100.0)",91.6,1,1,1,1,2,1.0,1.0,True
4378,distserve,OPT-13B,prefill,2.0,80.0,40.9,1,1,1,1,2,1.0,0.0,True
4379,distserve,OPT-13B,decode,2.0,40.0,99.8,1,1,1,1,2,1.0,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18270,vllm,OPT-13B,decode,160.0,650.4200146543143,95.0,4,8,0,0,32,5.0,5.0,True
18271,vllm,OPT-13B,prefill,160.0,417.6207726276993,98.0,4,8,0,0,32,5.0,5.0,True
18272,vllm,OPT-13B,decode,160.0,821.0808666193252,98.0,4,8,0,0,32,5.0,5.0,True
18273,vllm,OPT-13B,prefill,160.0,428.86314445900445,99.0,4,8,0,0,32,5.0,5.0,True


In [23]:
# Keep only rows where both SLOs are judged together (pd == 'both') at the chosen target.
# The 'target' column appears to store the pair as text for these rows, hence the string.
target = '(200.0, 100.0)'
figure_11_left_df = (
    big_df[(big_df['pd'] == 'both') & (big_df['target'] == target)]
    .copy()
    .sort_values(by=[
        'backend', 'tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode',
        'rate'
    ])
)

# Split the selection into the four variants. No "best" configuration is picked
# here; these are plain row filters.
is_distserve = figure_11_left_df['backend'] == 'distserve'
is_vllm = figure_11_left_df['backend'] == 'vllm'

# DistServe, every configuration.
figure_11_distserve_high = figure_11_left_df[is_distserve]
# DistServe, only configurations flagged low_affin.
figure_11_distserve_low = figure_11_left_df[is_distserve & (figure_11_left_df['low_affin'])]
# vLLM, every configuration.
figure_11_vllm_high = figure_11_left_df[is_vllm]
# vLLM, only configurations with pp_prefill == 1.
figure_11_vllm_low = figure_11_left_df[is_vllm & (figure_11_left_df['pp_prefill'] == 1)]


In [24]:
def find_best_config(df):
    """Return, per parallelism configuration, the row with the highest 'rate' among
    rows whose 'goodput@90' is positive.

    df: DataFrame with 'tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode', 'rate'
        and 'goodput@90' columns.

    Returns the matching rows of `df` (all columns), looked up by index label, one
    per configuration that has at least one row with goodput@90 > 0.
    """
    config_keys = ['tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode']

    # Only rows that reached a positive goodput compete for "best".
    qualifying = df.loc[df['goodput@90'] > 0]

    # Index label of the highest-rate qualifying row in each configuration.
    best_labels = qualifying.groupby(config_keys)['rate'].idxmax()

    # Pull the full rows back out of the unfiltered frame.
    return df.loc[best_labels]


In [25]:
# Display every DistServe row of the selected target (no placement filter applied).
figure_11_distserve_high

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
4377,distserve,OPT-13B,both,2.0,"(200.0, 100.0)",91.6,1,1,1,1,2,1.0,1.0,True
9677,distserve,OPT-13B,both,4.0,"(200.0, 100.0)",80.5,1,1,1,1,2,2.0,0.0,True
13052,distserve,OPT-13B,both,6.0,"(200.0, 100.0)",68.3,1,1,1,1,2,3.0,0.0,True
15627,distserve,OPT-13B,both,8.0,"(200.0, 100.0)",23.5,1,1,1,1,2,4.0,0.0,True
2,distserve,OPT-13B,both,10.0,"(200.0, 100.0)",10.3,1,1,1,1,2,5.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8577,distserve,OPT-13B,both,32.0,"(200.0, 100.0)",100.0,4,4,8,2,32,1.0,1.0,False
14802,distserve,OPT-13B,both,64.0,"(200.0, 100.0)",95.1,4,4,8,2,32,2.0,2.0,False
17852,distserve,OPT-13B,both,96.0,"(200.0, 100.0)",53.7,4,4,8,2,32,3.0,0.0,False
2477,distserve,OPT-13B,both,128.0,"(200.0, 100.0)",4.3,4,4,8,2,32,4.0,0.0,False


In [26]:
# Display the DistServe rows restricted to configurations where low_affin is True.
figure_11_distserve_low

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
4377,distserve,OPT-13B,both,2.0,"(200.0, 100.0)",91.6,1,1,1,1,2,1.0,1.0,True
9677,distserve,OPT-13B,both,4.0,"(200.0, 100.0)",80.5,1,1,1,1,2,2.0,0.0,True
13052,distserve,OPT-13B,both,6.0,"(200.0, 100.0)",68.3,1,1,1,1,2,3.0,0.0,True
15627,distserve,OPT-13B,both,8.0,"(200.0, 100.0)",23.5,1,1,1,1,2,4.0,0.0,True
2,distserve,OPT-13B,both,10.0,"(200.0, 100.0)",10.3,1,1,1,1,2,5.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,distserve,OPT-13B,both,32.0,"(200.0, 100.0)",100.0,4,4,4,4,32,1.0,1.0,True
14777,distserve,OPT-13B,both,64.0,"(200.0, 100.0)",95.1,4,4,4,4,32,2.0,2.0,True
17802,distserve,OPT-13B,both,96.0,"(200.0, 100.0)",53.7,4,4,4,4,32,3.0,0.0,True
2452,distserve,OPT-13B,both,128.0,"(200.0, 100.0)",4.3,4,4,4,4,32,4.0,0.0,True


In [27]:
# Display every vLLM row of the selected target.
figure_11_vllm_high

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
17877,vllm,OPT-13B,both,1.0,"(200.0, 100.0)",91.3,1,1,0,0,1,1.0,1.0,True
18277,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",86.0,1,1,0,0,1,2.0,0.0,True
18502,vllm,OPT-13B,both,3.0,"(200.0, 100.0)",79.3,1,1,0,0,1,3.0,0.0,True
18677,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",25.9,1,1,0,0,1,4.0,0.0,True
18952,vllm,OPT-13B,both,5.0,"(200.0, 100.0)",9.4,1,1,0,0,1,5.0,0.0,True
18302,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",92.7,1,2,0,0,2,1.0,1.0,True
18702,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",86.2,1,2,0,0,2,2.0,0.0,True
18977,vllm,OPT-13B,both,6.0,"(200.0, 100.0)",18.3,1,2,0,0,2,3.0,0.0,True
19102,vllm,OPT-13B,both,8.0,"(200.0, 100.0)",8.1,1,2,0,0,2,4.0,0.0,True
17902,vllm,OPT-13B,both,10.0,"(200.0, 100.0)",3.6,1,2,0,0,2,5.0,0.0,True


In [28]:
# Display the vLLM rows restricted to pp_prefill == 1.
figure_11_vllm_low

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
17877,vllm,OPT-13B,both,1.0,"(200.0, 100.0)",91.3,1,1,0,0,1,1.0,1.0,True
18277,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",86.0,1,1,0,0,1,2.0,0.0,True
18502,vllm,OPT-13B,both,3.0,"(200.0, 100.0)",79.3,1,1,0,0,1,3.0,0.0,True
18677,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",25.9,1,1,0,0,1,4.0,0.0,True
18952,vllm,OPT-13B,both,5.0,"(200.0, 100.0)",9.4,1,1,0,0,1,5.0,0.0,True
18327,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",99.0,2,1,0,0,2,1.0,1.0,True
18752,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",95.8,2,1,0,0,2,2.0,2.0,True
19002,vllm,OPT-13B,both,6.0,"(200.0, 100.0)",87.2,2,1,0,0,2,3.0,0.0,True
19177,vllm,OPT-13B,both,8.0,"(200.0, 100.0)",25.8,2,1,0,0,2,4.0,0.0,True
17927,vllm,OPT-13B,both,10.0,"(200.0, 100.0)",11.3,2,1,0,0,2,5.0,0.0,True


In [29]:
# Attainment curves for every DistServe configuration in `figure_11_distserve_high`.
# One trace per distinct (tp_prefill, pp_prefill, tp_decode, pp_decode) combination;
# x-axis: per_gpu_rate, y-axis: attainment.
import plotly.graph_objects as go

fig = go.Figure()
configs = figure_11_distserve_high[['tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode']].drop_duplicates()
df = figure_11_distserve_high

for tp_prefill, pp_prefill, tp_decode, pp_decode in configs.values:
    # Rows of this configuration; the frame was sorted by rate within each configuration.
    config_df = df[
        (df['tp_prefill'] == tp_prefill) & (df['pp_prefill'] == pp_prefill) &
        (df['tp_decode'] == tp_decode) & (df['pp_decode'] == pp_decode)
        ]
    # Legend entry concatenates the four values without a separator.
    fig.add_trace(go.Scatter(
        x=config_df['per_gpu_rate'], y=config_df['attainment'],
        mode='lines+markers', name=f"p{tp_prefill}{pp_prefill}{tp_decode}{pp_decode}-distserve"
    ))

# NOTE(review): the x-axis label says tokens/s while the plotted column is
# per_gpu_rate (rate / ngpu) -- confirm the unit.
fig.update_layout(
    title="DistServe",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)

fig.show()
# Export to html. write_html does not create missing parent directories, so make
# sure the output folder exists first.
Path("visual").mkdir(parents=True, exist_ok=True)
fig.write_html("visual/figure_11_distserve_high.html")

In [30]:
# Attainment curves for every vLLM configuration in `figure_11_vllm_high`.
# One trace per distinct (tp_prefill, pp_prefill) combination;
# x-axis: per_gpu_rate, y-axis: attainment.
import plotly.graph_objects as go

fig = go.Figure()
configs = figure_11_vllm_high[['tp_prefill', 'pp_prefill']].drop_duplicates()
df = figure_11_vllm_high

for tp_prefill, pp_prefill in configs.values:
    # Rows of this configuration; the frame was sorted by rate within each configuration.
    config_df = df[
        (df['tp_prefill'] == tp_prefill) & (df['pp_prefill'] == pp_prefill)
        ]
    # Legend entry concatenates the two values without a separator.
    fig.add_trace(go.Scatter(
        x=config_df['per_gpu_rate'], y=config_df['attainment'],
        mode='lines+markers', name=f"p{tp_prefill}{pp_prefill}-vllm"
    ))

# NOTE(review): the x-axis label says tokens/s while the plotted column is
# per_gpu_rate (rate / ngpu) -- confirm the unit.
fig.update_layout(
    title="vLLM++",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)
fig.show()
# Export to html. write_html does not create missing parent directories, so make
# sure the output folder exists first.
Path("visual").mkdir(parents=True, exist_ok=True)
fig.write_html("visual/figure_11_vllm_high.html")

In [31]:
import plotly.graph_objects as go


def _add_attainment_traces(figure, frame, key_columns, system):
    """Add one attainment-vs-per_gpu_rate trace to `figure` per distinct configuration.

    figure: plotly Figure that receives the traces.
    frame: DataFrame holding `key_columns`, 'per_gpu_rate' and 'attainment'.
    key_columns: columns whose distinct value combinations define the configurations.
    system: suffix appended to every legend entry (e.g. "distserve").

    Traces are added in order of first appearance of each configuration in `frame`.
    """
    for key in frame[key_columns].drop_duplicates().values:
        # Select the rows whose key columns all equal this configuration.
        mask = pd.Series(True, index=frame.index)
        for column, value in zip(key_columns, key):
            mask &= frame[column] == value
        config_df = frame[mask]
        # Legend entry concatenates the key values without a separator.
        figure.add_trace(go.Scatter(
            x=config_df['per_gpu_rate'], y=config_df['attainment'],
            mode='lines+markers',
            name=f"p{''.join(str(value) for value in key)}-{system}",
        ))


fig = go.Figure()

# All DistServe configurations, then all vLLM configurations, in a single figure.
# x-axis: per_gpu_rate, y-axis: attainment.
_add_attainment_traces(
    fig, figure_11_distserve_high,
    ['tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode'], "distserve",
)
_add_attainment_traces(
    fig, figure_11_vllm_high,
    ['tp_prefill', 'pp_prefill'], "vllm",
)

# NOTE(review): "Abalation" in the title looks like a typo for "Ablation"; the string
# is left unchanged here. The x-axis label says tokens/s while the plotted column is
# per_gpu_rate (rate / ngpu) -- confirm the unit.
fig.update_layout(
    title="Figure 11: Abalation Study (DistServe and vLLM)",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)
fig.show()
# write_html does not create missing parent directories, so make sure the output
# folder exists first.
Path("visual").mkdir(parents=True, exist_ok=True)
fig.write_html("visual/figure_11.full.html")

In [43]:
import plotly.graph_objects as go


def _select_config(frame, config):
    """Return the rows of `frame` whose columns equal every value in `config`.

    frame: DataFrame containing each column named in `config`.
    config: dict mapping column name -> required value.
    """
    mask = pd.Series(True, index=frame.index)
    for column, value in config.items():
        mask &= frame[column] == value
    return frame[mask]


# One hand-picked configuration per system: (legend prefix, source frame, configuration).
# The vLLM++ and vLLM cases pick the same configuration; since figure_11_vllm_low is
# figure_11_vllm_high restricted to pp_prefill == 1, those two traces coincide.
cases = [
    # Case 1: DistServe-High
    ("disthigh", figure_11_distserve_high,
     {'tp_prefill': 4, 'pp_prefill': 2, 'tp_decode': 1, 'pp_decode': 1}),
    # Case 2: DistServe-Low
    ("distlow", figure_11_distserve_low,
     {'tp_prefill': 4, 'pp_prefill': 1, 'tp_decode': 1, 'pp_decode': 1}),
    # Case 3: vLLM++
    ("vllm++", figure_11_vllm_high,
     {'tp_prefill': 4, 'pp_prefill': 1}),
    # Case 4: vLLM
    ("vllm", figure_11_vllm_low,
     {'tp_prefill': 4, 'pp_prefill': 1}),
]

fig = go.Figure()
for label, frame, config in cases:
    config_df = _select_config(frame, config)
    # Legend entry: prefix, then the configuration values concatenated without a separator.
    fig.add_trace(go.Scatter(
        x=config_df['per_gpu_rate'], y=config_df['attainment'],
        mode='lines+markers',
        name=f"{label}-p{''.join(str(value) for value in config.values())}",
    ))


# NOTE(review): "Abalation" in the title looks like a typo for "Ablation"; the string
# is left unchanged here. The x-axis label says tokens/s while the plotted column is
# per_gpu_rate (rate / ngpu) -- confirm the unit.
fig.update_layout(
    title="Figure 11: Abalation Study (DistServe and vLLM)<br>"
          "<sup>The figure shows that DistHigh > DistLow > vLLM++ > vLLM (vLLM++ and vLLM overlaps) </sup>",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)
fig.show()
# write_html does not create missing parent directories, so make sure the output
# folder exists first.
Path("visual").mkdir(parents=True, exist_ok=True)
fig.write_html("visual/figure_11.html")