In [54]:
from pathlib import Path

import pandas as pd
from argparse import Namespace
import matplotlib.pyplot as plt

# Directory that holds the benchmark outputs.
# Alternative run directory: Path("fig11-abalation-log")
root_dir = Path("result")

# Both listings are sorted so that the i-th latency file and the i-th log can be
# paired by position later on.
latency_file_paths = sorted(root_dir.glob("*.latency.csv"))
experiment_log_paths = sorted(root_dir.glob("*.log"))

columns = ['backend', 'rate', 'target', 'attainment', 'latency']

In [55]:
import warnings

dfs = []
namespaces = []

# The two listings are paired purely by position, and zip() stops at the shorter
# one without complaint. Surface a mismatch instead of silently dropping files.
if len(latency_file_paths) != len(experiment_log_paths):
    warnings.warn(
        f"Found {len(latency_file_paths)} latency files but "
        f"{len(experiment_log_paths)} log files; unpaired files will be ignored."
    )

for latency_file_path, experiment_log_path in zip(latency_file_paths, experiment_log_paths):
    # Each log file is read as a Python expression and evaluated to recover the
    # experiment arguments.
    # SECURITY: eval() executes whatever the file contains. Only run this on log
    # files you produced yourself; never point root_dir at untrusted data.
    with open(experiment_log_path, 'r') as f:
        exp_args = eval(f.read())
    namespaces.append(exp_args)

    # One DataFrame per latency CSV; they are concatenated in a later cell.
    dfs.append(pd.read_csv(latency_file_path))

In [56]:
# pd.concat raises a terse ValueError on an empty list; fail with a message that
# points at the actual cause (nothing was loaded).
if not dfs:
    raise ValueError("No latency CSV files were loaded; cannot build big_df.")

big_df = pd.concat(dfs, ignore_index=True)

# ngpu = tp * pp on the prefill side plus tp * pp on the decode side.
big_df['ngpu'] = big_df['tp_prefill'] * big_df['pp_prefill'] + big_df['tp_decode'] * big_df['pp_decode']
big_df['per_gpu_rate'] = big_df['rate'] / big_df['ngpu']

# goodput@90 is the per-GPU rate for rows whose attainment reaches 90, and 0 for
# every other row. Computed column-wise instead of with a row-by-row apply.
big_df['goodput@90'] = big_df['per_gpu_rate'].where(big_df['attainment'] >= 90, 0)

In [57]:
# Inspect the combined frame, including the derived ngpu, per_gpu_rate and goodput@90 columns.
big_df

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90
0,distserve,OPT-13B,prefill,10.0,200.0,31.0,1,1,1,1,2,5.0,0.0
1,distserve,OPT-13B,decode,10.0,100.0,32.7,1,1,1,1,2,5.0,0.0
2,distserve,OPT-13B,both,10.0,"(200.0, 100.0)",9.9,1,1,1,1,2,5.0,0.0
3,distserve,OPT-13B,prefill,10.0,80.0,10.2,1,1,1,1,2,5.0,0.0
4,distserve,OPT-13B,decode,10.0,40.0,4.2,1,1,1,1,2,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5245,vllm,OPT-13B,decode,8.0,23.63819926962662,95.0,4,1,0,0,4,2.0,2.0
5246,vllm,OPT-13B,prefill,8.0,155.3716944415064,98.0,4,1,0,0,4,2.0,2.0
5247,vllm,OPT-13B,decode,8.0,28.21016847928571,98.0,4,1,0,0,4,2.0,2.0
5248,vllm,OPT-13B,prefill,8.0,187.19097293514704,99.0,4,1,0,0,4,2.0,2.0


In [58]:
# Limits used by the placement check below.
max_machine = 4
max_gpu_per_node = 8


def can_fit_low_affinity_distserve(x):
    """Return True if some shared pipeline split lets the config fit in one node.

    `x` is a mapping-like row providing 'tp_prefill', 'pp_prefill', 'tp_decode'
    and 'pp_decode'. Every candidate split factor from 1 to `max_machine` is
    tried: both pipeline depths must divide evenly by the factor, and the
    resulting segment (tp_prefill * prefill stages + tp_decode * decode stages)
    must need at most `max_gpu_per_node` GPUs.
    """
    tp_pre, pp_pre, tp_dec, pp_dec = (
        x[key] for key in ('tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode')
    )
    for split in range(1, max_machine + 1):
        pre_stages = pp_pre / split
        dec_stages = pp_dec / split
        # Only splits that divide both pipeline depths evenly are candidates.
        divides_evenly = int(pre_stages) == pre_stages and int(dec_stages) == dec_stages
        # The segment produced by this split has to fit inside a single node.
        if divides_evenly and tp_pre * pre_stages + tp_dec * dec_stages <= max_gpu_per_node:
            return True
    return False


def can_fit_low_affinity(x):
    """Return whether row `x` passes the low-affinity placement check.

    Only rows whose 'backend' is 'distserve' are actually checked (via
    can_fit_low_affinity_distserve); every other backend is accepted as-is.
    """
    if x['backend'] != 'distserve':
        return True
    return can_fit_low_affinity_distserve(x)


# Run the placement check once per row (axis=1) and store the result in a new 'low_affin' column.
big_df['low_affin'] = big_df.apply(can_fit_low_affinity, axis=1)

In [59]:
# Display-only view ordered by backend, per-GPU rate and parallelism config.
# sort_values returns a new frame that is not assigned here, so big_df keeps its original row order.
big_df.sort_values(by=['backend', 'per_gpu_rate', 'tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode'])

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
1400,distserve,OPT-13B,prefill,2.0,200.0,91.1,1,1,1,1,2,1.0,1.0,True
1401,distserve,OPT-13B,decode,2.0,100.0,100.0,1,1,1,1,2,1.0,1.0,True
1402,distserve,OPT-13B,both,2.0,"(200.0, 100.0)",91.1,1,1,1,1,2,1.0,1.0,True
1403,distserve,OPT-13B,prefill,2.0,80.0,40.7,1,1,1,1,2,1.0,0.0,True
1404,distserve,OPT-13B,decode,2.0,40.0,99.4,1,1,1,1,2,1.0,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4820,vllm,OPT-13B,decode,20.0,63.2665077671464,95.0,4,1,0,0,4,5.0,5.0,True
4821,vllm,OPT-13B,prefill,20.0,301.0933323083602,98.0,4,1,0,0,4,5.0,5.0,True
4822,vllm,OPT-13B,decode,20.0,73.99361755436871,98.0,4,1,0,0,4,5.0,5.0,True
4823,vllm,OPT-13B,prefill,20.0,385.84969787297405,99.0,4,1,0,0,4,5.0,5.0,True


In [60]:
# Target to report on; the 'target' column is compared against this string form.
target = '(200.0, 100.0)'

# Keep only the rows with pd == 'both' for that target, ordered so that each
# configuration's rows sit together in ascending rate.
_sort_keys = ['backend', 'tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode', 'rate']
_is_selected = (big_df['pd'] == 'both') & (big_df['target'] == target)
figure_11_left_df = big_df.loc[_is_selected].copy().sort_values(by=_sort_keys)

_is_distserve = figure_11_left_df['backend'] == 'distserve'
_is_vllm = figure_11_left_df['backend'] == 'vllm'

# Four subsets: each backend unrestricted ("high") and restricted ("low").
# DistServe "low" keeps rows flagged low_affin; vLLM "low" keeps pp_prefill == 1.
figure_11_distserve_high = figure_11_left_df.loc[_is_distserve]
figure_11_distserve_low = figure_11_left_df.loc[_is_distserve & figure_11_left_df['low_affin']]
figure_11_vllm_high = figure_11_left_df.loc[_is_vllm]
figure_11_vllm_low = figure_11_left_df.loc[_is_vllm & (figure_11_left_df['pp_prefill'] == 1)]


In [61]:
def find_best_config(df):
    """Return, per parallelism config, the highest-rate row with positive goodput@90.

    Rows are grouped by (tp_prefill, pp_prefill, tp_decode, pp_decode). Within
    each group only rows with goodput@90 > 0 are considered, and the one with the
    largest 'rate' is selected. The result holds the full rows, looked up in `df`
    by index label.
    """
    config_columns = ['tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode']

    # Discard rows with zero goodput before ranking.
    passing_rows = df.loc[df['goodput@90'] > 0]

    # idxmax yields the index label of the top-rate row in every group.
    best_row_labels = passing_rows.groupby(config_columns)['rate'].idxmax()

    return df.loc[best_row_labels]


In [66]:
# All DistServe rows for the selected target, with no placement filter applied.
figure_11_distserve_high

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
1402,distserve,OPT-13B,both,2.0,"(200.0, 100.0)",91.1,1,1,1,1,2,1.0,1.0,True
2952,distserve,OPT-13B,both,4.0,"(200.0, 100.0)",79.7,1,1,1,1,2,2.0,0.0,True
3652,distserve,OPT-13B,both,6.0,"(200.0, 100.0)",64.1,1,1,1,1,2,3.0,0.0,True
4102,distserve,OPT-13B,both,8.0,"(200.0, 100.0)",27.4,1,1,1,1,2,4.0,0.0,True
2,distserve,OPT-13B,both,10.0,"(200.0, 100.0)",9.9,1,1,1,1,2,5.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,distserve,OPT-13B,both,8.0,"(200.0, 100.0)",99.8,4,1,4,1,8,1.0,1.0,True
1127,distserve,OPT-13B,both,16.0,"(200.0, 100.0)",99.6,4,1,4,1,8,2.0,2.0,True
2102,distserve,OPT-13B,both,24.0,"(200.0, 100.0)",80.1,4,1,4,1,8,3.0,0.0,True
2777,distserve,OPT-13B,both,32.0,"(200.0, 100.0)",45.2,4,1,4,1,8,4.0,0.0,True


In [67]:
# DistServe rows for the selected target whose low_affin flag is True.
figure_11_distserve_low

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
1402,distserve,OPT-13B,both,2.0,"(200.0, 100.0)",91.1,1,1,1,1,2,1.0,1.0,True
2952,distserve,OPT-13B,both,4.0,"(200.0, 100.0)",79.7,1,1,1,1,2,2.0,0.0,True
3652,distserve,OPT-13B,both,6.0,"(200.0, 100.0)",64.1,1,1,1,1,2,3.0,0.0,True
4102,distserve,OPT-13B,both,8.0,"(200.0, 100.0)",27.4,1,1,1,1,2,4.0,0.0,True
2,distserve,OPT-13B,both,10.0,"(200.0, 100.0)",9.9,1,1,1,1,2,5.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,distserve,OPT-13B,both,8.0,"(200.0, 100.0)",99.8,4,1,4,1,8,1.0,1.0,True
1127,distserve,OPT-13B,both,16.0,"(200.0, 100.0)",99.6,4,1,4,1,8,2.0,2.0,True
2102,distserve,OPT-13B,both,24.0,"(200.0, 100.0)",80.1,4,1,4,1,8,3.0,0.0,True
2777,distserve,OPT-13B,both,32.0,"(200.0, 100.0)",45.2,4,1,4,1,8,4.0,0.0,True


In [68]:
# All vLLM rows for the selected target.
figure_11_vllm_high

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
4502,vllm,OPT-13B,both,1.0,"(200.0, 100.0)",92.3,1,1,0,0,1,1.0,1.0,True
4702,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",84.3,1,1,0,0,1,2.0,0.0,True
4852,vllm,OPT-13B,both,3.0,"(200.0, 100.0)",78.1,1,1,0,0,1,3.0,0.0,True
4902,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",70.8,1,1,0,0,1,4.0,0.0,True
5052,vllm,OPT-13B,both,5.0,"(200.0, 100.0)",58.1,1,1,0,0,1,5.0,0.0,True
4727,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",89.9,1,2,0,0,2,1.0,0.0,True
4927,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",85.6,1,2,0,0,2,2.0,0.0,True
5077,vllm,OPT-13B,both,6.0,"(200.0, 100.0)",72.0,1,2,0,0,2,3.0,0.0,True
5127,vllm,OPT-13B,both,8.0,"(200.0, 100.0)",36.1,1,2,0,0,2,4.0,0.0,True
4527,vllm,OPT-13B,both,10.0,"(200.0, 100.0)",12.9,1,2,0,0,2,5.0,0.0,True


In [69]:
# vLLM rows for the selected target restricted to pp_prefill == 1.
figure_11_vllm_low

Unnamed: 0,backend,model_type,pd,rate,target,attainment,tp_prefill,pp_prefill,tp_decode,pp_decode,ngpu,per_gpu_rate,goodput@90,low_affin
4502,vllm,OPT-13B,both,1.0,"(200.0, 100.0)",92.3,1,1,0,0,1,1.0,1.0,True
4702,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",84.3,1,1,0,0,1,2.0,0.0,True
4852,vllm,OPT-13B,both,3.0,"(200.0, 100.0)",78.1,1,1,0,0,1,3.0,0.0,True
4902,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",70.8,1,1,0,0,1,4.0,0.0,True
5052,vllm,OPT-13B,both,5.0,"(200.0, 100.0)",58.1,1,1,0,0,1,5.0,0.0,True
4752,vllm,OPT-13B,both,2.0,"(200.0, 100.0)",98.4,2,1,0,0,2,1.0,1.0,True
4952,vllm,OPT-13B,both,4.0,"(200.0, 100.0)",96.1,2,1,0,0,2,2.0,2.0,True
5102,vllm,OPT-13B,both,6.0,"(200.0, 100.0)",88.8,2,1,0,0,2,3.0,0.0,True
5177,vllm,OPT-13B,both,8.0,"(200.0, 100.0)",84.6,2,1,0,0,2,4.0,0.0,True
4552,vllm,OPT-13B,both,10.0,"(200.0, 100.0)",63.4,2,1,0,0,2,5.0,0.0,True


In [80]:
# Plot `figure_11_distserve_high`: one line per distinct
# (tp_prefill, pp_prefill, tp_decode, pp_decode) combination.
# x-axis: per_gpu_rate
# y-axis: attainment
import plotly.graph_objects as go

fig = go.Figure()
configs = figure_11_distserve_high[['tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode']].drop_duplicates()
df = figure_11_distserve_high

for tp_prefill, pp_prefill, tp_decode, pp_decode in configs.values:
    # Rows belonging to this exact parallelism configuration.
    config_df = df[
        (df['tp_prefill'] == tp_prefill) & (df['pp_prefill'] == pp_prefill) &
        (df['tp_decode'] == tp_decode) & (df['pp_decode'] == pp_decode)
        ]
    fig.add_trace(go.Scatter(
        x=config_df['per_gpu_rate'], y=config_df['attainment'],
        mode='lines+markers', name=f"p{tp_prefill}{pp_prefill}{tp_decode}{pp_decode}-distserve"
    ))

# NOTE(review): the x-axis label states tokens/s — confirm this matches the unit of `rate`.
fig.update_layout(
    title="DistServe",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)

fig.show()

# Export to html. write_html does not create missing parent directories, so
# make sure the output folder exists before writing.
html_path = Path("visual/figure_11_distserve_high.html")
html_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(html_path))

In [81]:
# Plot `figure_11_vllm_high`: one line per distinct (tp_prefill, pp_prefill)
# combination.
# x-axis: per_gpu_rate
# y-axis: attainment
import plotly.graph_objects as go

fig = go.Figure()
configs = figure_11_vllm_high[['tp_prefill', 'pp_prefill']].drop_duplicates()
df = figure_11_vllm_high

for tp_prefill, pp_prefill in configs.values:
    # Rows belonging to this exact parallelism configuration.
    config_df = df[
        (df['tp_prefill'] == tp_prefill) & (df['pp_prefill'] == pp_prefill)
        ]
    fig.add_trace(go.Scatter(
        x=config_df['per_gpu_rate'], y=config_df['attainment'],
        mode='lines+markers', name=f"p{tp_prefill}{pp_prefill}-vllm"
    ))

# NOTE(review): the x-axis label states tokens/s — confirm this matches the unit of `rate`.
fig.update_layout(
    title="vLLM++",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)
fig.show()

# Export to html. write_html does not create missing parent directories, so
# make sure the output folder exists before writing.
html_path = Path("visual/figure_11_vllm_high.html")
html_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(html_path))

In [73]:
import plotly.graph_objects as go


def _add_attainment_trace(fig, frame, config, label):
    """Add one attainment-vs-per_gpu_rate line for a single parallelism config.

    `config` is matched positionally against tp_prefill, pp_prefill, tp_decode
    and pp_decode. A shorter tuple constrains only the leading columns, which is
    how the vLLM cases (tp_prefill, pp_prefill only) are expressed.
    The trace is named "<label>-p<config digits>".
    """
    config_columns = ('tp_prefill', 'pp_prefill', 'tp_decode', 'pp_decode')
    selected = frame
    for column, value in zip(config_columns, config):
        selected = selected[selected[column] == value]
    fig.add_trace(go.Scatter(
        x=selected['per_gpu_rate'], y=selected['attainment'],
        mode='lines+markers', name=f"{label}-p{''.join(str(v) for v in config)}"
    ))


fig = go.Figure()

# One entry per compared case: (trace label, source frame, parallelism config).
comparison_cases = [
    ("disthigh", figure_11_distserve_high, (2, 1, 1, 1)),  # Case 1: DistServe-High
    ("distlow", figure_11_distserve_low, (2, 1, 1, 1)),    # Case 2: DistServe-Low
    ("vllm++", figure_11_vllm_high, (1, 2)),               # Case 3: vLLM++
    ("vllm", figure_11_vllm_low, (1, 1)),                  # Case 4: vLLM
]
for label, frame, config in comparison_cases:
    _add_attainment_trace(fig, frame, config, label)

# Axis labels reuse the wording of the per-backend plots in this notebook.
# NOTE(review): the x-axis label states tokens/s — confirm this matches the unit of `rate`.
fig.update_layout(
    title="DistServe vs. vLLM",
    xaxis_title="Per-GPU Rate (tokens/s)",
    yaxis_title="Attainment (%)",
    legend_title="Configuration"
)

fig.show()