# Analyse results of benchmark

And because i love it, we will do it using SQL & duckdb, yeah!

In [1]:
import duckdb
import pandas as pd
import plotly.express as px
import seaborn

In [2]:
all_data = duckdb.sql("FROM read_json('./data/benchmark_results.json');")
all_data

┌─────────────┬─────────────────────────┬──────────────┬──────────────┐
│ num_players │        function         │  time_in_s   │ memory_in_mb │
│    int64    │         varchar         │    double    │    double    │
├─────────────┼─────────────────────────┼──────────────┼──────────────┤
│          23 │ pandas_flatten          │  0.006476833 │     0.117059 │
│          23 │ polars_flatten          │    0.0013925 │     0.041149 │
│          23 │ manual_flatten          │   0.00049575 │     0.020264 │
│          23 │ generator_flatten       │  0.000605417 │     0.045581 │
│          23 │ unpack_operator_flatten │   5.1459e-05 │     0.019464 │
│          23 │ flatdict_flatten        │  0.005918792 │     0.066513 │
│          23 │ dlt_flatten             │  0.332055292 │     3.881898 │
│          46 │ pandas_flatten          │  0.007431292 │      0.20119 │
│          46 │ polars_flatten          │  0.001662958 │     0.061917 │
│          46 │ manual_flatten          │  0.000952042 │     0.0

In [3]:
duckdb.sql(f"""
SELECT 
    function,
    round(avg(time_in_s),4) as avg_time_in_s,
    round(avg(memory_in_mb), 4) as avg_memory_in_mb,
    round(max(memory_in_mb), 4) as max_memory_in_mb
FROM read_json('data/benchmark_results.json')
GROUP BY ALL
ORDER BY max_memory_in_mb 
""")

┌─────────────────────────┬───────────────┬──────────────────┬──────────────────┐
│        function         │ avg_time_in_s │ avg_memory_in_mb │ max_memory_in_mb │
│         varchar         │    double     │      double      │      double      │
├─────────────────────────┼───────────────┼──────────────────┼──────────────────┤
│ dlt_flatten             │        2.6409 │          15.1458 │          77.7315 │
│ unpack_operator_flatten │        0.0349 │           15.565 │          84.0011 │
│ manual_flatten          │        0.4011 │          15.5658 │          84.0019 │
│ generator_flatten       │         0.374 │          15.5778 │          84.0122 │
│ polars_flatten          │        0.5101 │          23.8617 │          128.785 │
│ flatdict_flatten        │        4.6239 │           35.135 │         189.5238 │
│ pandas_flatten          │        2.4207 │          67.9431 │          366.539 │
└─────────────────────────┴───────────────┴──────────────────┴──────────────────┘

In [5]:
duckdb.sql("""
SELECT 
    function,
    round(avg(time_in_s),4) as avg_time_in_s,
    round(avg(memory_in_mb), 4) as avg_memory_in_mb,
    round(median(memory_in_mb), 4) as median_memory_in_mb
FROM read_json('data/benchmark_results.json')
GROUP BY 1
ORDER BY median_memory_in_mb 
""")

┌─────────────────────────┬───────────────┬──────────────────┬─────────────────────┐
│        function         │ avg_time_in_s │ avg_memory_in_mb │ median_memory_in_mb │
│         varchar         │    double     │      double      │       double        │
├─────────────────────────┼───────────────┼──────────────────┼─────────────────────┤
│ unpack_operator_flatten │        0.0349 │           15.565 │              0.4626 │
│ manual_flatten          │        0.4011 │          15.5658 │              0.4634 │
│ generator_flatten       │         0.374 │          15.5778 │              0.4735 │
│ polars_flatten          │        0.5101 │          23.8617 │              0.7061 │
│ flatdict_flatten        │        4.6239 │           35.135 │              1.0656 │
│ pandas_flatten          │        2.4207 │          67.9431 │              2.0543 │
│ dlt_flatten             │        2.6409 │          15.1458 │              2.4331 │
└─────────────────────────┴───────────────┴──────────────────┴───

In [6]:
duckdb.sql("""
SELECT 
    num_players,
    function,
    min(time_in_s) OVER(PARTITION BY num_players ORDER BY time_in_s ) as min_time,
    max(time_in_s) OVER(PARTITION BY num_players ORDER BY time_in_s ) as max_time,
    avg(time_in_s) OVER(PARTITION BY num_players ORDER BY time_in_s ) as avg_time_in_s,
FROM read_json('data/benchmark_results.json')
ORDER BY max_time DESC;
""")

┌─────────────┬─────────────────────────┬─────────────┬──────────────┬───────────────────────┐
│ num_players │        function         │  min_time   │   max_time   │     avg_time_in_s     │
│    int64    │         varchar         │   double    │    double    │        double         │
├─────────────┼─────────────────────────┼─────────────┼──────────────┼───────────────────────┤
│      100000 │ flatdict_flatten        │ 0.194838666 │ 24.955957042 │     8.449056970142857 │
│      100000 │ dlt_flatten             │ 0.194838666 │  13.90697375 │     5.697906958166667 │
│      100000 │ pandas_flatten          │ 0.194838666 │ 13.098255041 │          4.0560935998 │
│      100000 │ polars_flatten          │ 0.194838666 │  2.775296542 │    1.7955532394999998 │
│       10000 │ flatdict_flatten        │ 0.013035792 │  2.494499125 │    0.8321502324285713 │
│      100000 │ manual_flatten          │ 0.194838666 │  2.176526375 │    1.4689721386666665 │
│      100000 │ generator_flatten       │ 0.194838

# Visualizations

In [5]:
df = all_data.df()
print(type(df))
all_data.columns

<class 'pandas.core.frame.DataFrame'>


['num_players', 'function', 'time_in_s', 'memory_in_mb']

In [6]:
# define the template variables to be reused 

template='plotly_dark'
height=600
width=900
labels={
        'num_players': 'Number of players',
        'memory_in_mb': 'Memory usage (MB)',
        'function': 'Functions',
        'time_in_s': 'Execution time (seconds)'
    }

In [7]:
df = all_data.df()

df_max_players = df.loc[df['num_players'] >= 100000].copy()
df_max_players['time_in_s'] = df_max_players['time_in_s'].round(2)
df_max_players['memory_in_mb'] = df_max_players['memory_in_mb'].round(2) 

h_bar = px.scatter(
    df_max_players,
    title='Relation memory usage & execution time for 100k players (log scale)',
    x='time_in_s',
    y='memory_in_mb',
    color='function',
    size='memory_in_mb',
    size_max=50,
    log_x=True,
    log_y=True,
    template=template,
    labels=labels,
    height=height,
    width=width
)

h_bar.show()


## Memory usage


In [10]:
import plotly.express as px
import pandas as pd

# Assuming all_data is your DataFrame with columns: function, num_players, memory_in_mb
df = all_data.df()

# Create a line chart
fig = px.line(
    df,
    x='num_players',
    y='memory_in_mb',
    color='function',
    markers=True,  
    title='Memory Usage by Number of Players (log scale)',
    labels=labels,
    log_x=True,
    log_y=True,
    template=template,
    height=height,
    width=width
)

fig.show()


In [11]:
bar_time = px.bar(
    df_max_players.sort_values('memory_in_mb'),
    title='Memory usage for 100k players (in MB)',
    labels=labels,
    x='memory_in_mb',
    y='function',
    color='function',
    #barmode='group',
    text_auto='.3s',
    orientation='h',
    template=template,
    height=height,
    width=width
)

bar_time.show()

In [12]:
# create a df with only 100 & 100k players columns
df_hundred = df.loc[df['num_players'].isin([100, 100_000])]

# Create a grouped bar chart
fig = px.bar(
    df_hundred.sort_values('memory_in_mb'),
    x='function',
    y='memory_in_mb',
    color='function',
    text_auto='.2s',
    facet_col='num_players',  # Create separate subplot for each player count
    facet_col_wrap=2,  # Number of subplots per row
    title='Memory Usage by Function for Different Player Counts',
    labels=labels,
    template=template,
    height=height,
    width=width,
   
)
# Update facet titles to show player count more clearly
fig.for_each_annotation(lambda a: a.update(text=f"{a.text.split('=')[1]} players"))

# Adjust y-axis ranges independently
for i, num_players in enumerate([100, 100_000]):
    fig.update_yaxes(matches=None)  # Disable y-axis matching

fig.show()


In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Filter data for the two player counts
df_100 = df[df['num_players'] == 100].copy()
df_100k = df[df['num_players'] == 100_000].copy()

# Sort both dataframes by the same order (e.g., by function name)
df_100 = df_100.sort_values('function')
df_100k = df_100k.sort_values('function')

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])


# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces for 100 players (left y-axis)
fig.add_trace(
    go.Scatter(
        x=df_100['function'],
        y=df_100['memory_in_mb'],
        name="100 Players",
        mode='lines+markers',
        marker=dict(size=15),
        line=dict(width=2, color='pink')
    ),
    secondary_y=False,
)

# Add traces for 100,000 players (right y-axis)
fig.add_trace(
    go.Scatter(
        x=df_100k['function'],
        y=df_100k['memory_in_mb'],
        name="100,000 Players",
        mode='lines+markers',
        marker=dict(size=15),
        line=dict(width=2, color='red')
    ),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title_text="Memory Usage Comparison: 100 vs 100,000 Players",
    template=template,
    height=height,
    width=width
)

# Set y-axes titles
fig.update_yaxes(title_text="Memory (MB) - 100 Players", secondary_y=False)
fig.update_yaxes(title_text="Memory (MB) - 100,000 Players", secondary_y=True)

fig.show()


## Execution Time 


In [4]:
bar_time = px.bar(
    df_hundred.sort_values(by='time_in_s'),
    x='time_in_s',
    y='function',
    color='function',
    #barmode='group',
    text_auto='.3s',
    orientation='h',
    template=template,
    height=height,
    width=width
)

bar_time.show()

NameError: name 'df_hundred' is not defined

In [15]:
# Calculate time per player
df_100['time_per_player'] = df_100['time_in_s'] / 100
df_100k['time_per_player'] = df_100k['time_in_s'] / 100_000

# Create figure
fig = go.Figure()

# Add traces for time per player
fig.add_trace(
    go.Bar(
        x=df_100['function'],
        y=df_100['time_per_player'],
        name="100 Players",
        marker_color='pink',
        opacity=0.7
    )
)

fig.add_trace(
    go.Bar(
        x=df_100k['function'],
        y=df_100k['time_per_player'],
        name="100,000 Players",
        marker_color='red',
        opacity=0.7
    )
)

# Update layout
fig.update_layout(
    title_text="Time Efficiency (Seconds per Player)",
    template='plotly_dark',
    height=600,
    width=900,
    barmode='group',
    yaxis=dict(title="Time per Player (s)"),
    xaxis=dict(title="Function"),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

fig.show()


---

# Bonus: Analyse the loop style

In [16]:
duckdb.sql("""
SELECT 
    function,
    round(avg(time_in_s),4) as avg_time_in_s,
    round(median(time_in_s),4) as median_time_in_s,
    round(avg(memory_in_mb), 4) as avg_memory_in_mb,
    round(median(memory_in_mb), 4) as median_memory

FROM read_json('data/compare_loops_results.json')
GROUP BY 1
ORDER BY avg_time_in_s 
""")

┌─────────────────────────────────────┬───────────────┬──────────────────┬──────────────────┬───────────────┐
│              function               │ avg_time_in_s │ median_time_in_s │ avg_memory_in_mb │ median_memory │
│               varchar               │    double     │      double      │      double      │    double     │
├─────────────────────────────────────┼───────────────┼──────────────────┼──────────────────┼───────────────┤
│ flatdict_flatten_gen_comprehension  │         0.121 │           0.0057 │          18.6758 │         0.847 │
│ flatdict_flatten_list_comprehension │        0.1298 │           0.0056 │          18.6754 │        0.8465 │
│ flatdict_flatten                    │        5.1377 │           0.2339 │          42.1424 │        1.9194 │
└─────────────────────────────────────┴───────────────┴──────────────────┴──────────────────┴───────────────┘