# Analyse results of benchmark

And because i love it, we will do it using SQL & duckdb, yeah!

In [1]:
import duckdb
import pandas as pd
import plotly.express as px
import seaborn

In [2]:
all_data = duckdb.sql("FROM read_json('./data/benchmark_results.json');")
all_data

┌─────────────┬─────────────────────────┬──────────────┬──────────────┐
│ num_players │        function         │  time_in_s   │ memory_in_mb │
│    int64    │         varchar         │    double    │    double    │
├─────────────┼─────────────────────────┼──────────────┼──────────────┤
│          23 │ pandas_flatten          │   0.00797825 │     0.117009 │
│          23 │ manual_flatten          │   0.00052525 │     0.020264 │
│          23 │ generator_flatten       │  0.000573125 │     0.038011 │
│          23 │ unpack_operator_flatten │   3.8083e-05 │     0.019464 │
│          23 │ flatdict_flatten        │  0.005893792 │     0.066489 │
│          23 │ dlt_flatten             │  0.247817834 │     3.737361 │
│         100 │ pandas_flatten          │  0.011148958 │     0.404842 │
│         100 │ manual_flatten          │  0.002056917 │        0.085 │
│         100 │ generator_flatten       │  0.001848416 │     0.095168 │
│         100 │ unpack_operator_flatten │  0.000132291 │       0

In [3]:
duckdb.sql(f"""
SELECT 
    function,
    round(avg(time_in_s),4) as avg_time_in_s,
    round(avg(memory_in_mb), 4) as avg_memory_in_mb,
    round(max(memory_in_mb), 4) as max_memory_in_mb
FROM read_json('data/benchmark_results.json')
GROUP BY ALL
ORDER BY max_memory_in_mb 
""")

┌─────────────────────────┬───────────────┬──────────────────┬──────────────────┐
│        function         │ avg_time_in_s │ avg_memory_in_mb │ max_memory_in_mb │
│         varchar         │    double     │      double      │      double      │
├─────────────────────────┼───────────────┼──────────────────┼──────────────────┤
│ dlt_flatten             │        5.0462 │          18.0581 │          77.6919 │
│ unpack_operator_flatten │        0.0292 │          18.6702 │          84.0011 │
│ manual_flatten          │        0.4723 │           18.671 │          84.0019 │
│ generator_flatten       │        0.4121 │          18.6827 │           84.012 │
│ flatdict_flatten        │        5.3928 │          42.1399 │         189.5238 │
│ pandas_flatten          │        2.8826 │          81.4916 │          366.539 │
└─────────────────────────┴───────────────┴──────────────────┴──────────────────┘

In [4]:
duckdb.sql("""
SELECT 
    function,
    round(avg(time_in_s),4) as avg_time_in_s,
    round(avg(memory_in_mb), 4) as avg_memory_in_mb,
    round(median(memory_in_mb), 4) as median_memory_in_mb
FROM read_json('data/benchmark_results.json')
GROUP BY 1
ORDER BY median_memory_in_mb 
""")

┌─────────────────────────┬───────────────┬──────────────────┬─────────────────────┐
│        function         │ avg_time_in_s │ avg_memory_in_mb │ median_memory_in_mb │
│         varchar         │    double     │      double      │       double        │
├─────────────────────────┼───────────────┼──────────────────┼─────────────────────┤
│ unpack_operator_flatten │        0.0292 │          18.6702 │              0.8409 │
│ manual_flatten          │        0.4723 │           18.671 │              0.8417 │
│ generator_flatten       │        0.4121 │          18.6827 │              0.8519 │
│ flatdict_flatten        │        5.3928 │          42.1399 │              1.9187 │
│ pandas_flatten          │        2.8826 │          81.4916 │               3.704 │
│ dlt_flatten             │        5.0462 │          18.0581 │              3.7374 │
└─────────────────────────┴───────────────┴──────────────────┴─────────────────────┘

In [15]:
duckdb.sql("""
SELECT 
    num_players,
    function,
    min(time_in_s) OVER(PARTITION BY num_players ORDER BY time_in_s ) as min_time,
    max(time_in_s) OVER(PARTITION BY num_players ORDER BY time_in_s ) as max_time,
    avg(time_in_s) OVER(PARTITION BY num_players ORDER BY time_in_s ) as avg_time_in_s,
FROM read_json('data/benchmark_results.json')
ORDER BY max_time DESC;
""")

┌─────────────┬─────────────────────────┬─────────────┬──────────────┬───────────────────────┐
│ num_players │        function         │  min_time   │   max_time   │     avg_time_in_s     │
│    int64    │         varchar         │   double    │    double    │        double         │
├─────────────┼─────────────────────────┼─────────────┼──────────────┼───────────────────────┤
│      100000 │ flatdict_flatten        │ 0.132211208 │ 24.256251291 │    10.613212326166666 │
│      100000 │ dlt_flatten             │ 0.132211208 │ 22.349448792 │     7.884604533199999 │
│      100000 │ pandas_flatten          │ 0.132211208 │ 12.962179625 │     4.268393468499999 │
│       10000 │ flatdict_flatten        │ 0.012166459 │  2.430822209 │    1.0793975001666667 │
│       10000 │ dlt_flatten             │ 0.012166459 │  2.322297875 │    0.8091125584000001 │
│      100000 │ manual_flatten          │ 0.132211208 │  2.126534375 │    1.3704647496666666 │
│      100000 │ generator_flatten       │ 0.132211

# Visualizations

In [14]:
df = all_data.df()
print(type(df))
all_data.columns

<class 'pandas.core.frame.DataFrame'>


['num_players', 'function', 'time_in_s', 'memory_in_mb']

In [None]:
template='plotly_dark'

df = 
h_bar = px.scatter(
    df,
    x='time_in_s',
    y='memory_in_mb',
    color='function',
    orientation='h',
    #barmode='group',
    template=template
)
h_bar.show()


In [10]:

line_chart = px.line(
    df,
    x='time_in_s',
    y='memory_in_mb',
    color='function'

)
line_chart.show()

---

# Bonus: Analyse the loop style

In [9]:
duckdb.sql("""
SELECT 
    function,
    round(avg(time_in_s),4) as avg_time_in_s,
    round(median(time_in_s),4) as median_time_in_s,
    round(avg(memory_in_mb), 4) as avg_memory_in_mb,
    round(median(memory_in_mb), 4) as median_memory

FROM read_json('data/compare_loops_results.json')
GROUP BY 1
ORDER BY avg_time_in_s 
""")

┌─────────────────────────────────────┬───────────────┬──────────────────┬──────────────────┬───────────────┐
│              function               │ avg_time_in_s │ median_time_in_s │ avg_memory_in_mb │ median_memory │
│               varchar               │    double     │      double      │      double      │    double     │
├─────────────────────────────────────┼───────────────┼──────────────────┼──────────────────┼───────────────┤
│ flatdict_flatten_gen_comprehension  │         0.121 │           0.0057 │          18.6758 │         0.847 │
│ flatdict_flatten_list_comprehension │        0.1298 │           0.0056 │          18.6754 │        0.8465 │
│ flatdict_flatten                    │        5.1377 │           0.2339 │          42.1424 │        1.9194 │
└─────────────────────────────────────┴───────────────┴──────────────────┴──────────────────┴───────────────┘