In [2]:
import time
import tracemalloc
from functools import wraps
import pandas as pd
from flatdict import FlatDict
import dlt
import json
import polars as pl

from main import generate_sample_data, benchmark, flatdict_flatten

num_players = 100
match_details = generate_sample_data(num_players)
game_details = generate_sample_data(num_players)

In [4]:
def generate_sample_data(num_players):
    """
    Generates a match_details dictionary with rugby player individual stats.
    This imitate a real-life API endpoint.
    """
    players = []
    for i in range(1, num_players + 1):
        if 1 <= i <= 8:
            position = 'Forward'
        elif 9 <= i <= 15:
            position = 'Back'
        else: # Substitutes doesnt matter for the example
            position = 'Substitute'
        # logic to define if player is a substitute
        is_substitute = i > 15

        player = {
            'player_id': i,
            'name': f'Player {i}',
            'position': position,
            'substitute': is_substitute,
            'match_stats': {
                'points': (i % 3) * 5,
                'tries': i % 3,
                'turnovers_conceded': i % 4,
                'offload': i % 5,
                'dominant_tackles': i % 10,
                'missed_tackles': i % 5,
                'tackle_success': round(0.85 + (i % 15) / 100, 2),
                'tackle_try_saver': i % 2,
                'tackle_turnover': i % 3,
                'penalty_goals': i % 2,
                'missed_penalty_goals': i % 2,
                'conversion_goals': i % 4,
                'missed_conversion_goals': i % 4,
                'drop_goals_converted': i % 1,
                'drop_goal_missed': i % 2,
                'runs': i % 20 + 5,
                'metres': (i % 20 + 5) * 8,
                'clean_breaks': i % 4,
                'defenders_beaten': i % 6,
                'try_assists': i % 2,
                'passes': i % 30 + 10,
                'bad_passes': i % 5,
                'rucks_won': i % 15,
                'rucks_lost': i % 3,
                'lineouts_won': i % 4,
                'penalties_conceded': i % 3
            }
        }
        players.append(player)

    return {
        'match_id': 12345,
        'date': '2025-07-27',
        'venue': 'Small Mem Stadium',
        'home': {
            'team_id': 101,
            'team_name': 'The Bloody Ingestors',
            'teamsheet': players
        },
        'away': {}
    }

player_dict = generate_sample_data(num_players)

In [24]:
teamsheet = player_dict.get('home').get('teamsheet')
teamsheet = [
    {
    **player.get('match_stats')
    }
    for player in teamsheet
    ]

teamsheet

[{'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'missed_penalty_goals': 1,
  'conversion_goals': 1,
  'missed_conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover': 2,
  'penalty_goals': 0,
  'missed_penalty_goals': 0,
  'conversion_goals': 2,
  'missed_conversion_goals': 2,
  'drop_goals_converted': 0,
  'drop_goal_missed': 0,
  'runs': 7,
  'metres': 56,
  'clean_breaks': 2,
  'defenders_beaten

In [5]:
@benchmark
def manual_flatten(match_details):
    """
    A pure nonesense manual function that flattens the player stats using a list comprehension.
    Used as reference for very static way !
    """
    player_list = match_details.get('home', {}).get('teamsheet', [])

    player_stats = [
        {
            'player_id': player.get('player_id'),
            'name': player.get('name'),
            'position': player.get('position'),
            'substitute': player.get('substitute'),
            'points': player.get('match_stats', {}).get('points', None),
            'tries': player.get('match_stats', {}).get('tries', None),
            'turnovers_conceded': player.get('match_stats', {}).get('turnovers_conceded', None),
            'offload': player.get('match_stats', {}).get('offload', None),
            'dominant_tackles': player.get('match_stats', {}).get('dominant_tackles', None),
            'missed_tackles': player.get('match_stats', {}).get('missed_tackles', None),
            'tackle_success': player.get('match_stats', {}).get('tackle_success', None),
            'tackle_try_saver': player.get('match_stats', {}).get('tackle_try_saver', None),
            'tackle_turnover': player.get('match_stats', {}).get('tackle_turnover', None),
            'penalty_goals': player.get('match_stats', {}).get('penalty_goals', None),
            'missed_penalty_goals': player.get('match_stats', {}).get('missed_penalty_goals', None),
            'conversion_goals': player.get('match_stats', {}).get('conversion_goals', None),
            'missed_conversion_goals': player.get('match_stats', {}).get('missed_conversion_goals', None),
            'drop_goals_converted': player.get('match_stats', {}).get('drop_goals_converted', None),
            'drop_goal_missed': player.get('match_stats', {}).get('drop_goal_missed', None),
            'runs': player.get('match_stats', {}).get('runs', None),
            'metres': player.get('match_stats', {}).get('metres', None),
            'clean_breaks': player.get('match_stats', {}).get('clean_breaks', None),
            'defenders_beaten': player.get('match_stats', {}).get('defenders_beaten', None),
            'try_assists': player.get('match_stats', {}).get('try_assists', None),
            'passes': player.get('match_stats', {}).get('passes', None),
            'bad_passes': player.get('match_stats', {}).get('bad_passes', None),
            'rucks_won': player.get('match_stats', {}).get('rucks_won', None),
            'rucks_lost': player.get('match_stats', {}).get('rucks_lost', None),
            'lineouts_won': player.get('match_stats', {}).get('lineouts_won', None),
            'penalties_conceded': player.get('match_stats', {}).get('penalties_conceded', None)
        }
        for player in player_list
    ]

    return player_stats


print(len(manual_flatten(game_details)))
(manual_flatten(game_details))[:2]

100


[{'player_id': 1,
  'name': 'Player 1',
  'position': 'Forward',
  'substitute': False,
  'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'missed_penalty_goals': 1,
  'conversion_goals': 1,
  'missed_conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'player_id': 2,
  'name': 'Player 2',
  'position': 'Forward',
  'substitute': False,
  'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover': 2,
  'penalty_goals': 0,
  'missed_penalty_goals': 0,
  'c

In [None]:
def unpack_operator_flatten(match_details):
    """
    A manual function that flattens the player stats using 
    a pythonic list comprehension method and the unpack operator
    """
    player_list = match_details.get('home', {}).get('teamsheet', [])

    player_stats = [
        {
            'player_id': player.get('player_id'),
            'name': player.get('name'),
            'position': player.get('position'),
            'substitute': player.get('substitute'),
            **player.get('match_stats', {}),         # we use unpacking operator here on the the nested dictionary
        }
        for player in player_list
    ]

    return player_stats

unpack_operator_flatten(game_details)

In [34]:
def generator_flatten_comprehension(match_details):
    """
    An application-specific function that uses the python generator
    to process the player data.
    """
    from collections.abc import MutableMapping

    def flatten_gen(d, parent_key, sep):
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                yield from flatten_gen(v, new_key, sep)
            else:
                yield new_key, v

    # This is the user-facing utility function that starts the process.
    def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '_'):
        """A generic utility to flatten a single dictionary."""
        return dict(flatten_gen(d, parent_key, sep))

    player_list = match_details.get('home', {}).get('teamsheet', [])
   # Process each player with a list comprehension
    player_stats = [
        {**{k: v for k, v in player.items() if k != 'match_stats'}, 
         **flatten_dict(player['match_stats'], sep='_')} 
        for player in player_list 
        if 'match_stats' in player and isinstance(player['match_stats'], MutableMapping)
    ]
    
    return player_stats

generator_flatten_comprehension(game_details)

[]

In [4]:
def pandas_flatten(match_details):
    """Flattens player stats using pandas."""

    player_list = match_details.get('home', {}).get('teamsheet', [])
    df = pd.json_normalize(player_list, sep='_')
    df = df.rename(columns=lambda x: x.replace('match_stats_', ''))

    return df.to_dict(orient='records')

print(len(pandas_flatten(match_details)))
pandas_flatten(match_details)[:3]

100


[{'player_id': 1,
  'name': 'Player 1',
  'position': 'Forward',
  'substitute': False,
  'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'missed_penalty_goals': 1,
  'conversion_goals': 1,
  'missed_conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'player_id': 2,
  'name': 'Player 2',
  'position': 'Forward',
  'substitute': False,
  'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover': 2,
  'penalty_goals': 0,
  'missed_penalty_goals': 0,
  'c

In [6]:
def flatdict_flatten(match_details):
    """Flattens player stats using the flatdict library."""

    player_list = match_details.get('home', {}).get('teamsheet', [])
    player_stats = [
        player.update(FlatDict(player.pop('match_stats', {}), delimiter='_')) or player
        for player in player_list
    ]

    return player_stats

flatdict_flatten(match_details)

[{'player_id': 1,
  'name': 'Player 1',
  'position': 'Forward',
  'substitute': False,
  'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'missed_penalty_goals': 1,
  'missed_conversion_goals': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'player_id': 2,
  'name': 'Player 2',
  'position': 'Forward',
  'substitute': False,
  'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'missed_penalty_goals': 0,
  'missed_conversion_goals': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover

In [None]:
# test with JMESPath 
# https://jmespath.org/tutorial.html

import jmespath

@benchmark
def jmespath_flatten(match_details):
    """
    Flatten by first extracting the player list with JMESPath,
    then uses a Python list comprehension to perform the final merge.
    """
    teamsheet_query = 'home.teamsheet'
    player_list = jmespath.search(teamsheet_query, match_details) or []

    player_stats = []
    for player in player_list:
        
        stats_to_flatten = jmespath.search('match_stats', player) or {}
        
        new_player_row = {
            'player_id': player.get('player_id'),
            'name': player.get('name'),
            'position': player.get('position'),
            'substitute': player.get('substitute'),
            **stats_to_flatten
        }
        player_stats.append(new_player_row)
    return player_stats
   

jmespath_flatten(match_details)

In [11]:
# test with polars
@benchmark
def polars_flatten(match_details):
    """Flattens player stats using polars."""

    player_list = match_details.get('home', {}).get('teamsheet', [])
    df = pl.DataFrame(player_list)
    flat_df = df.unnest('match_stats')
    return flat_df.to_dicts()

print(len(polars_flatten(match_details)))

100


In [None]:
@benchmark
def jmespath_idiomatic_flatten(match_details):
    """
    Performs flattening using the idiomatic JMESPath hybrid pattern:
    1. Extracts all top-level player info in one query.
    2. Extracts all nested stats in a second query.
    3. Merges the two resulting lists in Python using zip().
    """
    
    # Query 1: Extract only the top-level keys from each player.
    # The multiselect hash `{...}` creates a new dictionary for each player.
    top_level_query = "home.teamsheet[].{player_id: player_id, name: name, position: position, substitute: substitute}"
    
    # Query 2: Extract ONLY the match_stats dictionary from each player.
    stats_query = "home.teamsheet[].match_stats"

    # --- EXECUTE EXTRACTION ---
    top_level_players = jmespath.search(top_level_query, match_details) or []
    all_match_stats = jmespath.search(stats_query, match_details) or []

    if not top_level_players:
        return []

    # --- PERFORM ASSEMBLY IN PYTHON ---
    # zip() pairs the first top_level_player with the first stats_dict, and so on.
    # The final list comprehension merges each pair into a new flat dictionary.
    return [
        {**(top_level or {}), **(stats or {})}
        for top_level, stats in zip(top_level_players, all_match_stats)
    ]
    
jmespath_idiomatic_flatten(match_details)

[]

In [None]:
# test with duckdb
"""
import duckdb
import json
import fsspec

player_list = match_details.get('home', {}).get('teamsheet', [])


conn = duckdb.connect() 
mem_fs = fsspec.filesystem("memory")

# 2. Write the Python list to a JSON file in the memory filesystem
file_path = "players.json"
with mem_fs.open(file_path, 'w') as f:
    f.write(json.dumps(player_list))

# 3. Register the fsspec filesystem with the DuckDB connection
#    This teaches DuckDB how to handle the 'memory://' protocol
conn.register_filesystem(mem_fs)

# 4. Construct the query to read from the memory file
#    The FROM clause is the only part that changes.
query = f"""
WITH t as (
SELECT
    *
FROM read_json('memory://{file_path}')
)
select 
    *,
    unnest(match_stats)
from t
"""

# 5. Execute and fetch the results
result = conn.sql(query).fetchall()

result
"""


[(1,
  'Player 1',
  'Forward',
  False,
  {'points': 5,
   'tries': 1,
   'turnovers_conceded': 1,
   'offload': 1,
   'dominant_tackles': 1,
   'missed_tackles': 1,
   'tackle_success': 0.86,
   'tackle_try_saver': 1,
   'tackle_turnover': 1,
   'penalty_goals': 1,
   'missed_penalty_goals': 1,
   'conversion_goals': 1,
   'missed_conversion_goals': 1,
   'drop_goals_converted': 0,
   'drop_goal_missed': 1,
   'runs': 6,
   'metres': 48,
   'clean_breaks': 1,
   'defenders_beaten': 1,
   'try_assists': 1,
   'passes': 11,
   'bad_passes': 1,
   'rucks_won': 1,
   'rucks_lost': 1,
   'lineouts_won': 1,
   'penalties_conceded': 1},
  5,
  1,
  1,
  1,
  1,
  1,
  0.86,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  6,
  48,
  1,
  1,
  1,
  11,
  1,
  1,
  1,
  1,
  1),
 (2,
  'Player 2',
  'Forward',
  False,
  {'points': 10,
   'tries': 2,
   'turnovers_conceded': 2,
   'offload': 2,
   'dominant_tackles': 2,
   'missed_tackles': 2,
   'tackle_success': 0.87,
   'tackle_try_saver': 0,
   

In [None]:
@benchmark
def duckdb_flatten(match_details):
    """
    Performs the flattening operation using DuckDB's embedded SQL engine.
    """
    player_list = match_details.get('home', {}).get('teamsheet', [])

    if not player_list:
        return []

    query = f"""
         SELECT 
            UNNEST(
                FROM_JSON({player_list}, '["JSON"]')
            )
            """
    try:
        result = duckdb.sql(query).to_df().to_dict(orient='records')
    except Exception as e:
        print(f"DuckDB Error: {e}")
        return []
        
    return result