In [10]:
import json
import time
import tracemalloc
from functools import wraps
from pathlib import Path

import dlt
import pandas as pd
import polars as pl
from flatdict import FlatDict

from main import generate_sample_data, benchmark, flatdict_flatten

# Argument handed to generate_sample_data (presumably the number of players
# to generate -- confirm against main.py).
num_players = 23
# Sample match payload used as the input of the flatteners defined below.
match_details = generate_sample_data(num_players)
# Quick check of the payload's container type (the recorded output is dict).
print(type(match_details))

<class 'dict'>


In [None]:
@benchmark
def manual_flatten(match_details):
    """
    Deliberately static reference implementation: every output field is
    written out by hand inside a single list comprehension.

    The player list is read from match_details['home']['teamsheet'] and is
    treated as empty when either key is absent. One flat dict is built per
    player: the identity fields come from the player dict itself, all other
    fields from its nested 'match_stats' dict. A field that is missing is
    reported as None.

    Note: the function is wrapped by the project's `benchmark` decorator, so
    what a caller finally receives depends on that decorator.
    """
    return [
        {
            # Identity fields live directly on the player entry.
            'player_id': entry.get('player_id'),
            'name': entry.get('name'),
            'position': entry.get('position'),
            'substitute': entry.get('substitute'),
            # Everything below is looked up in the nested 'match_stats' dict,
            # one explicit lookup per field (this is the "static" baseline).
            'points': entry.get('match_stats', {}).get('points'),
            'tries': entry.get('match_stats', {}).get('tries'),
            'turnovers_conceded': entry.get('match_stats', {}).get('turnovers_conceded'),
            'offload': entry.get('match_stats', {}).get('offload'),
            'dominant_tackles': entry.get('match_stats', {}).get('dominant_tackles'),
            'missed_tackles': entry.get('match_stats', {}).get('missed_tackles'),
            'tackle_success': entry.get('match_stats', {}).get('tackle_success'),
            'tackle_try_saver': entry.get('match_stats', {}).get('tackle_try_saver'),
            'tackle_turnover': entry.get('match_stats', {}).get('tackle_turnover'),
            'penalty_goals': entry.get('match_stats', {}).get('penalty_goals'),
            'missed_penalty_goals': entry.get('match_stats', {}).get('missed_penalty_goals'),
            'conversion_goals': entry.get('match_stats', {}).get('conversion_goals'),
            'missed_conversion_goals': entry.get('match_stats', {}).get('missed_conversion_goals'),
            'drop_goals_converted': entry.get('match_stats', {}).get('drop_goals_converted'),
            'drop_goal_missed': entry.get('match_stats', {}).get('drop_goal_missed'),
            'runs': entry.get('match_stats', {}).get('runs'),
            'metres': entry.get('match_stats', {}).get('metres'),
            'clean_breaks': entry.get('match_stats', {}).get('clean_breaks'),
            'defenders_beaten': entry.get('match_stats', {}).get('defenders_beaten'),
            'try_assists': entry.get('match_stats', {}).get('try_assists'),
            'passes': entry.get('match_stats', {}).get('passes'),
            'bad_passes': entry.get('match_stats', {}).get('bad_passes'),
            'rucks_won': entry.get('match_stats', {}).get('rucks_won'),
            'rucks_lost': entry.get('match_stats', {}).get('rucks_lost'),
            'lineouts_won': entry.get('match_stats', {}).get('lineouts_won'),
            'penalties_conceded': entry.get('match_stats', {}).get('penalties_conceded'),
        }
        for entry in match_details.get('home', {}).get('teamsheet', [])
    ]

#(manual_flatten(match_details))[:2]

In [None]:
def unpack_operator_flatten(match_details):
    """
    Flatten the home teamsheet with a list comprehension and the `**`
    unpacking operator.

    Args:
        match_details: dict whose 'home' entry holds a 'teamsheet' list of
            player dicts. A missing or null 'home' / 'teamsheet' is treated
            as an empty teamsheet.

    Returns:
        list[dict]: one dict per player holding 'player_id', 'name',
        'position' and 'substitute' (None when absent) followed by every
        key of the player's 'match_stats' dict. A missing or null
        'match_stats' contributes no extra keys. A stat key that collides
        with one of the four identity keys overrides it, because the stats
        are unpacked last.
    """
    # `or` rather than a .get() default: a default only applies when the key
    # is absent, whereas JSON-derived payloads may carry an explicit null.
    home = match_details.get('home') or {}
    player_list = home.get('teamsheet') or []

    player_stats = [
        {
            'player_id': player.get('player_id'),
            'name': player.get('name'),
            'position': player.get('position'),
            'substitute': player.get('substitute'),
            # Unpack the nested dictionary into the row; `or {}` guards
            # against a null value, which `**` cannot unpack.
            **(player.get('match_stats') or {}),
        }
        for player in player_list
    ]

    return player_stats

#unpack_operator_flatten(match_details)

In [None]:
def generator_flatten_comprehension(match_details):
    """
    Flatten the home teamsheet with a recursive generator.

    For every player whose 'match_stats' value is a mapping, the result row
    holds all of the player's other keys unchanged, followed by the
    flattened content of 'match_stats' (nested keys are joined with '_').
    Players without a mapping under 'match_stats' are left out of the result.
    """
    from collections.abc import MutableMapping

    def _walk(mapping, prefix, sep):
        # Lazily emit (flattened_key, leaf_value) pairs, depth first.
        for key, value in mapping.items():
            path = key if not prefix else prefix + sep + key
            if not isinstance(value, MutableMapping):
                yield path, value
                continue
            yield from _walk(value, path, sep)

    def _flatten(mapping: MutableMapping, parent_key: str = '', sep: str = '_'):
        """Materialise the generator output for one dictionary."""
        return dict(_walk(mapping, parent_key, sep))

    rows = []
    for player in match_details.get('home', {}).get('teamsheet', []):
        # Skip entries that have no usable stats mapping.
        if 'match_stats' not in player:
            continue
        stats = player['match_stats']
        if not isinstance(stats, MutableMapping):
            continue

        # Top-level fields first, then the flattened stats; on a key clash
        # the stats value wins while the key keeps its original position.
        row = {}
        for field, value in player.items():
            if field != 'match_stats':
                row[field] = value
        row.update(_flatten(stats, sep='_'))
        rows.append(row)

    return rows

#generator_flatten_comprehension(match_details)

In [36]:
# test with duckdb

import duckdb 
import json

# Create the intermediate JSON file that the DuckDB flattener reads back.
# The target folder is created first so this cell also runs on a fresh
# checkout where ./data does not exist yet.
generated_json = './data/generated_data.json'
Path(generated_json).parent.mkdir(parents=True, exist_ok=True)
with open(generated_json, 'w', encoding='utf-8') as f:
    json.dump(match_details, f)

def duckdb_flatten(generated_json):
    """
    Flatten the home teamsheet using SQL and DuckDB's `unnest` function.

    Args:
        generated_json: path of the JSON file holding the match payload; it
            is read with DuckDB's `read_json_auto`.

    Returns:
        list[dict]: the query result, converted to a pandas DataFrame and
        then to one dict per result row.
    """
    # The path is spliced into the SQL text as a string literal, so embedded
    # single quotes are doubled (standard SQL escaping). Without this a path
    # containing an apostrophe would end the literal early and break, or
    # change, the query.
    json_path = str(generated_json).replace("'", "''")

    # The CTE expands the `home` struct so that `teamsheet` becomes a column;
    # the outer query then unnests `teamsheet` recursively.
    player_stats = duckdb.sql(f"""
    with home as (
        select 
            unnest(home)
        from read_json_auto('{json_path}')
        )
        select
            unnest(teamsheet, recursive := true)
        from home;
    """
    ).df()

    return player_stats.to_dict(orient='records')

# Preview only the first two records instead of dumping every row into the
# notebook output.
duckdb_flatten(generated_json)[:2]

[{'player_id': 1,
  'name': 'Player 1',
  'position': 'Forward',
  'substitute': False,
  'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'missed_penalty_goals': 1,
  'conversion_goals': 1,
  'missed_conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'player_id': 2,
  'name': 'Player 2',
  'position': 'Forward',
  'substitute': False,
  'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover': 2,
  'penalty_goals': 0,
  'missed_penalty_goals': 0,
  'c

In [None]:
def pandas_flatten(match_details):
    """
    Flatten the home teamsheet using ``pandas.json_normalize``.

    Args:
        match_details: dict whose 'home' entry holds a 'teamsheet' list of
            player dicts; a missing key yields an empty result.

    Returns:
        list[dict]: one dict per player. Nested dicts are flattened by
        ``json_normalize`` with '_' as separator, and the leading
        'match_stats_' prefix is removed from the columns that came from the
        nested 'match_stats' dict.
    """
    stats_prefix = 'match_stats_'

    def strip_stats_prefix(column):
        # Only a *leading* prefix marks a column produced from `match_stats`.
        # The same text elsewhere in a name (e.g. 'prev_match_stats_total')
        # belongs to the field's own name and must be left untouched.
        if column.startswith(stats_prefix):
            return column[len(stats_prefix):]
        return column

    player_list = match_details.get('home', {}).get('teamsheet', [])
    df = pd.json_normalize(player_list, sep='_')
    df = df.rename(columns=strip_stats_prefix)

    return df.to_dict(orient='records')

#pandas_flatten(match_details)

## using JMESPath


In [None]:
# test with JMESPath 
# https://jmespath.org/tutorial.html

import jmespath

@benchmark
def jmespath_flatten(match_details):
    """
    Flatten the home teamsheet with JMESPath lookups.

    The player list is extracted with the 'home.teamsheet' query (an empty
    list is used when the query finds nothing). A list comprehension then
    builds one row per player: the four identity fields, followed by every
    key of the player's 'match_stats' value (nothing extra when that lookup
    is empty).

    Note: the function is wrapped by the project's `benchmark` decorator, so
    what a caller finally receives depends on that decorator.
    """
    teamsheet = jmespath.search('home.teamsheet', match_details) or []

    return [
        {
            'player_id': entry.get('player_id'),
            'name': entry.get('name'),
            'position': entry.get('position'),
            'substitute': entry.get('substitute'),
            # Merge the nested stats into the row; `or {}` covers a missing
            # or null 'match_stats'.
            **(jmespath.search('match_stats', entry) or {}),
        }
        for entry in teamsheet
    ]
   

# Run the JMESPath variant on the sample payload. The function is wrapped by
# `benchmark`, so the value displayed here is whatever that decorator returns.
jmespath_flatten(match_details)