In [132]:
import time
import tracemalloc
from functools import wraps
import pandas as pd
from flatdict import FlatDict
import dlt
import json
import polars as pl

from main import generate_sample_data, benchmark, flatdict_flatten

# Number of players requested from the sample-data generator.
num_players = 23
# Build the sample match structure that the flatten cells below operate on.
match_details = generate_sample_data(num_players)
# Sanity check on the container type; the recorded output shows a plain dict.
print(type(match_details))

<class 'dict'>


In [140]:
# test with duckdb

import duckdb 
import json
import polars as pl

def duckdb_flatten(match_details):
    """Flatten the home team's player stats using DuckDB SQL.

    The match dictionary is serialised to ``./data/generated_data.json`` and
    read back with ``read_json_auto``. The query first unnests the ``home``
    struct, then unnests its ``teamsheet`` column with ``recursive := true``
    so the nested fields of each entry come back as plain columns.

    Args:
        match_details: JSON-serialisable match dictionary; the query reads
            its ``home`` -> ``teamsheet`` content.

    Returns:
        list[dict]: the query result converted with
        ``DataFrame.to_dict(orient='records')`` (one dict per result row).

    Side effects:
        Creates ``./data`` if needed and (over)writes
        ``./data/generated_data.json`` on every call.
    """
    import os  # local import so the function does not rely on another cell

    generated_json = './data/generated_data.json'
    # Make sure the target directory exists; without this, open() raises
    # FileNotFoundError when the notebook runs in a fresh checkout.
    os.makedirs(os.path.dirname(generated_json), exist_ok=True)
    with open(generated_json, 'w') as f:
        json.dump(match_details, f)

    # unnest using duckdb sql
    player_stats = duckdb.sql(f"""
    with home as (
        select 
            unnest(home)
        from read_json_auto('{generated_json}')
        )
        select
            unnest(teamsheet, recursive := true)
        from home;
    """
    ).df()

    return player_stats.to_dict(orient='records')

# Run the DuckDB flattener on the sample match; the returned records are the cell output.
duckdb_flatten(match_details)

[{'player_id': 1,
  'name': 'Player 1',
  'position': 'Forward',
  'substitute': False,
  'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'missed_penalty_goals': 1,
  'conversion_goals': 1,
  'missed_conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'player_id': 2,
  'name': 'Player 2',
  'position': 'Forward',
  'substitute': False,
  'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover': 2,
  'penalty_goals': 0,
  'missed_penalty_goals': 0,
  'c

In [126]:
def pandas_flatten(match_details):
    """Flatten the home teamsheet into one record per player with pandas.

    Nested fields are expanded by ``pd.json_normalize`` using ``_`` as the
    separator, after which the ``match_stats_`` text is removed from every
    column name.

    Args:
        match_details: match dictionary; ``home`` -> ``teamsheet`` is read,
            with missing keys treated as an empty teamsheet.

    Returns:
        list[dict]: the flattened rows (``orient='records'``).
    """
    home_team = match_details.get('home', {})
    teamsheet = home_team.get('teamsheet', [])

    flat = pd.json_normalize(teamsheet, sep='_')
    # Explicit old-name -> new-name mapping for every normalised column.
    column_map = {col: col.replace('match_stats_', '') for col in flat.columns}

    return flat.rename(columns=column_map).to_dict(orient='records')

# Run the pandas flattener on the same sample match for comparison with the other approaches.
pandas_flatten(match_details)

[{'player_id': 1,
  'name': 'Player 1',
  'position': 'Forward',
  'substitute': False,
  'points': 5,
  'tries': 1,
  'turnovers_conceded': 1,
  'offload': 1,
  'dominant_tackles': 1,
  'missed_tackles': 1,
  'tackle_success': 0.86,
  'tackle_try_saver': 1,
  'tackle_turnover': 1,
  'penalty_goals': 1,
  'missed_penalty_goals': 1,
  'conversion_goals': 1,
  'missed_conversion_goals': 1,
  'drop_goals_converted': 0,
  'drop_goal_missed': 1,
  'runs': 6,
  'metres': 48,
  'clean_breaks': 1,
  'defenders_beaten': 1,
  'try_assists': 1,
  'passes': 11,
  'bad_passes': 1,
  'rucks_won': 1,
  'rucks_lost': 1,
  'lineouts_won': 1,
  'penalties_conceded': 1},
 {'player_id': 2,
  'name': 'Player 2',
  'position': 'Forward',
  'substitute': False,
  'points': 10,
  'tries': 2,
  'turnovers_conceded': 2,
  'offload': 2,
  'dominant_tackles': 2,
  'missed_tackles': 2,
  'tackle_success': 0.87,
  'tackle_try_saver': 0,
  'tackle_turnover': 2,
  'penalty_goals': 0,
  'missed_penalty_goals': 0,
  'c

In [None]:
# test with duckdb

import duckdb
import json
import fsspec

# Variant of the DuckDB test that reads the JSON through an fsspec "memory"
# filesystem instead of a file written under ./data.
conn = duckdb.connect() 
mem_fs = fsspec.filesystem("memory")

# 1. Pull the home teamsheet out of the match dict (empty list if absent)
player_list = match_details.get('home', {}).get('teamsheet', [])

# 2. Write the Python list to a JSON file in the memory filesystem
file_path = "players.json"
with mem_fs.open(file_path, 'w') as f:
    f.write(json.dumps(player_list))

# 3. Register the fsspec filesystem with the DuckDB connection
#    This teaches DuckDB how to handle the 'memory://' protocol
conn.register_filesystem(mem_fs)

# 4. Construct the query to read from the memory file
#    NOTE(review): `*` selects every column of t, so the original
#    match_stats column is presumably returned alongside the unnested
#    fields - confirm whether that duplication is intended.
query = f"""
WITH t as (
SELECT
    *
FROM read_json('memory://{file_path}')
)
select 
    *,
    unnest(match_stats)
from t
"""

# 5. Execute and fetch the results
#    NOTE(review): fetchall() returns raw rows rather than the list of dicts
#    the *_flatten functions return - confirm before comparing outputs.
result = conn.sql(query).fetchall()
result

## using JMESPath


In [None]:
# test with JMESPath 
# https://jmespath.org/tutorial.html

import jmespath

@benchmark
def jmespath_flatten(match_details):
    """
    Flatten the home teamsheet into one row per player.

    The player list is looked up with the JMESPath expression
    ``home.teamsheet``. Each row is built in a list comprehension from the
    player's identity fields followed by the entries of its ``match_stats``
    mapping. A missing teamsheet or missing ``match_stats`` is treated as
    empty.
    """
    identity_fields = ('player_id', 'name', 'position', 'substitute')
    roster = jmespath.search('home.teamsheet', match_details) or []

    return [
        {
            # Identity columns first so they lead each row, then the stats.
            **{field: entry.get(field) for field in identity_fields},
            **(jmespath.search('match_stats', entry) or {}),
        }
        for entry in roster
    ]
   

# Run the JMESPath flattener on the sample match (the function is wrapped by @benchmark).
jmespath_flatten(match_details)