In [6]:
import json
import time
import tracemalloc
from functools import wraps

import dlt
import duckdb
import fsspec
import pandas as pd
from flatdict import FlatDict

num_players = 23

def generate_sample_data(num_players):
    """
    Build a synthetic match_details payload holding per-player rugby stats.

    The structure imitates what a real-life API endpoint could return: match
    metadata, a populated home side and an empty away side. Players are
    numbered 1..num_players; every statistic is derived deterministically
    from that number, so repeated calls yield identical data.
    """

    def _position_for(shirt):
        # Shirt numbers start at 1, so checking the upper bounds is enough:
        # 1-8 are forwards, 9-15 are backs, everything above is a substitute.
        if shirt > 15:
            return 'Substitute'
        if shirt > 8:
            return 'Back'
        return 'Forward'

    def _build_player(shirt):
        carries = shirt % 20 + 5
        stats = {
            'points': (shirt % 3) * 5,
            'tries': shirt % 3,
            'turnovers_conceded': shirt % 4,
            'offload': shirt % 5,
            'dominant_tackles': shirt % 10,
            'missed_tackles': shirt % 5,
            'tackle_success': round(0.85 + (shirt % 15) / 100, 2),
            'tackle_try_saver': shirt % 2,
            'tackle_turnover': shirt % 3,
            'penalty_goals': shirt % 2,
            'missed_penalty_goals': shirt % 2,
            'conversion_goals': shirt % 4,
            'missed_conversion_goals': shirt % 4,
            # Modulo 1 is always 0, so this stat is zero for every player.
            'drop_goals_converted': shirt % 1,
            'drop_goal_missed': shirt % 2,
            'runs': carries,
            'metres': carries * 8,
            'clean_breaks': shirt % 4,
            'defenders_beaten': shirt % 6,
            'try_assists': shirt % 2,
            'passes': shirt % 30 + 10,
            'bad_passes': shirt % 5,
            'rucks_won': shirt % 15,
            'rucks_lost': shirt % 3,
            'lineouts_won': shirt % 4,
            'penalties_conceded': shirt % 3
        }
        return {
            'player_id': shirt,
            'name': f'Player {shirt}',
            'position': _position_for(shirt),
            # Anyone outside the starting fifteen is a substitute.
            'substitute': shirt > 15,
            'match_stats': stats
        }

    teamsheet = [_build_player(shirt) for shirt in range(1, num_players + 1)]

    home_side = {
        'team_id': 101,
        'team_name': 'The Bloody Ingestors',
        'teamsheet': teamsheet
    }
    return {
        'match_id': 12345,
        'date': '2025-07-27',
        'venue': 'Small Mem Stadium',
        'home': home_side,
        'away': {}
    }

# --- Benchmarking Setup ---
# One record is appended here for every successful call of a @benchmark function.
results_list = []

def benchmark(func):
    """Decorator that measures and stores the performance of each call.

    For every call that returns normally, a dict with the keys 'function'
    (the wrapped function's name), 'time_in_s' (wall-clock seconds) and
    'memory_in_mb' (peak traced memory, in units of 10**6 bytes) is appended
    to the module-level results_list. The wrapped function's return value is
    passed through unchanged.

    If the wrapped function raises, the exception propagates, no record is
    stored, and memory tracing is still switched off again.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Only own the tracemalloc session when nobody else started one;
        # otherwise a nested benchmarked call would stop the outer tracing
        # and the outer measurement would be lost.
        started_here = not tracemalloc.is_tracing()
        if started_here:
            tracemalloc.start()
        try:
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            end_time = time.perf_counter()
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always release the tracer, even when func raised, so a failing
            # benchmark does not leave tracing enabled for the whole kernel.
            if started_here:
                tracemalloc.stop()
        results_list.append({
            'function': func.__name__,
            'time_in_s': end_time - start_time,
            'memory_in_mb': peak / 10**6,
        })
        return result
    return wrapper

In [94]:
# Exploration: read the home teamsheet through DuckDB from an in-memory JSON file.
# duckdb, fsspec and json are imported in the notebook's first (imports) cell.
match_details = generate_sample_data(num_players)

player_list = match_details.get('home', {}).get('teamsheet', [])

# 1. Open an in-memory DuckDB database and an fsspec in-memory filesystem
conn = duckdb.connect()
mem_fs = fsspec.filesystem("memory")

# 2. Write the Python list to a JSON file in the memory filesystem
file_path = "players.json"
with mem_fs.open(file_path, 'w') as f:
    json.dump(player_list, f)

# 3. Register the fsspec filesystem with the DuckDB connection
#    This teaches DuckDB how to handle the 'memory://' protocol
conn.register_filesystem(mem_fs)

# 4. Construct the query to read from the memory file
#    NOTE(review): `*` keeps the nested match_stats struct next to the columns
#    that unnest(match_stats) expands it into; use `* EXCLUDE (match_stats)`
#    if only the flat columns are wanted.
query = f"""
WITH t as (
SELECT
    *
FROM read_json('memory://{file_path}')
)
select 
    *,
    unnest(match_stats)
from t
"""

# 5. Execute and fetch the results (one tuple per player)
result = conn.sql(query).fetchall()

# Show only a small preview instead of dumping every row into the notebook.
result[:3]

[(1,
  'Player 1',
  'Forward',
  False,
  {'points': 5,
   'tries': 1,
   'turnovers_conceded': 1,
   'offload': 1,
   'dominant_tackles': 1,
   'missed_tackles': 1,
   'tackle_success': 0.86,
   'tackle_try_saver': 1,
   'tackle_turnover': 1,
   'penalty_goals': 1,
   'missed_penalty_goals': 1,
   'conversion_goals': 1,
   'missed_conversion_goals': 1,
   'drop_goals_converted': 0,
   'drop_goal_missed': 1,
   'runs': 6,
   'metres': 48,
   'clean_breaks': 1,
   'defenders_beaten': 1,
   'try_assists': 1,
   'passes': 11,
   'bad_passes': 1,
   'rucks_won': 1,
   'rucks_lost': 1,
   'lineouts_won': 1,
   'penalties_conceded': 1},
  5,
  1,
  1,
  1,
  1,
  1,
  0.86,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  6,
  48,
  1,
  1,
  1,
  11,
  1,
  1,
  1,
  1,
  1),
 (2,
  'Player 2',
  'Forward',
  False,
  {'points': 10,
   'tries': 2,
   'turnovers_conceded': 2,
   'offload': 2,
   'dominant_tackles': 2,
   'missed_tackles': 2,
   'tackle_success': 0.87,
   'tackle_try_saver': 0,
   

In [None]:
@benchmark
def duckdb_flatten(match_details):
    """
    Expands the home teamsheet into one record per player using DuckDB's
    embedded SQL engine.

    The teamsheet is serialized with json.dumps and bound to the query as a
    prepared-statement parameter. Interpolating the Python list directly into
    the SQL text would embed its repr (single-quoted keys, True/False/None),
    which is not JSON and breaks on values containing quotes.

    Args:
        match_details: dict expected to hold the player list under
            match_details['home']['teamsheet'].

    Returns:
        A list of dicts (DataFrame.to_dict(orient='records')) with one entry
        per unnested player, or an empty list when there is no teamsheet or
        the query fails (the error is printed).
    """
    # `or` fallbacks also cover keys that are present but set to None.
    player_list = (match_details.get('home') or {}).get('teamsheet') or []

    if not player_list:
        return []

    # '["JSON"]' asks FROM_JSON for a list of JSON values; UNNEST turns that
    # list into one row per player.
    query = """
         SELECT 
            UNNEST(
                FROM_JSON(CAST(? AS VARCHAR), '["JSON"]')
            )
            """
    try:
        payload = json.dumps(player_list)
        result = duckdb.execute(query, [payload]).df().to_dict(orient='records')
    except Exception as e:
        # Best-effort by design: report the failure and return no rows.
        print(f"DuckDB Error: {e}")
        return []

    return result