In [1]:
import duckdb
import os

# Wasabi (S3-compatible) connection settings are read from the environment so
# that no credential is hardcoded in the notebook.
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Fail fast on missing configuration. Without this check a missing variable is
# interpolated as the literal string 'None' into the SET statements and into
# every s3:// path built later, and only surfaces as an obscure S3 error.
_required_env = {
    'WASABI_ACCESS_KEY': wasabi_access_key,
    'WASABI_SECRET_KEY': wasabi_secret_key,
    'WASABI_BUCKET_NAME': bucket_name,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )


def _sql_quote(value):
    """Return `value` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a credential containing a
    quote cannot terminate the literal early.
    """
    return "'" + value.replace("'", "''") + "'"


# Create DuckDB connection (in-memory database).
con = duckdb.connect()

# Configure S3 settings on this connection.
con.execute(f"""
    SET s3_endpoint={_sql_quote(wasabi_endpoint)};
    SET s3_access_key_id={_sql_quote(wasabi_access_key)};
    SET s3_secret_access_key={_sql_quote(wasabi_secret_key)};
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

# Never echo credentials: cell output is saved inside the notebook file and is
# committed/shared with it. Show only a short prefix of the access key id (enough
# to tell which key is in use) and nothing of the secret.
_masked_access_key = wasabi_access_key[:4] + '*' * (len(wasabi_access_key) - 4)

print("✓ DuckDB configured with S3 credentials")
print(f"Endpoint: {wasabi_endpoint}")
print(f"Access Key ID: {_masked_access_key}")
print("Secret Access Key: <redacted>")
print('Bucket Name: ', bucket_name, '\n\n')

✓ DuckDB configured with S3 credentials
Endpoint: s3.us-east-2.wasabisys.com
Access Key ID: 60DL****************
Secret Access Key: <redacted>
Bucket Name:  dfscrunch-data-lake 




In [2]:
# --- Slate selection -------------------------------------------------------
SPORT = "NFL"                 # e.g. "NFL", "NBA", ...
DATE = "2025-10-02"           # YYYY-MM-DD
GAME_TYPE = "dk_single_game"  # e.g. "classic", "dk_single_game", ...

# Largest single JSON object the reader may parse. The default is 16MB; this
# raises it to 64MB, expressed in bytes.
MAX_OBJECT_SIZE = 64 * 1024 * 1024

# Location of the gzipped contest_analyze export for the slate chosen above.
contest_analyze_path = f"s3://{bucket_name}/staging/{SPORT}/contest_analyze/{GAME_TYPE}/{DATE}/data.json.gz"

In [3]:
# Ask DuckDB for the column names and types it infers for the contest_analyze
# export; the result is kept as a pandas DataFrame for display.
describe_contest_sql = f"""
    DESCRIBE
    SELECT *
    FROM read_json_auto('{contest_analyze_path}', maximum_object_size={MAX_OBJECT_SIZE})
    limit 2
    """
sample_df_description = con.execute(describe_contest_sql).df()

In [4]:
# Display the inferred schema: one row per top-level column of the export.
sample_df_description

Unnamed: 0,column_name,column_type,null,key,default,extra
0,contest,"STRUCT(contestId BIGINT, contestName VARCHAR, ...",YES,,,
1,players,"STRUCT(""926:0"" STRUCT(playerId BIGINT, firstNa...",YES,,,
2,users,"MAP(VARCHAR, STRUCT(userId VARCHAR, totalPlaye...",YES,,,
3,salaries,"STRUCT(""100"" STRUCT(salaryCounts STRUCT(""49100...",YES,,,
4,flexUsage,"STRUCT(""100"" STRUCT(flexCounts STRUCT(RB STRUC...",YES,,,
5,cptUsage,"STRUCT(""100"" STRUCT(cptCounts STRUCT(RB STRUCT...",YES,,,
6,cptBreakdown,"STRUCT(""100"" STRUCT(favorite BIGINT, underdog ...",YES,,,
7,favoriteUsage,"STRUCT(""100"" STRUCT(favoriteCounts STRUCT(""2:4...",YES,,,
8,homeUsage,"STRUCT(""100"" STRUCT(homeCounts STRUCT(""2:4"" ST...",YES,,,
9,exposures,"STRUCT(""100"" STRUCT(exposureCounts STRUCT(""119...",YES,,,


In [5]:
# Parquet location of the flattened players table for the selected slate
# (written by the COPY cell and read back with read_parquet).
dds_path = f"s3://{bucket_name}/dds/{SPORT}/players/{GAME_TYPE}/{DATE}/data.parquet"

In [None]:
# Flatten the `players` object of the contest_analyze export into one row per
# player key (via json_each), cast the numeric fields, de-duplicate, and write
# the result to `dds_path` as Snappy-compressed Parquet.
export_players_sql = f"""
    COPY
        (SELECT DISTINCT
            kv.key as player_key,
            (kv.value->>'playerId')::INTEGER as player_id,
            kv.value->>'firstName' as first_name,
            kv.value->>'lastName' as last_name,
            kv.value->>'fullName' as full_name,
            (kv.value->>'salary')::INTEGER as salary,
            kv.value->>'position' as position,
            kv.value->>'rosterPosition' as roster_position,
            kv.value->>'currentTeam' as current_team,
            (kv.value->>'currentTeamId')::INTEGER as current_team_id,
            (kv.value->>'eventId')::INTEGER as event_id,
            (kv.value->>'eventTeamId')::INTEGER as event_team_id,
            kv.value->>'homeVisitor' as home_visitor,
            kv.value->>'favDog' as fav_dog,
            (kv.value->>'projPoints')::DOUBLE as proj_points,
            (kv.value->>'ownership')::DOUBLE as ownership,
            (kv.value->>'actualPoints')::DOUBLE as actual_points,
            kv.value->>'statDetails' as stat_details,
            (kv.value->>'madeCut')::INTEGER as made_cut
        FROM read_json_auto('{contest_analyze_path}', maximum_object_size={MAX_OBJECT_SIZE}),
             json_each(players) as kv
        ) TO '{dds_path}'
        (FORMAT PARQUET, COMPRESSION 'SNAPPY')
"""
con.execute(export_players_sql)

In [None]:
# Read back the player attributes needed downstream from the Parquet export.
# No row filter is applied.
# TODO(review): a filter on proj_points may have been intended here; confirm
# and add an explicit, complete predicate if so.
players_df = con.execute(f"""
    SELECT
        player_id,
        position,
        roster_position,
        actual_points
    FROM
        read_parquet('{dds_path}')
""").df()

In [None]:
# Preview only the first rows, consistent with the other preview cells,
# instead of rendering the whole frame into the saved notebook output.
players_df.head(10)

In [6]:
# Step 1: stage the `contest` and `users` columns of the export in a DuckDB
# table so later queries can reuse them via tmp_table.
stage_contest_sql = f"""
    CREATE OR REPLACE TABLE tmp_table AS
    SELECT contest, users
    FROM read_json_auto('{contest_analyze_path}', maximum_object_size={MAX_OBJECT_SIZE})
"""
con.execute(stage_contest_sql)

# Step 2: expand the `users` object into one row per entry (json_each) and
# cast the per-user statistics to numeric types.
users_sql = """
    select
      contest ->> 'contestId' as contest_id,
      kv.value ->> 'userId' as user_id,
      (kv.value ->> 'totalPlayers')::INTEGER as total_players,
      (kv.value ->> 'totalRosters')::INTEGER as total_rosters,
      (kv.value ->> 'uniqueRosters')::INTEGER as unique_rosters,
      (kv.value ->> 'maxExposure')::DOUBLE as max_exposure,
      (kv.value ->> 'lineupsCashing')::INTEGER as lineups_cashing,
      (kv.value ->> 'lineupsInPercentile1')::INTEGER as lineups_in_percentile_1,
      (kv.value ->> 'lineupsInPercentile2')::INTEGER as lineups_in_percentile_2,
      (kv.value ->> 'lineupsInPercentile5')::INTEGER as lineups_in_percentile_5,
      (kv.value ->> 'lineupsInPercentile10')::INTEGER as lineups_in_percentile_10,
      (kv.value ->> 'lineupsInPercentile20')::INTEGER as lineups_in_percentile_20,
      (kv.value ->> 'lineupsInPercentile50')::INTEGER as lineups_in_percentile_50,
      (kv.value ->> 'totalEntryCost')::DOUBLE as total_entry_cost,
      (kv.value ->> 'totalWinning')::DOUBLE as total_winning,
      (kv.value ->> 'roi')::DOUBLE as roi
    FROM tmp_table,
       json_each(users) as kv
"""
users_df = con.execute(users_sql).df()
users_df.head(5)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,contest_id,user_id,total_players,total_rosters,unique_rosters,max_exposure,lineups_cashing,lineups_in_percentile_1,lineups_in_percentile_2,lineups_in_percentile_5,lineups_in_percentile_10,lineups_in_percentile_20,lineups_in_percentile_50,total_entry_cost,total_winning,roi
0,182808315,Bulvai911,6,1,1,100.0,0,0,0,0,0,0,0,20.0,0.0,-20.0
1,182808315,crimsonwolverine,6,1,1,100.0,0,0,0,0,0,0,0,20.0,0.0,-20.0
2,182808315,Kaseem714,6,1,1,100.0,0,0,0,0,0,0,0,20.0,0.0,-20.0
3,182808315,Springfield-25lee,6,1,1,100.0,0,0,0,0,0,0,0,20.0,0.0,-20.0
4,182808315,spades57,5,1,1,100.0,0,0,0,0,0,0,0,20.0,0.0,-20.0


In [4]:
# Report the thread count and memory limits the current connection runs with.
settings_sql = """
    -- Check current DuckDB settings
SELECT
    current_setting('threads') as threads,
    current_setting('memory_limit') as memory_limit,
    current_setting('max_memory') as max_memory;
    """
con.execute(settings_sql).df()

Unnamed: 0,threads,memory_limit,max_memory
0,16,50.1 GiB,50.1 GiB


In [13]:
# Two statements are sent in one call: the first materialises one row per user
# from tmp_table (which must already exist), the second expands each user's
# `lineups` array into one row per lineup. The DataFrame holds the rows of the
# final SELECT.
user_lineups_sql = """
   -- Create a temporary table with pre-parsed users data
CREATE OR REPLACE TABLE users_parsed AS
SELECT
    contest ->> 'contestId' as contest_id,
    u.value as user_data
FROM tmp_table,
    json_each(users) as u;

-- Then query lineups from the parsed table
SELECT
    contest_id::INTEGER as contest_id,
    (user_data ->> 'userId')::TEXT as user_id,
    lineup.value ->> 'lineupHash' as lineup_hash,
    (lineup.value ->> 'lineupCt')::INTEGER as lineup_ct
FROM users_parsed,
    json_each(user_data->'lineups') as lineup;
"""
lineups_df = con.execute(user_lineups_sql).df()
lineups_df.head(10)

Unnamed: 0,contest_id,user_id,lineup_hash,lineup_ct
0,182808315,Maloney42084,93290:33213:926:40377:25841:40805,1
1,182808315,Cemoto,93290:33213:926:40377:25841:40805,1
2,182808315,bchlvr160,93290:33213:926:40377:25841:40805,1
3,182808315,Whiterock,93290:33213:926:52089:25841:47245,1
4,182808315,djzato,926:93290:6957:33513:8491:139489,1
5,182808315,randy007mosley1992,926:93290:6957:8491:25841:139489,1
6,182808315,RyanLoudis,926:93290:6957:33513:8491:139489,1
7,182808315,kml650,926:93290:6957:33513:8491:139489,1
8,182808315,elite3200,4337:93290:33213:6957:926:8491,1
9,182808315,holtzhustler,4337:93290:33213:6957:926:8491,1


In [14]:
# Parquet location for the lineups table under dds/. Not referenced by the
# cells visible here; presumably the write destination (TODO confirm).
lineups_path = f"s3://{bucket_name}/dds/{SPORT}/lineups/{GAME_TYPE}/{DATE}/data.parquet"
# Gzipped JSON lineups export under staging/ that the lineup queries read from.
staging_lineups_path = f"s3://{bucket_name}/staging/{SPORT}/lineups/{GAME_TYPE}/{DATE}/data.json.gz"

In [23]:
# First, get all unique position keys from the lineups data
positions_query = f"""
    SELECT DISTINCT
        pos_key.key as position_key
    FROM read_json_auto('{staging_lineups_path}', maximum_object_size={MAX_OBJECT_SIZE}),
        json_each(lineups -> 'lineupPlayers') as pos_key
    ORDER BY position_key
"""

positions_df = con.execute(positions_query).df()
position_keys = positions_df['position_key'].tolist()

# Create dynamic position columns
position_columns = ", ".join([
    f"(lineups -> 'lineupPlayers' ->> '{pos}')::INTEGER as pos_{pos.lower()}"
    for pos in position_keys
])

# Now build the main query with dynamic position columns
main_query = f"""
    SELECT
        slate_id as contest_id,
        (lineups ->> 'lineupHash') as lineup_hash,
        (lineups ->> 'lineupCt')::INTEGER as lineup_ct,
        (lineups ->> 'lineupUserCt')::INTEGER as lineup_user_ct,
        (lineups ->> 'points')::DOUBLE as points,
        (lineups ->> 'totalSalary')::INTEGER as total_salary,
        (lineups ->> 'totalOwn')::DOUBLE as total_own,
        (lineups ->> 'minOwn')::DOUBLE as min_own,
        (lineups ->> 'maxOwn')::DOUBLE as max_own,
        (lineups ->> 'avgOwn')::DOUBLE as avg_own,
        (lineups ->> 'lineupRank')::INTEGER as lineup_rank,
        (lineups ->> 'isCashing')::BOOLEAN as is_cashing,
        (lineups ->> 'favoriteCt')::INTEGER as favorite_ct,
        (lineups ->> 'underdogCt')::INTEGER as underdog_ct,
        (lineups ->> 'homeCt')::INTEGER as home_ct,
        (lineups ->> 'visitorCt')::INTEGER as visitor_ct,
        (lineups ->> 'payout')::DOUBLE as payout,
        (lineups ->> 'lineupPercentile')::DOUBLE as lineup_percentile,
        (lineups ->> 'correlatedPlayers')::INTEGER as correlated_players,

        -- Dynamic position columns
        {position_columns},

        -- Keep complex objects as JSON strings
        lineups -> 'teamStacks' as team_stacks,
        lineups -> 'gameStacks' as game_stacks,
        lineups -> 'lineupTrends' as lineup_trends,
        lineups -> 'entryNameList' as entry_name_list

    FROM read_json_auto('{staging_lineups_path}', maximum_object_size={MAX_OBJECT_SIZE})
"""

result_df = con.execute(main_query).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [24]:
# Preview the first 10 flattened lineup rows.
result_df.head(10)

Unnamed: 0,contest_id,lineup_hash,lineup_ct,lineup_user_ct,points,total_salary,total_own,min_own,max_own,avg_own,...,pos_cpt1,pos_flex1,pos_flex2,pos_flex3,pos_flex4,pos_flex5,team_stacks,game_stacks,lineup_trends,entry_name_list
0,182808315,119535:33213:6957:40377:33513:52089,8,8,166.91,49100,214.41,0.0,77.34,35.74,...,119535,33213,6957,40377,33513,52089,"{""2203"":[""119535:1"",""6957:0""],""1933"":[""33213:0...","{""7947310"":[119535,33213,6957,40377,33513,52089]}","{""qbPairedWithPassCatcher"":false,""qbStackPaire...","[""gfriedmann"",""mrcse13"",""IrishHighlandFarmer"",..."
1,182808315,6957:33213:119535:40377:33513:52089,7,7,165.64,49800,208.19,0.0,77.34,34.7,...,6957,33213,119535,40377,33513,52089,"{""2203"":[""6957:1"",""119535:0""],""1933"":[""33213:0...","{""7947310"":[6957,33213,119535,40377,33513,52089]}","{""qbPairedWithPassCatcher"":false,""qbStackPaire...","[""Krediarj2"",""dahladino"",""atlupo24"",""jbowden91..."
2,182808315,119535:6957:63509:40377:33513:52089,1,1,164.19,46900,165.71,0.0,57.01,27.62,...,119535,6957,63509,40377,33513,52089,"{""2203"":[""119535:1"",""6957:0""],""1933"":[""63509:0...","{""7947310"":[119535,6957,63509,40377,33513,52089]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""king.h""]"
3,182808315,119535:93290:6957:40377:33513:52089,31,31,163.51,49300,218.47,0.0,81.4,36.41,...,119535,93290,6957,40377,33513,52089,"{""2203"":[""119535:1"",""93290:0"",""6957:0""],""1933""...","{""7947310"":[119535,93290,6957,40377,33513,52089]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""Kenwalk"",""JustDraftMe"",""Comeonman64"",""vegask..."
4,182808315,119535:33213:63509:40377:33513:52089,8,8,162.53,47900,186.04,0.0,77.34,31.01,...,119535,33213,63509,40377,33513,52089,"{""2203"":null,""1933"":[""33213:0"",""63509:0"",""4037...","{""7947310"":[119535,33213,63509,40377,33513,520...","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""ddry"",""jsf900"",""Zeechamp"",""Sackreligious9"",""..."
5,182808315,6957:93290:119535:40377:33513:52089,22,22,162.24,50000,212.25,0.0,81.4,35.38,...,6957,93290,119535,40377,33513,52089,"{""2203"":[""6957:1"",""93290:0"",""119535:0""],""1933""...","{""7947310"":[6957,93290,119535,40377,33513,52089]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""bericbehm2"",""plpdig"",""phillyboy8150"",""MikeUd..."
6,182808315,119535:33213:6957:63509:33513:170588,7,7,161.09,49300,197.39,0.0,77.34,32.9,...,119535,33213,6957,63509,33513,170588,"{""2203"":[""119535:1"",""6957:0"",""170588:0""],""1933...","{""7947310"":[119535,33213,6957,63509,33513,1705...","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""Sackreligious9"",""Grex123"",""walkandel"",""lawye..."
7,182808315,33513:93290:6957:119535:40377:52089,2,2,161.06,47100,236.12,0.0,81.4,39.35,...,33513,93290,6957,119535,40377,52089,"{""2203"":[""93290:0"",""6957:0"",""119535:0""],""1933""...","{""7947310"":[33513,93290,6957,119535,40377,52089]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""Vernthecar"",""bakebomb""]"
8,182808315,33513:33213:6957:63509:119535:91547,5,5,160.84,47900,223.21,0.0,77.34,37.2,...,33513,33213,6957,63509,119535,91547,"{""2203"":[""6957:0"",""119535:0"",""91547:0""],""1933""...","{""7947310"":[33513,33213,6957,63509,119535,91547]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""Zeechamp"",""mau5pad"",""cantfademe"",""ndizzle77""..."
9,182808315,33513:33213:6957:63509:119535:4337,2,2,160.54,48900,219.89,0.0,77.34,36.65,...,33513,33213,6957,63509,119535,4337,"{""2203"":[""6957:0"",""119535:0""],""1933"":[""33513:1...","{""7947310"":[33513,33213,6957,63509,119535,4337]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""spida2375"",""jaxonfive""]"


In [25]:
# List the distinct contest ids present in the lineups result.
result_df['contest_id'].drop_duplicates()

0    182808315
Name: contest_id, dtype: int64