In [1]:
import duckdb

import os

# Get Wasabi credentials from environment
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Fail fast on missing configuration: os.getenv returns None for an unset
# variable, and interpolating None into the SET statements below would
# configure the literal string 'None' as a credential.
_required_settings = {
    'WASABI_ACCESS_KEY': wasabi_access_key,
    'WASABI_SECRET_KEY': wasabi_secret_key,
    'WASABI_BUCKET_NAME': bucket_name,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )


def _sql_literal(value):
    """Return `value` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _mask(value, visible=4):
    """Return `value` with all but its last `visible` characters replaced by '*'."""
    text = str(value)
    if len(text) <= visible:
        return '*' * len(text)
    return '*' * (len(text) - visible) + text[-visible:]


# Create DuckDB connection
con = duckdb.connect()

# Configure S3 settings. Values go through _sql_literal so a quote character in
# an environment variable cannot terminate the literal early.
con.execute(f"""
    SET s3_endpoint={_sql_literal(wasabi_endpoint)};
    SET s3_access_key_id={_sql_literal(wasabi_access_key)};
    SET s3_secret_access_key={_sql_literal(wasabi_secret_key)};
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

# Cell output is saved with the notebook, so never echo the secret and only
# show the tail of the access key ID.
print("✓ DuckDB configured with S3 credentials")
print(f"Endpoint: {wasabi_endpoint}")
print(f"Access Key ID: {_mask(wasabi_access_key)}")
print("Secret Access Key: [hidden]")
print('Bucket Name: ', bucket_name, '\n\n')


✓ DuckDB configured with S3 credentials
Endpoint: s3.us-east-2.wasabisys.com
Access Key ID: [REDACTED]
Secret Access Key: [REDACTED]
Bucket Name:  dfscrunch-data-lake 




In [2]:
# Run parameters: these values are substituted into the data-lake paths below.
SPORT = "NFL"  # sport path segment, e.g. "NFL", "NBA"
DATE = "2025-10-12"  # slate date, formatted YYYY-MM-DD
GAME_TYPE = "dk_classic"  # game-type path segment, e.g. "dk_classic", "dk_single_game"

# Maximum JSON object size: 64 MB expressed in bytes (the default is 16 MB).
MAX_OBJECT_SIZE = 64 * 1024 * 1024

# Glob over every sub-directory under the game type; each one holds a data.parquet.
lineups_dds_path = (
    f"s3://{bucket_name}/dds/{SPORT}/lineups/{GAME_TYPE}"
    "/*/data.parquet"
)


In [3]:
# Load every column for the lineups whose lineup_percentile is at most 0.01.
# union_by_name=true combines the files matched by the glob by column name.
top_lineups_query = f"""
select * from parquet_scan('{lineups_dds_path}', union_by_name=true) where lineup_percentile <= 0.01;
"""
lineups_df = con.execute(top_lineups_query).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [4]:
# Preview the first five rows of the loaded lineups.
lineups_df.head(n=5)

Unnamed: 0,contest_id,lineup_hash,lineup_ct,lineup_user_ct,points,total_salary,total_own,min_own,max_own,avg_own,...,pos_rb2,pos_te1,pos_wr1,pos_wr2,pos_wr3,team_stacks,game_stacks,lineup_trends,entry_name_list,pos_flex1
0,150462218,75120:63509:93719:8709:26343:25839:63612:8721,1,1,120.04,46200,259.92,4.54,65.28,32.49,...,8709.0,26343.0,25839.0,63612.0,8721.0,"{""1990"": [""75120:0"", ""8709:0"", ""25839:0"", ""636...","{""5658490"": [75120, 63509, 93719, 8709, 26343,...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""lanean11""]",75120.0
1,150462218,8479:8721:63575:93719:8709:26343:25839:33513,1,1,118.56,45200,273.07,4.54,65.28,34.13,...,8709.0,26343.0,25839.0,33513.0,,"{""1990"": [""8479:0"", ""63575:0"", ""8709:0"", ""2583...","{""5658490"": [8479, 8721, 63575, 93719, 8709, 2...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""quadnemic""]",8721.0
2,150466655,8487:55115:33296:69339:63517:33707:8590:47332:...,1,1,140.72,49000,292.53,2.16,81.64,32.5,...,63517.0,33707.0,8590.0,47332.0,93289.0,"{""2198"": [""8487:0"", ""93289:0""], ""2087"": [""5511...","{""5658510"": [8487, 55115, 33296, 69339, 63517,...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""xrxk030""]",55115.0
3,150466655,8482:8590:33296:63517:136380:33269:7205:62699:...,1,1,141.4,43600,209.52,2.16,37.14,23.28,...,136380.0,33269.0,7205.0,62699.0,93289.0,"{""2026"": [""8482:0"", ""62699:0""], ""2087"": [""8590...","{""5658500"": [8482, 7205, 62699], ""5658510"": [8...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""MasterLaster""]",8590.0
4,147325139,8494:33513:139382:123168:33090:75275:25839:756...,2,2,217.72,49900,126.87,2.39,36.77,14.1,...,33090.0,75275.0,25839.0,75693.0,56170.0,"{""1928"": [""139382:0"", ""56170:0""]}","{""5658260"": [139382, 75693, 56170], ""5658160"":...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""REDCOAT85"", ""hishboo""]",33513.0


In [6]:
# List the column labels of the loaded lineups frame.
lineups_df.columns

Index(['contest_id', 'lineup_hash', 'lineup_ct', 'lineup_user_ct', 'points',
       'total_salary', 'total_own', 'min_own', 'max_own', 'avg_own',
       'lineup_rank', 'is_cashing', 'favorite_ct', 'underdog_ct', 'home_ct',
       'visitor_ct', 'payout', 'lineup_percentile', 'correlated_players',
       'pos_dst1', 'pos_qb1', 'pos_rb1', 'pos_rb2', 'pos_te1', 'pos_wr1',
       'pos_wr2', 'pos_wr3', 'team_stacks', 'game_stacks', 'lineup_trends',
       'entry_name_list', 'pos_flex1'],
      dtype='object')

In [7]:
# Show the first twenty rows (positional slice from the top of the frame).
lineups_df.head(20)

Unnamed: 0,contest_id,lineup_hash,lineup_ct,lineup_user_ct,points,total_salary,total_own,min_own,max_own,avg_own,...,pos_rb2,pos_te1,pos_wr1,pos_wr2,pos_wr3,team_stacks,game_stacks,lineup_trends,entry_name_list,pos_flex1
0,150462218,75120:63509:93719:8709:26343:25839:63612:8721,1,1,120.04,46200,259.92,4.54,65.28,32.49,...,8709.0,26343.0,25839.0,63612.0,8721.0,"{""1990"": [""75120:0"", ""8709:0"", ""25839:0"", ""636...","{""5658490"": [75120, 63509, 93719, 8709, 26343,...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""lanean11""]",75120.0
1,150462218,8479:8721:63575:93719:8709:26343:25839:33513,1,1,118.56,45200,273.07,4.54,65.28,34.13,...,8709.0,26343.0,25839.0,33513.0,,"{""1990"": [""8479:0"", ""63575:0"", ""8709:0"", ""2583...","{""5658490"": [8479, 8721, 63575, 93719, 8709, 2...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""quadnemic""]",8721.0
2,150466655,8487:55115:33296:69339:63517:33707:8590:47332:...,1,1,140.72,49000,292.53,2.16,81.64,32.5,...,63517.0,33707.0,8590.0,47332.0,93289.0,"{""2198"": [""8487:0"", ""93289:0""], ""2087"": [""5511...","{""5658510"": [8487, 55115, 33296, 69339, 63517,...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""xrxk030""]",55115.0
3,150466655,8482:8590:33296:63517:136380:33269:7205:62699:...,1,1,141.4,43600,209.52,2.16,37.14,23.28,...,136380.0,33269.0,7205.0,62699.0,93289.0,"{""2026"": [""8482:0"", ""62699:0""], ""2087"": [""8590...","{""5658500"": [8482, 7205, 62699], ""5658510"": [8...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""MasterLaster""]",8590.0
4,147325139,8494:33513:139382:123168:33090:75275:25839:756...,2,2,217.72,49900,126.87,2.39,36.77,14.1,...,33090.0,75275.0,25839.0,75693.0,56170.0,"{""1928"": [""139382:0"", ""56170:0""]}","{""5658260"": [139382, 75693, 56170], ""5658160"":...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""REDCOAT85"", ""hishboo""]",33513.0
5,147325139,8494:61306:63575:33781:63567:75275:64507:25839...,1,1,214.34,50000,141.22,3.54,36.77,15.69,...,63567.0,75275.0,64507.0,25839.0,136555.0,"{""2021"": [""8494:0"", ""63567:0""], ""1990"": [""6357...","{""5658320"": [63575, 33781, 25839]}","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""Bdog98z""]",61306.0
6,147325139,8474:33090:49745:33781:33399:131320:25839:7569...,1,1,213.0,50000,113.15,0.49,30.13,12.57,...,33399.0,131320.0,25839.0,75693.0,53528.0,"{""2062"": [""8474:0"", ""33090:0"", ""49745:0"", ""131...","{""5658320"": [33781, 25839]}","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""bgott1""]",33090.0
7,147325076,8494:33513:139382:123168:33090:75275:25839:756...,1,1,217.72,49900,124.42,2.52,36.43,13.82,...,33090.0,75275.0,25839.0,75693.0,56170.0,"{""1928"": [""139382:0"", ""56170:0""]}","{""5658260"": [139382, 75693, 56170], ""5658160"":...","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""REDCOAT85""]",33513.0
8,147325076,8494:25839:50547:33781:33090:75275:75693:40663...,1,1,209.64,49900,144.98,3.54,36.43,16.11,...,33090.0,75275.0,75693.0,40663.0,61306.0,"{""2097"": [""50547:0"", ""75693:0""]}","{""5658320"": [25839, 33781]}","{""qbPairedWithPassCatcher"": true, ""qbStackPair...","[""beatuphat""]",25839.0
9,147325058,8494:25839:63756:33781:63567:131320:47332:4786...,1,1,206.22,49700,209.4,3.76,74.16,23.27,...,63567.0,131320.0,47332.0,47863.0,136555.0,"{""2021"": [""8494:0"", ""63567:0""], ""2202"": [""6375...","{""5658320"": [25839, 63756, 33781]}","{""qbPairedWithPassCatcher"": false, ""qbStackPai...","[""bevsyla_13""]",25839.0


In [9]:
# Per-column count of rows that have no missing value in any column.
complete_lineups = lineups_df.dropna(axis=0, how="any")
complete_lineups.count()

contest_id            6068
lineup_hash           6068
lineup_ct             6068
lineup_user_ct        6068
points                6068
total_salary          6068
total_own             6068
min_own               6068
max_own               6068
avg_own               6068
lineup_rank           6068
is_cashing            6068
favorite_ct           6068
underdog_ct           6068
home_ct               6068
visitor_ct            6068
payout                6068
lineup_percentile     6068
correlated_players    6068
pos_dst1              6068
pos_qb1               6068
pos_rb1               6068
pos_rb2               6068
pos_te1               6068
pos_wr1               6068
pos_wr2               6068
pos_wr3               6068
team_stacks           6068
game_stacks           6068
lineup_trends         6068
entry_name_list       6068
pos_flex1             6068
dtype: int64

In [8]:
# Top-lineup projection with a date_id derived from each file's parent directory
# name (the path segment before data.parquet, with dashes removed).
#
# con.sql is used instead of con.execute: execute() returns the connection
# itself, so the variable would alias `con` and any later query on the
# connection would change what `tmp.df()` returns. con.sql returns a lazy
# relation bound to this query, which still exposes .df() and can be
# materialised more than once.
tmp = con.sql(
    f"""
    SELECT
        replace(list_element(string_split(filename, '/'), -2), '-', '')::integer AS date_id,
        contest_id,
        lineup_rank,
        total_own,
        total_salary,
        pos_dst1,
        pos_flex1,
        pos_qb1,
        pos_rb1,
        pos_rb2,
        pos_te1,
        pos_wr1,
        pos_wr2,
        pos_wr3
    FROM read_parquet('{lineups_dds_path}', filename=true)
    WHERE lineup_percentile <= 0.01
    """
)


In [9]:
# Materialise the query result as a DataFrame and preview its first five rows.
tmp.df().head(n=5)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,date_id,contest_id,lineup_rank,total_own,total_salary,pos_dst1,pos_flex1,pos_qb1,pos_rb1,pos_rb2,pos_te1,pos_wr1,pos_wr2,pos_wr3
0,20251012,183090259,1,135.5,50000,8485,141410,116811,63616,53384,138593,134969,93289,169826
1,20251012,183090259,2,137.63,50000,8485,137548,116811,63616,53384,41195,93289,239536,141410
2,20251012,183090259,3,137.72,50000,8490,63616,26395,135400,53384,41195,93289,239536,141410
3,20251012,183090259,4,95.07,49600,8491,134969,167238,146167,135400,138618,93289,47332,145436
4,20251012,183090259,5,141.23,49800,8491,61444,63756,63616,53384,138618,134969,141410,33513


In [None]:
# Players file for the configured sport, game type, and date.
players_dds_path = f"s3://{bucket_name}/dds/{SPORT}/players/{GAME_TYPE}/{DATE}/data.parquet"

# Read the whole file into a DataFrame.
players_query = f"select * from parquet_scan('{players_dds_path}');"
players_df = con.execute(players_query).df()

In [None]:
# Preview the first five player rows.
players_df.head(n=5)

In [None]:
# Roster-slot suffixes; each one names a pos_<slot> column of the lineups files.
# These are the slots present in the dk_classic lineups schema.
# NOTE(review): other game types may use different slots — adjust this list if
# GAME_TYPE changes.
lineup_slots = ["dst1", "flex1", "qb1", "rb1", "rb2", "te1", "wr1", "wr2", "wr3"]
slot_columns_sql = ", ".join(f"pos_{slot}" for slot in lineup_slots)
slot_labels_sql = ", ".join(f"'{slot}'" for slot in lineup_slots)

# Glob over every date directory of the players data so that date_id can be
# derived from the file path, the same way it is for lineups.
# NOTE(review): assumes players are laid out as .../players/<game_type>/<date>/data.parquet
# for every date that has lineups — confirm against the bucket.
players_path = f"s3://{bucket_name}/dds/{SPORT}/players/{GAME_TYPE}/*/data.parquet"

con.execute(
    f"""
    WITH lineups AS (
        -- Top lineups, tagged with the date taken from the parent directory name.
        SELECT
            replace(list_element(string_split(filename, '/'), -2), '-', '')::integer AS date_id,
            contest_id,
            lineup_rank,
            total_own,
            total_salary,
            {slot_columns_sql}
        FROM read_parquet('{lineups_dds_path}', filename=true)
        WHERE lineup_percentile <= 0.03
    ),
    lineup_positions AS (
        -- One row per (lineup, roster slot); the two unnest calls advance together.
        SELECT
            date_id,
            contest_id,
            lineup_rank,
            total_own,
            total_salary,
            unnest([{slot_labels_sql}]) AS pos_slot,
            unnest([{slot_columns_sql}]) AS player_id
        FROM lineups
    ),
    -- Pre-filter to only load players that appear in top lineups
    needed_players AS (
        SELECT DISTINCT date_id, player_id
        FROM lineup_positions
        WHERE player_id IS NOT NULL
    ),
    all_players AS (
        SELECT
            replace(list_element(string_split(filename, '/'), -2), '-', '')::integer AS date_id,
            player_id,
            position,
            current_team,
            salary,
            roster_position
        FROM read_parquet('{players_path}', filename=true)
    ),
    -- Load only the players we need, with team and salary info
    players AS (
        SELECT ap.*
        FROM all_players ap
        INNER JOIN needed_players np
            ON np.date_id = ap.date_id
           AND np.player_id = ap.player_id
    )
    SELECT
        lp.date_id,
        lp.contest_id,
        lp.lineup_rank,
        lp.total_own,
        lp.total_salary,
        lp.pos_slot,
        lp.player_id,
        p.position,
        p.current_team,
        p.salary,
        p.roster_position
    FROM lineup_positions lp
    LEFT JOIN players p
        ON p.date_id = lp.date_id
       AND p.player_id = lp.player_id
    """
).df().head()

In [11]:
# Location of the top-lineups mart for the configured sport and game type.
mart_path = f"s3://{bucket_name}/marts/top_lineups/{SPORT}/{GAME_TYPE}/data.parquet"

# con.sql returns a lazy relation bound to this query. con.execute would return
# the connection itself, so `df` would alias `con` and later queries on the
# connection would change what `df.df()` returns. The relation still exposes .df().
df = con.sql(f"select * from parquet_scan('{mart_path}')")

In [12]:
# Materialise the mart query as a DataFrame and preview its first five rows.
df.df().head(n=5)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,contest_id,lineup_rank,total_own,total_salary,pos_dst1_player_id,pos_dst1_real,pos_dst1_team,pos_dst1_salary,pos_flex1_player_id,pos_flex1_real,...,pos_wr1_team,pos_wr1_salary,pos_wr2_player_id,pos_wr2_real,pos_wr2_team,pos_wr2_salary,pos_wr3_player_id,pos_wr3_real,pos_wr3_team,pos_wr3_salary
0,156218281,12,130.4,50000,8484.0,D,NYJ,3100,68220.0,RB,...,HOU,6800,8590.0,WR,CLE,6400,75693.0,WR,JAX,6300
1,156218281,20,148.07,50000,8492.0,D,TB,2600,41195.0,TE,...,CLE,6400,75693.0,WR,JAX,6300,33207.0,WR,TB,6200
2,166821090,16,100.84,50000,8493.0,D,TEN,2700,33207.0,WR,...,DAL,8800,165467.0,WR,NYG,5900,62699.0,WR,NO,4600
3,166821090,11,100.81,49900,8474.0,D,GB,2600,33114.0,RB,...,MIN,8300,53421.0,WR,SF,6800,50284.0,WR,MIN,3900
4,166821090,26,153.3,50000,8488.0,D,LAC,3400,33114.0,RB,...,TB,7500,165467.0,WR,NYG,5900,134955.0,WR,DET,5300
