In [1]:
import duckdb
import os

# Get Wasabi credentials from environment
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Create DuckDB connection
con = duckdb.connect()

# Configure S3 settings
con.execute(f"""
    SET s3_endpoint='{wasabi_endpoint}';
    SET s3_access_key_id='{wasabi_access_key}';
    SET s3_secret_access_key='{wasabi_secret_key}';
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

print("✓ DuckDB configured with S3 credentials")
print(f"Endpoint: {wasabi_endpoint}")
print('Bucket Name: ', bucket_name, '\n\n')


✓ DuckDB configured with S3 credentials
Endpoint: s3.us-east-2.wasabisys.com
Bucket Name:  dfscrunch-data-lake 




In [2]:
sport = "NFL"
lineups_dds_path = f"s3://{bucket_name}/dds/{sport}/lineups/dk_single_game/*/data.parquet"

In [10]:
lineups_df = con.execute(
    f"""
    SELECT * FROM parquet_scan('{lineups_dds_path}') limit 10;
    """
).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [11]:
lineups_df

Unnamed: 0,contest_id,lineup_hash,lineup_ct,lineup_user_ct,points,total_salary,total_own,min_own,max_own,avg_own,...,pos_cpt1,pos_flex1,pos_flex2,pos_flex3,pos_flex4,pos_flex5,team_stacks,game_stacks,lineup_trends,entry_name_list
0,167757150,47519:46269:63612:122689:2952:8493,4,4,94.54,47400,159.61,0.0,60.69,26.60,...,47519,46269,63612,122689,2952,8493,"{""1965"":[""47519:1"",""122689:0"",""2952:0"",""8493:0...","{""6223120"":[47519,46269,63612,122689,2952,8493]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""oxenduck"",""EmileHeskey"",""hishboo"",""Mauk87""]"
1,167757150,47519:46269:63612:122689:2952:8493,4,4,94.54,47400,159.61,0.0,60.69,26.60,...,47519,46269,63612,122689,2952,8493,"{""1965"":[""47519:1"",""122689:0"",""2952:0"",""8493:0...","{""6223120"":[47519,46269,63612,122689,2952,8493]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""oxenduck"",""EmileHeskey"",""hishboo"",""Mauk87""]"
2,167757109,2952:146167:47519:46269:122689:8493,1,1,95.44,47500,239.69,0.0,81.78,39.95,...,2952,146167,47519,46269,122689,8493,"{""1965"":[""2952:1"",""47519:0"",""122689:0"",""8493:0...","{""6223120"":[2952,146167,47519,46269,122689,8493]}","{""qbPairedWithPassCatcher"":false,""qbStackPaire...","[""Lilsoupy""]"
3,167757109,47519:46269:63612:122689:2952:8493,2,2,94.54,47400,154.67,0.0,55.83,25.78,...,47519,46269,63612,122689,2952,8493,"{""1965"":[""47519:1"",""122689:0"",""2952:0"",""8493:0...","{""6223120"":[47519,46269,63612,122689,2952,8493]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""oxenduck"",""hishboo""]"
4,167757166,33516:61444:29304:6816:134955:33718,14,14,145.78,50000,194.40,0.0,58.59,32.40,...,33516,61444,29304,6816,134955,33718,"{""1965"":null,""1990"":null}","{""6223120"":null}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""oxenduck"",""justintime29"",""luckychuck"",""fours..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,166872314,146167:25839:58170:55222:8466:33456,1,1,119.25,48000,160.31,0.0,74.11,26.72,...,146167,25839,58170,55222,8466,33456,"{""2207"":[""58170:0"",""55222:0"",""8466:0""],""1990"":...","{""6212000"":[146167,25839,58170,55222,8466,33456]}","{""qbPairedWithPassCatcher"":false,""qbStackPaire...","[""SoyPapa""]"
301,166872314,58170:58465:146167:63612:8466:33456,2,2,119.81,49500,178.27,0.0,83.10,29.71,...,58170,58465,146167,63612,8466,33456,"{""2207"":[""58170:1"",""58465:0"",""8466:0""],""1990"":...","{""6212000"":[58170,58465,146167,63612,8466,33456]}","{""qbPairedWithPassCatcher"":false,""qbStackPaire...","[""bcplumbing"",""Flip2108""]"
302,166872315,146167:58465:58170:55222:61795:8466,61,61,119.11,49400,169.52,0.0,80.58,28.25,...,146167,58465,58170,55222,61795,8466,"{""2207"":[""58465:0"",""58170:0"",""55222:0"",""61795:...","{""6212000"":[146167,58465,58170,55222,61795,8466]}","{""qbPairedWithPassCatcher"":true,""qbStackPaired...","[""pgk8"",""JRucker504"",""heart.chakra"",""HChambers..."
303,166872315,146167:58465:58170:63612:8466:33456,4,4,120.31,49800,180.51,0.0,80.58,30.08,...,146167,58465,58170,63612,8466,33456,"{""2207"":[""58465:0"",""58170:0"",""8466:0""],""1990"":...","{""6212000"":[146167,58465,58170,63612,8466,33456]}","{""qbPairedWithPassCatcher"":false,""qbStackPaire...","[""Skreat"",""aznbeerman"",""DaSnipers"",""Schilly23""]"


In [12]:
contests_path = f"s3://{bucket_name}/dds/{sport}/contests/dk_single_game/*/data.parquet"

contest_id            305
lineup_hash           305
lineup_ct             305
lineup_user_ct        305
points                305
total_salary          305
total_own             305
min_own               305
max_own               305
avg_own               305
lineup_rank           305
is_cashing            305
favorite_ct           305
underdog_ct           305
home_ct               305
visitor_ct            305
payout                305
lineup_percentile     305
correlated_players    305
pos_cpt1              305
pos_flex1             305
pos_flex2             305
pos_flex3             305
pos_flex4             305
pos_flex5             305
team_stacks           305
game_stacks           305
lineup_trends         305
entry_name_list       305
dtype: int64

In [25]:
contests_df = con.execute(
    f"""
        SELECT
            contest_id,
            ceil(contest_size * 0.01) as top_1_percent_max_position,
         FROM parquet_scan('{contests_path}') limit 10;
    """
).df()

In [26]:
contests_df

Unnamed: 0,contest_id,top_1_percent_max_position
0,164284864,1.0
1,164284865,2.0
2,164284866,1.0
3,164284880,3.0
4,164284868,26.0
5,164284869,53.0
6,164284870,1961.0
7,166617480,118.0
8,164284862,50.0
9,164284893,298.0


In [18]:
players_path = f"s3://{bucket_name}/dds/{sport}/players/dk_single_game/*/data.parquet"
top_lineups_df = con.execute(
    f"""
        WITH
            lineups AS (
                SELECT
                    replace(list_element(string_split(filename, '/'), -2), '-', '')::integer as date_id,
                    contest_id,
                    lineup_rank,
                    total_own,
                    total_salary,
                    pos_cpt1,
                    pos_flex1,
                    pos_flex2,
                    pos_flex3,
                    pos_flex4,
                    pos_flex5
                FROM parquet_scan('{lineups_dds_path}')
                WHERE lineup_percentile <= 0.01
            ),
            -- Unpivot lineup positions to enable single join
            lineup_positions AS (
                SELECT
                    date_id,
                    contest_id,
                    lineup_rank,
                    total_own,
                    total_salary,
                    unnest(['cpt1', 'flex1', 'flex2', 'flex3', 'flex4', 'flex5']) as pos_slot,
                    unnest([pos_cpt1, pos_flex1, pos_flex2, pos_flex3, pos_flex4, pos_flex5]) as player_id
                FROM lineups
            ),
            -- Pre-filter to only load players that appear in top lineups
            needed_players AS (
                SELECT DISTINCT date_id, player_id
                FROM lineup_positions
            ),
            -- Load only the players we need, with team and salary info
            players AS (
                SELECT
                    replace(list_element(string_split(filename, '/'), -2), '-', '')::integer as date_id,
                    p.player_id,
                    p.position,
                    p.current_team,
                    p.salary,
                    p.roster_position
                FROM parquet_scan('{players_path}') p
                WHERE (replace(list_element(string_split(filename, '/'), -2), '-', '')::integer, p.player_id) IN 
                      (SELECT date_id, player_id FROM needed_players)
            )
            -- Single join with roster position matching and pivot back to wide format
            SELECT
                contest_id,
                lineup_rank,
                max(total_own) as total_own,
                max(total_salary) as total_salary,
                max(CASE WHEN pos_slot = 'cpt1' THEN position END) as pos_cpt1_real,
                max(CASE WHEN pos_slot = 'cpt1' THEN current_team END) as pos_cpt1_team,
                max(CASE WHEN pos_slot = 'cpt1' THEN salary END) as pos_cpt1_salary,
                max(CASE WHEN pos_slot = 'flex1' THEN position END) as pos_flex1_real,
                max(CASE WHEN pos_slot = 'flex1' THEN current_team END) as pos_flex1_team,
                max(CASE WHEN pos_slot = 'flex1' THEN salary END) as pos_flex1_salary,
                max(CASE WHEN pos_slot = 'flex2' THEN position END) as pos_flex2_real,
                max(CASE WHEN pos_slot = 'flex2' THEN current_team END) as pos_flex2_team,
                max(CASE WHEN pos_slot = 'flex2' THEN salary END) as pos_flex2_salary,
                max(CASE WHEN pos_slot = 'flex3' THEN position END) as pos_flex3_real,
                max(CASE WHEN pos_slot = 'flex3' THEN current_team END) as pos_flex3_team,
                max(CASE WHEN pos_slot = 'flex3' THEN salary END) as pos_flex3_salary,
                max(CASE WHEN pos_slot = 'flex4' THEN position END) as pos_flex4_real,
                max(CASE WHEN pos_slot = 'flex4' THEN current_team END) as pos_flex4_team,
                max(CASE WHEN pos_slot = 'flex4' THEN salary END) as pos_flex4_salary,
                max(CASE WHEN pos_slot = 'flex5' THEN position END) as pos_flex5_real,
                max(CASE WHEN pos_slot = 'flex5' THEN current_team END) as pos_flex5_team,
                max(CASE WHEN pos_slot = 'flex5' THEN salary END) as pos_flex5_salary
            FROM lineup_positions lp
                JOIN players p ON lp.player_id = p.player_id 
                    AND lp.date_id = p.date_id
                    AND ((lp.pos_slot = 'cpt1' AND p.roster_position = 'CPT') 
                        OR (lp.pos_slot != 'cpt1' AND p.roster_position != 'CPT'))
            GROUP BY contest_id, lineup_rank
    """
).df()

In [19]:
top_lineups_df

Unnamed: 0,contest_id,lineup_rank,total_own,total_salary,pos_cpt1_real,pos_cpt1_team,pos_cpt1_salary,pos_flex1_real,pos_flex1_team,pos_flex1_salary,...,pos_flex2_salary,pos_flex3_real,pos_flex3_team,pos_flex3_salary,pos_flex4_real,pos_flex4_team,pos_flex4_salary,pos_flex5_real,pos_flex5_team,pos_flex5_salary
0,167604243,1,201.28,48600,WR,NYG,16200,WR,DAL,11800,...,6400,K,DAL,5000,WR,NYG,4800,K,NYG,4400
1,167604256,27,184.41,48600,WR,NYG,16200,WR,DAL,11800,...,6400,K,DAL,5000,WR,NYG,4800,K,NYG,4400
2,164284870,14,255.74,49100,WR,KC,8700,QB,BAL,11400,...,10400,RB,KC,8400,WR,KC,7600,TE,BAL,2600
3,181656897,1,212.07,49500,WR,BUF,9900,QB,BUF,11000,...,10200,WR,BAL,8200,TE,BUF,5600,K,BAL,4600
4,181976084,11,132.37,48000,RB,ATL,18000,WR,MIN,11800,...,7000,K,ATL,4600,D,ATL,3600,RB,ATL,3000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,166661327,17,252.26,39200,WR,NYJ,3000,WR,NYJ,9800,...,9600,QB,NYJ,9000,K,SF,4800,RB,SF,3000
285,166661329,17,176.74,39600,WR,SF,14400,RB,NYJ,10200,...,5200,K,SF,4800,RB,SF,3000,WR,NYJ,2000
286,166661329,5,222.32,40100,RB,SF,4500,RB,NYJ,10200,...,9600,QB,NYJ,9000,K,SF,4800,WR,NYJ,2000
287,166661330,7,262.97,41200,K,SF,7200,RB,NYJ,10200,...,9600,QB,SF,9200,RB,SF,3000,WR,NYJ,2000


In [17]:
players_path = f"s3://{bucket_name}/dds/{sport}/players/dk_single_game/*/data.parquet"
players_df = con.execute(
    f"""
        select * from parquet_scan('{players_path}');
    """
).df()

Unnamed: 0,contest_id,lineup_rank,total_own,total_salary,pos_cpt1_real,pos_flex1_real,pos_flex2_real,pos_flex3_real,pos_flex4_real,pos_flex5_real
0,167598784,1,140.79,50000,QB,RB,TE,WR,D,RB
1,181656897,1,212.07,49500,WR,QB,RB,WR,TE,K
2,167604249,2,251.60,49700,K,WR,WR,QB,WR,K
3,167604243,1,201.28,48600,WR,WR,RB,K,WR,K
4,167604256,27,184.41,48600,WR,WR,RB,K,WR,K
...,...,...,...,...,...,...,...,...,...,...
732,166661329,5,222.32,40100,RB,RB,WR,QB,K,WR
733,166661330,86,171.81,31300,RB,RB,QB,K,WR,FB
734,166661330,48,261.61,39200,WR,RB,QB,QB,K,RB
735,166661330,7,262.97,41200,K,RB,WR,QB,RB,WR


In [34]:
players_df

Unnamed: 0,player_key,player_id,first_name,last_name,full_name,salary,position,roster_position,current_team,current_team_id,event_id,event_team_id,home_visitor,fav_dog,proj_points,ownership,actual_points,stat_details,made_cut
0,265928:1,265928,Izaiah,Gathings,Izaiah Gathings,300,WR,CPT,,2102,6209760,62097602,Home,Favorite,0.00,0.00,0.0,,0
1,170960:0,170960,Xavier,Worthy,Xavier Worthy,5800,WR,WR,KC,2102,6209760,62097602,Home,Favorite,10.91,39.03,20.8,"1 RuTD, 1 RecTD, 21 RuYds, 47 RecYds, 2 Rec,",0
2,167855:0,167855,Carson,Steele,Carson Steele,200,FB,RB,KC,2102,6209760,62097602,Home,Favorite,2.16,5.12,0.3,"3 RuYds,",0
3,148325:1,148325,Rasheen,Ali,Rasheen Ali,300,RB,CPT,BAL,2170,6209760,62097601,Visitor,Dog,0.00,0.00,0.0,,0
4,148325:0,148325,Rasheen,Ali,Rasheen Ali,200,RB,RB,BAL,2170,6209760,62097601,Visitor,Dog,0.00,0.03,0.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10710,40643:0,40643,Mason,Rudolph,Mason Rudolph,6000,QB,QB,PIT,2198,7943380,79433802,Home,Dog,0.00,0.00,0.0,,0
10711,49557:1,49557,Jake,Ferguson,Jake Ferguson,12600,TE,CPT,DAL,2237,7949260,79492602,Home,Dog,29.73,0.00,25.5,"1 RecTD, 40 RecYds, 7 Rec,",0
10712,45261:0,45261,Josh,Whyle,Josh Whyle,200,TE,TE,GB,2062,7949260,79492601,Visitor,Favorite,0.00,0.00,0.0,,0
10713,8471:0,8471,Dallas,Defense,Dallas Defense,3400,D,D,DAL,2237,7949260,79492602,Home,Dog,5.14,5.04,1.0,"1 SACK, 1 DFR, 1 BLK, 35+ Points Allowed",0
