In [2]:
import duckdb
import os

# Read the Wasabi (S3-compatible) connection settings from the environment.
# Only the endpoint has a default; credentials and bucket must be provided.
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Fail fast: an unset variable would otherwise be interpolated below as the
# literal text 'None' and only surface later as an opaque S3 error.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ('WASABI_ACCESS_KEY', wasabi_access_key),
        ('WASABI_SECRET_KEY', wasabi_secret_key),
        ('WASABI_BUCKET_NAME', bucket_name),
    )
    if not env_value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )


def _sql_quote(value):
    """Return ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so the value cannot terminate the
    literal early when spliced into the SET statements below.
    """
    return "'" + str(value).replace("'", "''") + "'"


def _mask(value, visible=0):
    """Return a display-safe stand-in for a credential.

    At most the last ``visible`` characters are shown, and only when the value
    is longer than that; everything else is replaced by a fixed-width mask so
    the real length is not revealed either.
    """
    if visible > 0 and len(value) > visible:
        return '********' + value[-visible:]
    return '********'


# Create DuckDB connection (in-memory database)
con = duckdb.connect()

# Configure S3 settings on the connection
con.execute(f"""
    SET s3_endpoint={_sql_quote(wasabi_endpoint)};
    SET s3_access_key_id={_sql_quote(wasabi_access_key)};
    SET s3_secret_access_key={_sql_quote(wasabi_secret_key)};
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

# Never echo raw credentials: cell output is saved inside the notebook file
# and is shared or committed along with it.
print("✓ DuckDB configured with S3 credentials")
print(f"Endpoint: {wasabi_endpoint}")
print(f"Access Key ID: {_mask(wasabi_access_key, visible=4)}")
print(f"Secret Access Key: {_mask(wasabi_secret_key)}")
print('Bucket Name: ', bucket_name, '\n\n')

✓ DuckDB configured with S3 credentials
Endpoint: s3.us-east-2.wasabisys.com
Access Key ID: [REDACTED]
Secret Access Key: [REDACTED]
Bucket Name:  dfscrunch-data-lake 




In [13]:
# Load a single day's staged contest listing (gzipped JSON) and flatten the
# `live_contests` array into one typed row per contest.
sport = 'NFL'
# NOTE(review): despite its name, this variable holds the *staging contests*
# JSON path, not a users path.
dds_users_path = "s3://{}/staging/{}/contests/dk_classic/2025-09-07/data.json.gz".format(
    bucket_name, sport
)

# The inner query unnests `live_contests`; the outer query extracts each JSON
# field as text (->>) and casts it to its target type.
_staging_contests_sql = f"""
      select
            (contest_element ->> 'contest_id')::INTEGER as contest_id,
            contest_element ->> 'contest_name' as contest_name,
            (contest_element ->> 'contest_size')::INTEGER as contest_size,
            (contest_element ->> 'entry_cost')::INTEGER as entry_cost,
            (contest_element ->> 'total_prizes')::INTEGER as total_prizes,
            (contest_element ->> 'multi_entry_max')::INTEGER as multi_entry_max,
            (contest_element ->> 'is_largest_by_size')::BOOLEAN as is_largest_by_size,
            (contest_element ->> 'is_primary')::BOOLEAN as is_primary,
            contest_element ->> 'tour' as tour,
            (contest_element ->> 'source_id')::INTEGER as source_id,
            (contest_element ->> 'sport_event_id')::INTEGER as sport_event_id,
            (contest_element ->> 'contest_group_id')::INTEGER as contest_group_id,
            (contest_element ->> 'cash_line')::INTEGER as cash_line,
            (contest_element ->> 'date_id')::INTEGER as date_id
      from (
        select
            unnest(live_contests) as contest_element
        from read_json_auto('{dds_users_path}')
        );
    """
_staging_contests_result = con.execute(_staging_contests_sql)
df = _staging_contests_result.df()


In [14]:
# Preview the first five flattened contest rows.
df.head(n=5)

Unnamed: 0,contest_id,contest_name,contest_size,entry_cost,total_prizes,multi_entry_max,is_largest_by_size,is_primary,tour,source_id,sport_event_id,contest_group_id,cash_line,date_id
0,179164193,NFL $150K Thunderdome [Single Entry],30,5300,150000,1,False,False,,4,0,124693,7,20250907
1,179164209,NFL $3.5M MEGA Millionaire [$1M to 1st],875,4444,3500000,26,True,True,,4,0,124693,164,20250907
2,179164251,"NFL $400K Game Changer [$100K to 1st, Single E...",282,1500,400000,1,False,False,,4,0,124693,62,20250907
3,179164239,"NFL $200K Facemask [3 Entry Max, $50K to 1st]",400,555,200000,3,False,False,,4,0,124693,88,20250907
4,179164212,NFL $750K Wildcat [$200K to 1st],2502,333,750000,75,False,False,,4,0,124693,524,20250907


In [20]:
# Read every contests parquet file under dds/<sport>/contests/ into one frame.
# `slate_type` is derived from the file path: it is the third path segment
# counting from the end of `filename`.
sport = 'NFL'
_all_contests_sql = f"""
      SELECT
          list_element(string_split(filename, '/'), -3) as slate_type,
          contest_id,
          contest_name,
          contest_size,
          entry_cost,
          total_prizes,
          multi_entry_max,
          is_largest_by_size,
          is_primary,
          tour,
          source_id,
          sport_event_id,
          contest_group_id,
          cash_line,
          date_id
      FROM read_parquet('s3://{bucket_name}/dds/{sport}/contests/*/*/data.parquet')
    """
_all_contests_result = con.execute(_all_contests_sql)
contest_df = _all_contests_result.df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [21]:
# Distinct contest sizes present across all loaded contests.
{size for size in contest_df['contest_size']}


{8,
 10,
 12,
 16,
 30,
 36,
 43,
 53,
 67,
 70,
 100,
 134,
 138,
 167,
 170,
 175,
 200,
 234,
 247,
 250,
 259,
 282,
 300,
 318,
 337,
 350,
 375,
 400,
 500,
 634,
 667,
 687,
 750,
 875,
 1000,
 1001,
 1111,
 1501,
 1668,
 2002,
 2335,
 2502,
 3137,
 3239,
 3529,
 3533,
 3703,
 3787,
 3921,
 4170,
 4319,
 4444,
 4545,
 4597,
 4705,
 4901,
 5005,
 5300,
 5505,
 5681,
 5747,
 5882,
 6006,
 7067,
 7352,
 7843,
 8823,
 9803,
 11764,
 11890,
 13071,
 13793,
 14135,
 14705,
 15686,
 16646,
 17156,
 17669,
 17835,
 19024,
 19607,
 19817,
 23529,
 23781,
 26143,
 27777,
 29726,
 31372,
 32679,
 35671,
 39215,
 39635,
 47058,
 47562,
 58823,
 59453,
 71343,
 79270,
 87145,
 88235,
 89179,
 98039,
 99088,
 117647,
 118906,
 132352,
 138723,
 142687,
 148632,
 158541,
 161764,
 176470,
 178359,
 190249,
 191176,
 196078,
 198176,
 205882,
 208085,
 237812,
 297265,
 317082,
 352313,
 356718,
 396353,
 832342}

In [24]:
# Split all contests into five equal-count buckets ordered by contest_size and
# summarise each bucket (row count, min / max / rounded average size).
sport = 'NFL'
dds_contests_path = "s3://{}/dds/{}/contests/*/*/data.parquet".format(bucket_name, sport)

# NOTE(review): NTILE assigns buckets by row position, so contests with the
# same contest_size can land in adjacent buckets (the displayed result shows
# one bucket's max_size equal to the next bucket's min_size) — confirm this is
# acceptable before using the boundaries as cut-offs.
_size_bucket_sql = f"""
    WITH contests_with_buckets AS (
      SELECT
          list_element(string_split(filename, '/'), -3) as slate_type,
          contest_id,
          contest_name,
          contest_size,
          NTILE(5) OVER (ORDER BY contest_size) as size_bucket
      FROM read_parquet('{dds_contests_path}')
  )
  SELECT
      size_bucket,
      COUNT(*) as contest_count,
      MIN(contest_size) as min_size,
      MAX(contest_size) as max_size,
      ROUND(AVG(contest_size), 0) as avg_size
  FROM contests_with_buckets
  GROUP BY size_bucket
  ORDER BY size_bucket
    """
_size_bucket_result = con.execute(_size_bucket_sql)
contests_grouped = _size_bucket_result.df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [26]:
# Display the per-bucket summary (contest_count, min_size, max_size, avg_size).
contests_grouped

Unnamed: 0,size_bucket,contest_count,min_size,max_size,avg_size
0,1,221,8,318,142.0
1,2,221,318,4444,1868.0
2,3,221,4444,16646,8704.0
3,4,221,17156,79270,37842.0
4,5,221,79270,832342,170987.0


In [35]:
# Aggregate per-user results by slate type and contest-size category.
#
# Each row of the users parquet data is joined to its contest on contest_id,
# then the roster / cashing / percentile / money columns are rolled up per
# (user_id, slate_type, size_category).
sport = 'NFL'
dds_users_path = f"s3://{bucket_name}/dds/{sport}/users/*/*/data.parquet"
dds_contests_path = f"s3://{bucket_name}/dds/{sport}/contests/*/*/data.parquet"

# Inclusive upper bounds of each contest-size category. They equal the
# max_size of NTILE buckets 1-4 shown in the `contests_grouped` output, so
# they must be revisited whenever the underlying contest data changes.
SIZE_TINY_MAX = 318
SIZE_SMALL_MAX = 4444
SIZE_MEDIUM_MAX = 16646
SIZE_LARGE_MAX = 79270

# NOTE(review): the join assumes contest_id is unique across the contests
# files; a duplicated contest_id would multiply user rows — confirm.
# NOTE(review): `roi` is summed; in the displayed rows it equals
# total_winning - total_entry_cost (a net amount), so a sum looks intended —
# confirm against the upstream definition.
_user_rollup_sql = f"""
    with users as (
      SELECT
          contest_id,
          user_id,
          total_players,
          total_rosters,
          unique_rosters,
          max_exposure,
          lineups_cashing,
          lineups_in_percentile_1,
          lineups_in_percentile_2,
          lineups_in_percentile_5,
          lineups_in_percentile_10,
          lineups_in_percentile_20,
          lineups_in_percentile_50,
          total_entry_cost,
          total_winning,
          roi
      FROM read_parquet('{dds_users_path}')
    ),
    contests as (
      SELECT
          list_element(string_split(filename, '/'), -3) as slate_type,
          contest_id,
          CASE
              WHEN contest_size <= {SIZE_TINY_MAX} THEN '1_Tiny'
              WHEN contest_size <= {SIZE_SMALL_MAX} THEN '2_Small'
              WHEN contest_size <= {SIZE_MEDIUM_MAX} THEN '3_Medium'
              WHEN contest_size <= {SIZE_LARGE_MAX} THEN '4_Large'
              ELSE '5_Massive'
          END as size_category
      FROM read_parquet('{dds_contests_path}')
    )
    SELECT
        user_id,
        slate_type,
        size_category,
        ROUND(SUM(unique_rosters) / SUM(total_rosters), 2) as unique_roster_percent,
        avg(max_exposure) as avg_max_exposure,
        sum(lineups_cashing) as total_lineups_cashing,
        sum(lineups_in_percentile_1) as total_lineups_in_percentile_1,
        sum(lineups_in_percentile_2) as total_lineups_in_percentile_2,
        sum(lineups_in_percentile_5) as total_lineups_in_percentile_5,
        sum(lineups_in_percentile_10) as total_lineups_in_percentile_10,
        sum(lineups_in_percentile_20) as total_lineups_in_percentile_20,
        sum(lineups_in_percentile_50) as total_lineups_in_percentile_50,
        sum(total_entry_cost) as total_entry_cost,
        sum(total_winning) as total_winning,
        sum(roi) as roi
    from users u
        join contests c on c.contest_id = u.contest_id
    group by user_id, slate_type, size_category
"""
df = con.execute(_user_rollup_sql).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [36]:
# Preview the first five aggregated user rows.
df.head(n=5)

Unnamed: 0,user_id,slate_type,size_category,unique_roster_percent,avg_max_exposure,total_lineups_cashing,total_lineups_in_percentile_1,total_lineups_in_percentile_2,total_lineups_in_percentile_5,total_lineups_in_percentile_10,total_lineups_in_percentile_20,total_lineups_in_percentile_50,total_entry_cost,total_winning,roi
0,hotandnasty,dk_classic,5_Massive,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,-20.0
1,Dmilicus,dk_classic,4_Large,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,-15.0
2,dustelli,dk_classic,5_Massive,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,-3.0
3,Skrat,dk_classic,4_Large,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,-5.0
4,Fantasy6,dk_classic,4_Large,1.0,100.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,27.0,20.0,-7.0


In [39]:
# Ten rows with the largest total_lineups_in_percentile_5.
df.sort_values('total_lineups_in_percentile_5', ascending=False).iloc[:10]

Unnamed: 0,user_id,slate_type,size_category,unique_roster_percent,avg_max_exposure,total_lineups_cashing,total_lineups_in_percentile_1,total_lineups_in_percentile_2,total_lineups_in_percentile_5,total_lineups_in_percentile_10,total_lineups_in_percentile_20,total_lineups_in_percentile_50,total_entry_cost,total_winning,roi
69116,dfs_ads,dk_classic,4_Large,0.93,62.932,433.0,44.0,68.0,158.0,255.0,393.0,890.0,29351.0,49423.0,20072.0
331601,bkreider,dk_classic,4_Large,0.83,83.6,501.0,41.0,76.0,157.0,261.0,463.0,1005.0,31301.0,48421.0,17120.0
598522,egar2246,dk_classic,5_Massive,0.94,97.190714,407.0,46.0,70.0,146.0,260.0,378.0,641.0,620.0,1027.0,407.0
320769,Zeechamp,dk_single_game,5_Massive,0.13,76.235,371.0,30.0,48.0,133.0,194.0,347.0,815.0,19945.0,51411.0,31466.0
3642,Triple3xJ,dk_classic,5_Massive,0.87,91.157778,357.0,32.0,58.0,123.0,197.0,323.0,653.0,9310.0,9066.0,-244.0
33464,Triple3xJ,dk_classic,4_Large,0.96,96.89,206.0,47.0,78.0,123.0,154.0,197.0,283.0,2229.0,6095.0,3866.0
455084,werewolvesoflondon,dk_single_game,5_Massive,0.03,81.091111,360.0,27.0,54.0,115.0,179.0,335.0,726.0,19560.0,16267.0,-3293.0
614796,needlunchmoney,dk_classic,4_Large,0.84,61.2925,414.0,34.0,60.0,114.0,200.0,372.0,762.0,26910.0,27348.0,438.0
137467,jbundy80,dk_classic,5_Massive,1.0,100.0,165.0,37.0,64.0,113.0,140.0,163.0,189.0,3080.0,9217.0,6137.0
748843,tadesse1234,dk_classic,5_Massive,0.94,91.232143,245.0,43.0,68.0,112.0,168.0,231.0,405.0,1450.0,2384.0,934.0


In [45]:
# All aggregated rows belonging to the user 'gutp'.
df.loc[df['user_id'].eq('gutp')]

Unnamed: 0,user_id,slate_type,size_category,unique_roster_percent,avg_max_exposure,total_lineups_cashing,total_lineups_in_percentile_1,total_lineups_in_percentile_2,total_lineups_in_percentile_5,total_lineups_in_percentile_10,total_lineups_in_percentile_20,total_lineups_in_percentile_50,total_entry_cost,total_winning,roi
2292,gutp,dk_classic,5_Massive,0.92,59.758182,196.0,9.0,16.0,44.0,79.0,169.0,364.0,640.0,705.5,65.5
26746,gutp,dk_classic,4_Large,0.86,87.445,34.0,0.0,0.0,4.0,12.0,27.0,77.0,851.0,278.0,-573.0
386591,gutp,dk_classic,3_Medium,1.0,95.74,19.0,1.0,1.0,4.0,12.0,17.0,61.0,2115.0,650.0,-1465.0
482237,gutp,dk_single_game,5_Massive,0.07,64.2,188.0,5.0,13.0,39.0,84.0,170.0,465.0,5695.0,2569.0,-3126.0
501543,gutp,dk_single_game,4_Large,0.09,82.147778,91.0,3.0,10.0,35.0,55.0,101.0,217.0,1810.0,708.0,-1102.0
