In [21]:
import duckdb
import os

# Get Wasabi credentials from environment
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Create DuckDB connection
con = duckdb.connect()

# Configure S3 settings
con.execute(f"""
    SET s3_endpoint='{wasabi_endpoint}';
    SET s3_access_key_id='{wasabi_access_key}';
    SET s3_secret_access_key='{wasabi_secret_key}';
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

<_duckdb.DuckDBPyConnection at 0x7f811cbc70f0>

In [22]:
sport = 'NFL'
dds_contests_path = f"s3://{bucket_name}/dds/{sport}/contests/*/*/data.parquet"

In [23]:
lineups_path = f"s3://{bucket_name}/dds/{sport}/lineups/*/*/data.parquet"

In [24]:
draft_groups_path = f"s3://{bucket_name}/dds/{sport}/draft_groups/*/*/data.parquet"

In [29]:
merged_df = con.execute(
    f"""
    with contests as (
    select contest_id,
        contest_group_id as draft_group_id
     from read_parquet('{dds_contests_path}') where is_largest_by_size = TRUE
    ),
    draft_groups as (
    select draft_group_id,
        draft_group_reference_id as slate_id
     from read_parquet('{draft_groups_path}')
    ),
    lineups as (
        select
        contest_id,
        min(points) as points,
        max(lineup_rank) as max_lineup_rank
        from read_parquet('{lineups_path}')
            where is_cashing = TRUE
            group by contest_id
    )
    select distinct
        draft_groups.slate_id,
        contests.contest_id,
        points,
        max_lineup_rank
        from contests
            left join lineups on contests.contest_id = lineups.contest_id
                join draft_groups on contests.draft_group_id = draft_groups.draft_group_id
"""
).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [30]:
merged_df.drop_duplicates()

Unnamed: 0,slate_id,contest_id,points,max_lineup_rank
0,98583,156962691,138.96,400
1,117082,170261728,126.60,289
2,119026,171974427,102.52,12377
3,115177,168691867,78.49,24863
4,113855,167385247,94.78,21815
...,...,...,...,...
1184,99722,158049896,104.40,220
1185,95356,154604968,148.56,97200
1186,96907,155633497,86.29,21734
1187,89525,147325139,144.36,5620


In [12]:
contests_df = con.execute(f"""
    select * from read_parquet('{dds_contests_path}') limit 10
""")

In [14]:
contests_df.df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,contest_id,contest_name,contest_size,entry_cost,total_prizes,multi_entry_max,is_largest_by_size,is_primary,tour,source_id,sport_event_id,contest_group_id,cash_line,date_id
0,149435843,NFL $100K Screen Pass [$25K to 1st] (Thu-Mon),7843,15,100000,150,True,False,,4,0,96716,1710,20230907
1,147325137,NFL $3M MEGA Millionaire [$1M to 1st + ToC Sem...,768,4444,3072667,23,True,True,,4,0,95302,145,20230910
2,150098355,NFL $201K Luxury Box [$75K to 1st],67,3180,201000,2,False,False,,4,0,95302,13,20230910
3,147325161,"NFL $350K Game Changer [$100K to 1st, Single E...",247,1500,350000,1,False,False,,4,0,95302,55,20230910
4,147325138,NFL $750K Wildcat [$150K to 1st],2502,333,750000,75,False,False,,4,0,95302,541,20230910
5,147325090,"NFL $600K Power Sweep [3 Entry Max, $100K to 1st]",4444,150,600000,3,False,False,,4,0,95302,1005,20230910
6,147325139,NFL $2.5M Fantasy Football Millionaire [$1M to...,28029,100,2522667,150,True,True,,4,0,95302,5620,20230910
7,147325101,"NFL $300K End Zone [$50K to 1st, Single Entry]",4545,75,300000,1,False,False,,4,0,95302,1090,20230910
8,147325103,"NFL $200K Red Zone [$25K to 1st, Single Entry]",4545,50,200000,1,False,False,,4,0,95302,999,20230910
9,147325076,NFL $500K Bootleg [$100K to 1st],17669,33,500000,150,False,False,,4,0,95302,4100,20230910


In [15]:
lineups_df = con.execute(f"""select * from read_parquet('{lineups_path}') limit 10""").df()
lineups_df

InvalidInputException: Invalid Input Error: Failed to read file "s3://dfscrunch-data-lake/dds/NFL/lineups/dk_classic/2023-10-05/data.parquet": schema mismatch in glob: column "pos_dst1" was read from the original file "s3://dfscrunch-data-lake/dds/NFL/lineups/dk_classic/2023-09-07/data.parquet", but could not be found in file "s3://dfscrunch-data-lake/dds/NFL/lineups/dk_classic/2023-10-05/data.parquet".
Candidate names: contest_id, lineup_hash, lineup_ct, lineup_user_ct, points, total_salary, total_own, min_own, max_own, avg_own, lineup_rank, is_cashing, favorite_ct, underdog_ct, home_ct, visitor_ct, payout, lineup_percentile, correlated_players, pos_qb1, pos_rb1, pos_rb2, pos_te1, pos_wr1, pos_wr2, pos_wr3, team_stacks, game_stacks, lineup_trends, entry_name_list
If you are trying to read files with different schemas, try setting union_by_name=True