In [92]:
import duckdb
import os

# Read the Wasabi (S3-compatible) connection settings from the environment.
# Only the endpoint has a default; the credentials and bucket must be provided.
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Fail fast: an unset variable would otherwise be interpolated into the SQL
# below as the literal string 'None' and only surface later as an S3 error.
_required_settings = {
    'WASABI_ACCESS_KEY': wasabi_access_key,
    'WASABI_SECRET_KEY': wasabi_secret_key,
    'WASABI_BUCKET_NAME': bucket_name,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )


def _sql_literal(value):
    """Return `value` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _mask(value, visible=4):
    """Return `value` with everything except the last `visible` characters starred out."""
    text = str(value)
    if len(text) <= visible:
        return '*' * len(text)
    return '*' * (len(text) - visible) + text[-visible:]


# Create DuckDB connection (in-memory database)
con = duckdb.connect()

# Configure S3 settings on the connection. Values are quoted through
# _sql_literal so a quote character in a setting cannot break the statement.
con.execute(f"""
    SET s3_endpoint={_sql_literal(wasabi_endpoint)};
    SET s3_access_key_id={_sql_literal(wasabi_access_key)};
    SET s3_secret_access_key={_sql_literal(wasabi_secret_key)};
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

# Never echo credentials: cell output is saved with the notebook and ends up
# wherever the notebook is shared or committed.
print("✓ DuckDB configured with S3 credentials")
print(f"Endpoint: {wasabi_endpoint}")
print(f"Access Key ID: {_mask(wasabi_access_key)}")
print("Secret Access Key: [REDACTED]")
print('Bucket Name: ', bucket_name, '\n\n')


✓ DuckDB configured with S3 credentials
Endpoint: s3.us-east-2.wasabisys.com
Access Key ID: [REDACTED]
Secret Access Key: [REDACTED]
Bucket Name:  dfscrunch-data-lake 




In [117]:
# Glob over the per-game-date player files: .../players/<game_date>/data.parquet
players_path = f's3://{bucket_name}/staging/NHL/moneypuck/players/*/data.parquet'

# Build a long-format lineup table with one row per (game_date, player, team, line):
#   - forward lines    -> 'f1'..'f4'  (position = 'line',    iceTimeRank <= 4)
#   - defense pairings -> 'd1'..'d3'  (position = 'pairing', iceTimeRank <= 3)
#   - goalies          -> 'G1'        (position = 'G',       iceTimeRank = -1)
# Line/pairing rows store their members as one '-'-joined string in playerName,
# which is split into one row per name. For goalies the `last_name` column holds
# playerName unchanged (a full name in the sample output).
# NOTE(review): splitting on '-' would also split a hyphenated surname — confirm
# how the source data encodes those.
# The result is combined with UNION (which de-duplicates), so row order is not
# defined; sort the DataFrame explicitly if an order is needed.
df = con.execute(f"""
with players as (
  -- 5-on-5 rows for forward lines, defense pairings and goalies
  select
    playerName,
    team,
    position,
    iceTimeRank,
    -- game date = name of the directory that contains data.parquet
    regexp_extract(filename, '/players/([^/]+)/data[.]parquet$', 1) as game_date
  from read_parquet('{players_path}', filename=true)
  where situation = '5on5'
    and position in ('line', 'pairing', 'G')
),
forwards as (
  select
    game_date,
    unnest(string_split(playerName, '-')) as last_name,
    team,
    'f' || iceTimeRank::string as line
  from players
  where position = 'line' and iceTimeRank <= 4
),
defense as (
  select
    game_date,
    unnest(string_split(playerName, '-')) as last_name,
    team,
    'd' || iceTimeRank::string as line
  from players
  where position = 'pairing' and iceTimeRank <= 3
),
goalies as (
  select
    game_date,
    playerName as last_name,
    team,
    'G1' as line
  from players
  where position = 'G' and iceTimeRank = -1
)
select game_date, trim(last_name) as last_name, team, line from forwards
union
select game_date, trim(last_name) as last_name, team, line from defense
union
select game_date, trim(last_name) as last_name, team, line from goalies
""").df()

In [118]:
# Preview the first five lineup rows.
df.head(n=5)

Unnamed: 0,game_date,last_name,team,line
0,2025-10-18,Devin Cooley,CGY,G1
1,2025-10-24,Eric Comrie,WPG,G1
2,2025-10-13,Juuse Saros,NSH,G1
3,2025-10-20,Connor Hellebuyck,WPG,G1
4,2025-10-28,Ilya Sorokin,NYI,G1


In [82]:
# Column labels of the current `df`, as a plain Python list.
df.columns.tolist()


['playerName',
 'team',
 'I_F_iceTime',
 'iceTimeRank',
 'position',
 'situation',
 'game_date',
 'line']

In [83]:
# First occurrence of each distinct `position` value.
# NOTE(review): the lineup query's final SELECT returns only game_date,
# last_name, team and line, so `position` exists only when `df` comes from an
# earlier version of that query — re-check after Restart & Run All.
df['position'].drop_duplicates()

0           G
21    pairing
75       line
Name: position, dtype: object

In [84]:
# Show the name / team / ice-time columns side by side.
# NOTE(review): none of these except `team` is in the lineup query's final
# SELECT (game_date, last_name, team, line); this cell appears to rely on an
# earlier version of `df` — re-check after Restart & Run All.
ice_time_columns = ['playerName', 'team', 'I_F_iceTime', 'iceTimeRank']
df[ice_time_columns]

Unnamed: 0,playerName,team,I_F_iceTime,iceTimeRank
0,Philipp Grubauer,SEA,917.0,-1
1,Trent Miner,COL,1730.0,-1
2,Scott Wedgewood,COL,2537.0,-1
3,Thatcher Demko,VAN,2602.0,-1
4,Cam Talbot,DET,2614.0,-1
...,...,...,...,...
2660,Johnston-Poehling-Nesterenko,ANA,282.0,4
2661,Gatcomb-Cizikas-Tsyplakov,NYI,464.0,4
2662,Holloway-Sundqvist-Bjugstad,STL,304.0,4
2663,Carcone-Mcbain-Crouse,UTA,249.0,4
