In [0]:
# Load the sleeper.bronze_players table into a DataFrame for inspection.
df = spark.read.table('sleeper.bronze_players')

In [0]:
# Render whatever DataFrame is currently bound to `df`.
display(df)

In [0]:
%sql

-- Preview every column and row of the raw rosters table.
SELECT * FROM sleeper.bronze_rosters

In [0]:
from pyspark.sql.functions import current_timestamp, expr, explode

In [0]:
# One row per (roster, player): unpack the `players` array, flag starters and
# look up each player's nickname in the roster metadata.
df = (
    spark.read.table("sleeper.bronze_rosters")
    # explode() emits one output row per element of the `players` array.
    .withColumn("player_id", explode("players"))
    .withColumn("is_starter", expr("array_contains(starters, player_id)"))
    # The nickname is read from `metadata` under the key 'p_nick_' + player_id.
    .withColumn("player_nickname", expr("metadata['p_nick_' || player_id]"))
    .select(
        "owner_id",
        "roster_id",
        "player_id",
        "is_starter",
        "player_nickname",
        "_league_id",
        "_matchup_week",
        "_year",
        "_ingested_ts",
    )
    # Stamp every row with the time at which this query is evaluated.
    .withColumn("_snapshot_ts", current_timestamp())
)
display(df)

In [0]:
%sql

-- Preview the raw rosters table (all columns, all rows).
SELECT * FROM sleeper.bronze_rosters

In [0]:
# Build a per-roster standings view from the bronze rosters table:
# `streak` and `record` are read from `metadata`, every other derived column
# from `settings`.
df = (
    spark.read.table("sleeper.bronze_rosters")
    .withColumn("streak", expr("metadata['streak']"))
    .withColumn("record", expr("metadata['record']"))
    .withColumn("wins", expr("settings['wins']"))
    .withColumn("losses", expr("settings['losses']"))
    .withColumn("ties", expr("settings['ties']"))
    # Points are stored as a whole part plus a hundredths part. Any NULL operand
    # makes a SQL sum NULL, so a missing/NULL hundredths part is treated as 0 to
    # avoid throwing away a valid whole part. A NULL whole part still yields NULL.
    .withColumn(
        "fpts",
        expr("settings['fpts'] + coalesce(settings['fpts_decimal'], 0) / 100"),
    )
    .withColumn(
        "fpts_against",
        expr("settings['fpts_against'] + coalesce(settings['fpts_against_decimal'], 0) / 100"),
    )
    .withColumn("total_moves", expr("settings['total_moves']"))
    .withColumn("waiver_budget_used", expr("settings['waiver_budget_used']"))
    .withColumn("waiver_position", expr("settings['waiver_position']"))
)

# Keep only the standings columns plus the lineage columns (leading underscore).
df = df.select(
    "owner_id",
    "roster_id",
    "streak",
    "record",
    "wins",
    "losses",
    "ties",
    "fpts",
    "fpts_against",
    "total_moves",
    "waiver_budget_used",
    "waiver_position",
    "_league_id",
    "_matchup_week",
    "_year",
    "_ingested_ts",
)

display(df)

In [0]:
%sql

-- Preview every column and row of the raw matchups table.
SELECT * FROM sleeper.bronze_matchups

In [0]:
# Matchup-level rows: keep the matchup/roster ids, the points and the lineage
# columns, then stamp each row with the evaluation time.
df = (
    spark.read.table('sleeper.bronze_matchups')
    .select(
        "matchup_id",
        "roster_id",
        "points",
        "_league_id",
        "_matchup_week",
        "_year",
        "_ingested_ts",
    )
    .withColumn("_snapshot_ts", current_timestamp())
)

In [0]:
%sql

-- Preview the raw matchups table (all columns, all rows).
SELECT * FROM sleeper.bronze_matchups

In [0]:
from pyspark.sql.functions import array_contains, col, explode

# One row per (matchup roster, player): unpack the `players` array, flag
# starters and pull each player's points out of `players_points`.
df = (
    spark.read.table('sleeper.bronze_matchups')
    .withColumn("player_id", explode(col("players")))
    .withColumn(
        "is_starter",
        array_contains(col("starters"), col("player_id")),
    )
    # `players_points` is indexed by the exploded player_id to get that
    # player's individual score.
    .withColumn(
        "player_points",
        col("players_points")[col("player_id")],
    )
    .select(
        "roster_id",
        "matchup_id",
        "player_id",
        "player_points",
        "is_starter",
        "_league_id",
        "_matchup_week",
        "_year",
        "_ingested_ts",
    )
)

display(df)

In [0]:
%sql

-- Preview every column and row of the raw users table.
SELECT * FROM sleeper.bronze_users

In [0]:
# Owner view of the bronze users table: rename the user columns to owner/
# commissioner terminology and surface the team name stored in `metadata`.
df = (
    spark.read.table('sleeper.bronze_users')
    .withColumnRenamed("display_name", "owner_name")
    .withColumnRenamed("user_id", "owner_id")
    .withColumnRenamed("is_owner", "is_commissioner")
    .withColumn("team_name", col("metadata.team_name"))
    .select(
        "owner_id",
        "owner_name",
        "is_bot",
        "is_commissioner",
        "team_name",
        "_league_id",
        "_matchup_week",
        "_year",
        "_ingested_ts",
    )
)

display(df)

In [0]:
%sql

-- Preview every column and row of the silver matchup-players table.
SELECT * FROM sleeper.silver_matchups_players_dim

In [0]:
from pyspark.sql.functions import col, concat_ws, coalesce, when

# Enrich each matchup-player row with descriptive player attributes.
df_matchups_players_dim = spark.read.table('sleeper.silver_matchups_players_dim')
df_players_dim = spark.read.table('sleeper.silver_players_dim')

# Project the player dimension down to the join keys plus the attributes that
# are carried into the result, renaming where the output uses a different name.
df_players_dim = df_players_dim.select(
    "player_id",
    "_league_id",
    "_matchup_week",
    # Fall back to last_name when full_name is NULL.
    coalesce(col("full_name"), col("last_name")).alias("player_name"),
    col("position").alias("player_position"),
    col("team").alias("nfl_team"),
    "years_exp",
    "injury_status",
    # concat_ws skips NULL inputs, so either part may be absent.
    concat_ws(" ", col("injury_body_part"), col("injury_notes")).alias("injury_notes"),
    "college",
    # when/otherwise maps a NULL years_exp to False rather than NULL.
    # NOTE(review): rookies are flagged as years_exp == 1 here - confirm this
    # matches how the source encodes first-year players.
    when(col("years_exp") == 1, True).otherwise(False).alias("is_rookie"),
)

# Inner join on player, league and week; the key columns exist on both sides,
# so the ones kept in the output are taken explicitly from the matchup side.
df_joined = df_matchups_players_dim.join(
    df_players_dim,
    on=(
        (df_matchups_players_dim["player_id"] == df_players_dim["player_id"])
        & (df_matchups_players_dim["_league_id"] == df_players_dim["_league_id"])
        & (df_matchups_players_dim["_matchup_week"] == df_players_dim["_matchup_week"])
    ),
    how="inner",
).select(
    "roster_id",
    "matchup_id",
    df_matchups_players_dim["player_id"],
    "player_name",
    "player_position",
    "nfl_team",
    "player_points",
    "is_starter",
    "years_exp",
    "is_rookie",
    "injury_status",
    "injury_notes",
    "college",
    df_matchups_players_dim["_league_id"],
    df_matchups_players_dim["_matchup_week"],
    df_matchups_players_dim["_year"],
)

display(df_joined)

In [0]:
%sql

-- Preview every column and row of the silver matchups fact table.
SELECT * FROM sleeper.silver_matchups_fact

In [0]:
# Combine each matchup row with the roster's standings and its owner's details.
df_matchups_fact = spark.read.table('sleeper.silver_matchups_fact')
df_rosters_dim = spark.read.table('sleeper.silver_rosters_dim')
df_users_dim = spark.read.table('sleeper.silver_users_dim')

df_result = (
    df_matchups_fact
    # Matchup -> roster: same league, same roster, same week.
    .join(
        df_rosters_dim,
        on=(
            (df_matchups_fact["_league_id"] == df_rosters_dim["_league_id"])
            & (df_matchups_fact["roster_id"] == df_rosters_dim["roster_id"])
            & (df_matchups_fact["_matchup_week"] == df_rosters_dim["_matchup_week"])
        ),
        how="inner",
    )
    # Roster -> user: the roster's owner_id within the same league and week.
    .join(
        df_users_dim,
        on=(
            (df_matchups_fact["_league_id"] == df_users_dim["_league_id"])
            & (df_rosters_dim["owner_id"] == df_users_dim["owner_id"])
            & (df_matchups_fact["_matchup_week"] == df_users_dim["_matchup_week"])
        ),
        how="inner",
    )
    # Columns shared by several inputs are picked from a specific side to avoid
    # ambiguous references.
    .select(
        "matchup_id",
        df_matchups_fact["roster_id"],
        df_users_dim["owner_id"],
        df_users_dim["owner_name"],
        df_users_dim["is_commissioner"],
        df_users_dim["team_name"],
        df_matchups_fact["points"],
        df_rosters_dim["streak"],
        df_rosters_dim["record"],
        df_rosters_dim["wins"],
        df_rosters_dim["losses"],
        df_rosters_dim["ties"],
        df_rosters_dim["fpts"],
        df_rosters_dim["fpts_against"],
        df_rosters_dim["waiver_budget_used"],
        df_rosters_dim["waiver_position"],
        df_matchups_fact["_league_id"],
        df_matchups_fact["_matchup_week"],
        df_matchups_fact["_year"],
    )
)

display(df_result)

In [0]:
%sql

-- Preview every column and row of the roster-results model table.
SELECT * FROM sleeper.silver_model_roster_results

In [0]:
%sql

-- Preview every column and row of the player-performances model table.
SELECT * FROM sleeper.silver_model_player_performances

In [0]:
# Pair every roster-level result with that roster's player-level performances.
df_roster_results = spark.read.table('sleeper.silver_model_roster_results')
df_player_performances = spark.read.table('sleeper.silver_model_player_performances')

# The league is part of the key: every other join in this notebook also matches
# on _league_id, and without it rows from different leagues that share the same
# roster_id / matchup_id / week values would be paired with each other.
# NOTE(review): assumes both model tables expose `_league_id` - confirm.
df_joined_results = df_roster_results.join(
    df_player_performances,
    (df_roster_results._league_id == df_player_performances._league_id) &
    (df_roster_results.roster_id == df_player_performances.roster_id) &
    (df_roster_results.matchup_id == df_player_performances.matchup_id) &
    (df_roster_results._matchup_week == df_player_performances._matchup_week)
)

# An expression-based join keeps the columns of both inputs, so same-named
# columns appear twice in the printed schema.
df_joined_results.printSchema()