In [0]:
# Load the bronze players table into `df` for inspection in the next cell.
df = spark.table("sleeper.bronze_players")

In [0]:
# Render whatever DataFrame is currently bound to `df` (the bronze players
# table when the cells are run top to bottom).
display(df)

In [0]:
%sql

-- Preview every column of sleeper.bronze_rosters.
SELECT * FROM sleeper.bronze_rosters

In [0]:
from pyspark.sql.functions import current_timestamp, expr, explode

In [0]:
# Turn each roster row into one row per rostered player.
roster_players = spark.read.table("sleeper.bronze_rosters").withColumn(
    "player_id", explode("players")
)

# Derive the per-player flags and keep only the columns of interest:
#   is_starter      - whether the exploded player_id appears in `starters`
#   player_nickname - metadata entry stored under the key 'p_nick_<player_id>'
#   _snapshot_ts    - timestamp of this query's evaluation
df = roster_players.select(
    "owner_id",
    "roster_id",
    "player_id",
    expr("array_contains(starters, player_id)").alias("is_starter"),
    expr("metadata['p_nick_' || player_id]").alias("player_nickname"),
    "_league_id",
    "_matchup_week",
    "_year",
    "_ingested_ts",
    current_timestamp().alias("_snapshot_ts"),
)
display(df)

In [0]:
%sql

-- Preview every column of sleeper.bronze_rosters (source of the roster standings cell).
SELECT * FROM sleeper.bronze_rosters

In [0]:
# Flatten the roster `metadata` / `settings` containers into one standings row
# per roster.
#
# Points are stored as an integer part plus a two-digit decimal part
# (e.g. fpts=123, fpts_decimal=45 -> 123.45). The decimal part is coalesced to
# 0 so that a missing decimal entry does not turn the whole total into NULL;
# a missing integer part still yields NULL (the total is genuinely unknown).
df = (
    spark.read.table("sleeper.bronze_rosters")
    .withColumn("streak", expr("metadata['streak']"))
    .withColumn("record", expr("metadata['record']"))
    .withColumn("wins", expr("settings['wins']"))
    .withColumn("losses", expr("settings['losses']"))
    .withColumn("ties", expr("settings['ties']"))
    .withColumn(
        "fpts",
        expr("settings['fpts'] + coalesce(settings['fpts_decimal'], 0) / 100"),
    )
    .withColumn(
        "fpts_against",
        expr("settings['fpts_against'] + coalesce(settings['fpts_against_decimal'], 0) / 100"),
    )
    .withColumn("total_moves", expr("settings['total_moves']"))
    .withColumn("waiver_budget_used", expr("settings['waiver_budget_used']"))
    .withColumn("waiver_position", expr("settings['waiver_position']"))
)

df = df.select(
    "owner_id",
    "roster_id",
    "streak",
    "record",
    "wins",
    "losses",
    "ties",
    "fpts",
    "fpts_against",
    "total_moves",
    "waiver_budget_used",
    "waiver_position",
    "_league_id",
    "_matchup_week",
    "_year",
    "_ingested_ts",
)

display(df)

In [0]:
%sql

-- Preview every column of sleeper.bronze_matchups.
SELECT * FROM sleeper.bronze_matchups

In [0]:
# Roster-level matchup facts: one selected row per bronze matchup row, stamped
# with the time this query was evaluated.
matchup_fact_columns = [
    "matchup_id",
    "roster_id",
    "points",
    "_league_id",
    "_matchup_week",
    "_year",
    "_ingested_ts",
]

df = (
    spark.read.table('sleeper.bronze_matchups')
    .select(*matchup_fact_columns)
    .withColumn("_snapshot_ts", current_timestamp())
)

In [0]:
%sql

-- Preview every column of sleeper.bronze_matchups (source of the per-player matchup cell).
SELECT * FROM sleeper.bronze_matchups

In [0]:
from pyspark.sql.functions import array_contains, col, explode

# One row per player listed on a matchup roster.
matchup_players = spark.read.table('sleeper.bronze_matchups').withColumn(
    "player_id", explode(col("players"))
)

# player_points: the `players_points` entry keyed by the exploded player_id.
# is_starter:    whether that player_id appears in the `starters` array.
df = matchup_players.select(
    "roster_id",
    "matchup_id",
    "player_id",
    col("players_points")[col("player_id")].alias("player_points"),
    array_contains(col("starters"), col("player_id")).alias("is_starter"),
    "_league_id",
    "_matchup_week",
    "_year",
    "_ingested_ts",
)

display(df)

In [0]:
%sql

-- Preview every column of sleeper.bronze_users.
SELECT * FROM sleeper.bronze_users

In [0]:
# Owner dimension: rename the user columns to owner-centric names and lift
# team_name out of the `metadata` container.
# NOTE: `col` is not imported in this cell; it relies on an earlier cell's import.
df = spark.read.table('sleeper.bronze_users').select(
    col("user_id").alias("owner_id"),
    col("display_name").alias("owner_name"),
    "is_bot",
    col("is_owner").alias("is_commissioner"),
    col("metadata.team_name").alias("team_name"),
    "_league_id",
    "_matchup_week",
    "_year",
    "_ingested_ts",
)

display(df)

In [0]:
%sql

-- Preview every column of sleeper.silver_matchups_players_dim.
SELECT * FROM sleeper.silver_matchups_players_dim

In [0]:
from pyspark.sql.functions import col, concat_ws, coalesce, when
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Per-player matchup rows (points, starter flag) for each league and week.
df_matchups_players_dim = spark.read.table('sleeper.silver_matchups_players_dim')

# Descriptive player attributes, reduced and renamed for the performance model.
# player_name falls back to last_name when full_name is NULL; is_rookie is TRUE
# only when years_exp equals 1 and FALSE otherwise (including NULL years_exp).
df_players_dim = spark.read.table('sleeper.silver_players_dim').select(
    "player_id",
    "_league_id",
    "_matchup_week",
    coalesce(col("full_name"), col("last_name")).alias("player_name"),
    col("position").alias("player_position"),
    col("team").alias("nfl_team"),
    "years_exp",
    "injury_status",
    concat_ws(" ", col("injury_body_part"), col("injury_notes")).alias("injury_notes"),
    "college",
    when(col("years_exp") == 1, True).otherwise(False).alias("is_rookie"),
)

# Inner join on player, league and week; a list of conditions is AND-ed by Spark.
join_conditions = [
    df_matchups_players_dim[key] == df_players_dim[key]
    for key in ("player_id", "_league_id", "_matchup_week")
]
df_joined = df_matchups_players_dim.join(df_players_dim, join_conditions)

# Rank each player's points against everyone at the same position in the same
# league and week (0.0 = lowest score, 1.0 = highest).
window_spec = Window.partitionBy(
    df_matchups_players_dim["_league_id"],
    df_matchups_players_dim["_matchup_week"],
    "player_position",
).orderBy("player_points")

df_joined = df_joined.withColumn(
    "position_points_percentile", F.percent_rank().over(window_spec)
).select(
    "roster_id",
    "matchup_id",
    df_matchups_players_dim.player_id,
    "player_name",
    "player_position",
    "nfl_team",
    "player_points",
    "position_points_percentile",
    "is_starter",
    "years_exp",
    "is_rookie",
    "injury_status",
    "injury_notes",
    "college",
    df_matchups_players_dim._league_id,
    df_matchups_players_dim._matchup_week,
    df_matchups_players_dim._year,
)

display(df_joined)

In [0]:
%sql

-- Preview every column of sleeper.silver_matchups_fact.
SELECT * FROM sleeper.silver_matchups_fact

In [0]:
# Inputs: roster-level matchup points, roster standings, and owner details.
df_matchups_fact = spark.read.table('sleeper.silver_matchups_fact')
df_rosters_dim = spark.read.table('sleeper.silver_rosters_dim')
df_users_dim = spark.read.table('sleeper.silver_users_dim')

# Attach the roster standings for the same league, roster and week.
roster_conditions = [
    df_matchups_fact._league_id == df_rosters_dim._league_id,
    df_matchups_fact.roster_id == df_rosters_dim.roster_id,
    df_matchups_fact._matchup_week == df_rosters_dim._matchup_week,
]
df_with_rosters = df_matchups_fact.join(df_rosters_dim, roster_conditions)

# Attach the owner of that roster for the same league and week.
owner_conditions = [
    df_matchups_fact._league_id == df_users_dim._league_id,
    df_rosters_dim.owner_id == df_users_dim.owner_id,
    df_matchups_fact._matchup_week == df_users_dim._matchup_week,
]
df_with_owners = df_with_rosters.join(df_users_dim, owner_conditions)

# Both joins are inner joins, so rosters without standings or without a
# matching owner row are dropped from the result.
df_result = df_with_owners.select(
    "matchup_id",
    df_matchups_fact.roster_id,
    df_users_dim.owner_id,
    df_users_dim.owner_name,
    df_users_dim.is_commissioner,
    df_users_dim.team_name,
    df_matchups_fact.points,
    df_rosters_dim.streak,
    df_rosters_dim.record,
    df_rosters_dim.wins,
    df_rosters_dim.losses,
    df_rosters_dim.ties,
    df_rosters_dim.fpts,
    df_rosters_dim.fpts_against,
    df_rosters_dim.waiver_budget_used,
    df_rosters_dim.waiver_position,
    df_matchups_fact._league_id,
    df_matchups_fact._matchup_week,
    df_matchups_fact._year,
)

display(df_result)

In [0]:
%sql

-- Preview every column of sleeper.silver_model_roster_results.
SELECT * FROM sleeper.silver_model_roster_results

In [0]:
%sql

-- Preview every column of sleeper.silver_model_player_performances.
SELECT * FROM sleeper.silver_model_player_performances

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col

# matchup_id and roster_id are only unique within a single league and week, so
# one roster's weekly result is identified by all four columns together. Every
# join, window and groupBy below is scoped by these keys; matching on
# matchup_id/roster_id alone mixes rows from different weeks and leagues and
# produces duplicated output rows.
ROSTER_WEEK_KEYS = ["_league_id", "_matchup_week", "matchup_id", "roster_id"]

df_roster_results = spark.read.table('sleeper.silver_model_roster_results')
df_player_performances = spark.read.table('sleeper.silver_model_player_performances')

# One row per (roster, player) for each league and week.
df_joined_results = df_roster_results.alias("roster").join(
    df_player_performances.alias("performance"),
    (col("roster._league_id") == col("performance._league_id")) &
    (col("roster._matchup_week") == col("performance._matchup_week")) &
    (col("roster.matchup_id") == col("performance.matchup_id")) &
    (col("roster.roster_id") == col("performance.roster_id"))
)

# Aggregate starter points and bench points
df_aggregated = df_joined_results.groupBy(
    "roster.matchup_id",
    "roster.roster_id",
    "roster.owner_id",
    "roster.owner_name",
    "roster.is_commissioner",
    "roster.team_name",
    "roster.matchup_points",
    "roster.team_streak",
    "roster.team_record",
    "roster.team_total_wins",
    "roster.team_total_losses",
    "roster.team_total_ties",
    "roster.team_total_fpts",
    "roster.team_total_fpts_against",
    "roster.team_waiver_budget_used",
    "roster.team_waiver_position",
    "roster._league_id",
    "roster._matchup_week",
    "roster._year"
).agg(
    F.sum(F.when(col("performance.is_starter"), col("performance.player_points")).otherwise(0)).alias("starter_points"),
    F.sum(F.when(~col("performance.is_starter"), col("performance.player_points")).otherwise(0)).alias("bench_points")
)

# Compute roster strength: starters count fully, the bench at 20%.
df_aggregated = df_aggregated.withColumn(
    "roster_strength",
    col("starter_points") + (0.2 * col("bench_points"))
)

# Find bench players who were the top scorer at their position on their own
# roster for that league and week.
window_spec = Window.partitionBy(
    "performance._league_id",
    "performance._matchup_week",
    "performance.matchup_id",
    "performance.roster_id",
    "performance.player_position"
).orderBy(F.desc("performance.player_points"))

df_ranked = df_joined_results.withColumn("rank", F.rank().over(window_spec))

df_bench_better_than_starters = df_ranked.filter(
    (col("rank") == 1) & (~col("performance.is_starter"))
).select(
    "performance._league_id",
    "performance._matchup_week",
    "performance.matchup_id",
    "performance.roster_id",
    "performance.player_id",
    "performance.player_name",
    "performance.player_position",
    "performance.player_points"
)

# Join back to find the corresponding starter(s) at the same position on the
# same roster in the same league and week.
df_starters = df_joined_results.filter(col("performance.is_starter")).select(
    "performance._league_id",
    "performance._matchup_week",
    "performance.matchup_id",
    "performance.roster_id",
    "performance.player_position",
    col("performance.player_name").alias("starter_player_name"),
    col("performance.player_points").alias("starter_player_points")
)

# NOTE(review): a benched top scorer is paired with EVERY starter at the
# position, so the summed opportunity cost counts one bench player against
# several starters. Confirm this is the intended definition.
df_bench_better_than_starters = df_bench_better_than_starters.alias("bench").join(
    df_starters.alias("starters"),
    (col("bench._league_id") == col("starters._league_id")) &
    (col("bench._matchup_week") == col("starters._matchup_week")) &
    (col("bench.matchup_id") == col("starters.matchup_id")) &
    (col("bench.roster_id") == col("starters.roster_id")) &
    (col("bench.player_position") == col("starters.player_position"))
).select(
    col("bench._league_id"),
    col("bench._matchup_week"),
    col("bench.matchup_id"),
    col("bench.roster_id"),
    F.struct(
        col("bench.player_name").alias("benched_player_name"),
        col("bench.player_points").alias("benched_player_points"),
        col("starters.starter_player_name"),
        col("starters.starter_player_points"),
        (col("bench.player_points") - col("starters.starter_player_points")).alias("point_opportunity_cost")
    ).alias("bench_better_than_starter")
)

# Aggregate the structs into a list and sum point_opportunity_cost
df_bench_better_than_starters_agg = df_bench_better_than_starters.groupBy(
    *ROSTER_WEEK_KEYS
).agg(
    F.collect_list("bench_better_than_starter").alias("bench_better_than_starters"),
    F.sum("bench_better_than_starter.point_opportunity_cost").alias("missed_starter_points")
)

# Join the aggregated bench better than starters to df_aggregated. Rosters
# with no such bench player keep NULL in both added columns (left join).
df_aggregated = df_aggregated.alias("aggregated").join(
    df_bench_better_than_starters_agg.alias("bench_agg"),
    ROSTER_WEEK_KEYS,
    "left"
)

# Create a column for highest scoring players (top 3 performing players of
# each roster in each league and week).
window_spec_highest_scoring = Window.partitionBy(
    "roster._league_id",
    "roster._matchup_week",
    "roster.matchup_id",
    "roster.roster_id"
).orderBy(F.desc("performance.player_points"))

df_highest_scoring = df_joined_results.withColumn("rank", F.row_number().over(window_spec_highest_scoring)).filter(col("rank") <= 3)

df_highest_scoring_agg = df_highest_scoring.groupBy(
    "roster._league_id",
    "roster._matchup_week",
    "roster.matchup_id",
    "roster.roster_id"
).agg(
    F.collect_list(
        F.struct(
            col("performance.player_name").alias("highest_scoring_player_name"),
            col("performance.player_points").alias("highest_scoring_player_points")
        )
    ).alias("highest_scoring_players")
)

df_aggregated = df_aggregated.join(
    df_highest_scoring_agg.alias("high_score_agg"),
    ROSTER_WEEK_KEYS,
    "left"
)

# Add opponent points. League and week are carried under distinct names so the
# join below can restrict the opponent to the same league and week without
# creating ambiguous column names.
df_opponent_points = df_aggregated.select(
    col("_league_id").alias("opponent_league_id"),
    col("_matchup_week").alias("opponent_matchup_week"),
    col("matchup_id"),
    col("roster_id").alias("opponent_roster_id"),
    col("starter_points").alias("opponent_starter_points")
)

# The opponent is the OTHER roster sharing this matchup_id in the same league
# and week.
df_aggregated = df_aggregated.alias("aggregated").join(
    df_opponent_points.alias("opponent"),
    (col("aggregated._league_id") == col("opponent.opponent_league_id")) &
    (col("aggregated._matchup_week") == col("opponent.opponent_matchup_week")) &
    (col("aggregated.matchup_id") == col("opponent.matchup_id")) &
    (col("aggregated.roster_id") != col("opponent.opponent_roster_id")),
    "left"
)

# Add couldve_won_with_missed_bench_points column:
#   NULL  - the roster already won on starter points
#   TRUE  - it lost, but would have won with the missed bench points added
#   FALSE - otherwise (including when there were no missed bench points)
df_aggregated = df_aggregated.withColumn(
    "couldve_won_with_missed_bench_points",
    F.when(
        col("starter_points") > col("opponent_starter_points"),
        None
    ).when(
        (col("starter_points") + col("missed_starter_points")) > col("opponent_starter_points"),
        True
    ).otherwise(False)
)

# Add manager_efficiency column: share of achievable points actually started
# (1.0 when nothing was left on the bench).
df_aggregated = df_aggregated.withColumn(
    "manager_efficiency",
    col("starter_points") / (col("starter_points") + F.coalesce(col("missed_starter_points"), F.lit(0)))
)

# Normalize roster_strength using percentile rank within each league and week
window_spec_percentile = Window.partitionBy("aggregated._league_id", "aggregated._matchup_week").orderBy("roster_strength")
df_aggregated = df_aggregated.withColumn("roster_strength_percentile", F.percent_rank().over(window_spec_percentile))

# Create week_power_points column: 80% roster strength, 20% manager efficiency
df_aggregated = df_aggregated.withColumn(
    "week_power_points",
    (col("roster_strength_percentile") * 0.8) + (col("manager_efficiency") * 0.2)
)

df_selected = df_aggregated.select(
    "aggregated._league_id",
    "aggregated.matchup_id",
    "aggregated.roster_id",
    "aggregated.owner_id",
    "aggregated.owner_name",
    "aggregated.is_commissioner",
    "aggregated.team_name",
    "aggregated.team_streak",
    "aggregated.team_record",
    "aggregated.team_total_wins",
    "aggregated.team_total_losses",
    "aggregated.team_total_ties",
    "aggregated.team_total_fpts",
    "aggregated.team_total_fpts_against",
    "aggregated.team_waiver_budget_used",
    "aggregated.matchup_points",
    "starter_points",
    "bench_points",
    "bench_better_than_starters",
    "missed_starter_points",
    "couldve_won_with_missed_bench_points",
    "highest_scoring_players",
    "roster_strength",
    "roster_strength_percentile",
    "manager_efficiency",
    "week_power_points",
    "aggregated._matchup_week",
    "aggregated._year"
)

display(df_selected)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col

# matchup_id and roster_id are only unique within a single league and week, so
# every join, window and groupBy below is scoped by all four keys. Matching on
# matchup_id/roster_id alone pairs rows from different weeks and leagues, which
# is what produced duplicated rows (and skewed percentiles) in this pipeline.
ROSTER_WEEK_KEYS = ["_league_id", "_matchup_week", "matchup_id", "roster_id"]
ROSTER_SIDE_KEYS = [f"roster.{key}" for key in ROSTER_WEEK_KEYS]
PERFORMANCE_SIDE_KEYS = [f"performance.{key}" for key in ROSTER_WEEK_KEYS]

df_roster_results = spark.read.table('sleeper.silver_model_roster_results')
df_player_performances = spark.read.table('sleeper.silver_model_player_performances')

# One row per (roster, player) for each league and week.
df_joined_results = df_roster_results.alias("roster").join(
    df_player_performances.alias("performance"),
    (col("roster._league_id") == col("performance._league_id")) &
    (col("roster._matchup_week") == col("performance._matchup_week")) &
    (col("roster.matchup_id") == col("performance.matchup_id")) &
    (col("roster.roster_id") == col("performance.roster_id"))
)

# Starter / bench totals per roster-week; roster_strength weights the bench at 20%.
df_aggregated = df_joined_results.groupBy(
    "roster.matchup_id", "roster.roster_id", "roster.owner_id", "roster.owner_name",
    "roster.is_commissioner", "roster.team_name", "roster.matchup_points", "roster.team_streak",
    "roster.team_record", "roster.team_total_wins", "roster.team_total_losses", "roster.team_total_ties",
    "roster.team_total_fpts", "roster.team_total_fpts_against", "roster.team_waiver_budget_used",
    "roster.team_waiver_position", "roster._league_id", "roster._matchup_week", "roster._year"
).agg(
    F.sum(F.when(col("performance.is_starter"), col("performance.player_points")).otherwise(0)).alias("starter_points"),
    F.sum(F.when(~col("performance.is_starter"), col("performance.player_points")).otherwise(0)).alias("bench_points")
).withColumn(
    "roster_strength", col("starter_points") + (0.2 * col("bench_points"))
)

# Rank players within their position on each roster-week; a benched player
# ranked first outscored (or tied) every starter at that position.
window_spec = Window.partitionBy(
    *PERFORMANCE_SIDE_KEYS, "performance.player_position"
).orderBy(F.desc("performance.player_points"))
df_ranked = df_joined_results.withColumn("rank", F.rank().over(window_spec))

df_bench_better_than_starters = df_ranked.filter(
    (col("rank") == 1) & (~col("performance.is_starter"))
).select(
    *PERFORMANCE_SIDE_KEYS, "performance.player_id",
    "performance.player_name", "performance.player_position", "performance.player_points"
)

df_starters = df_joined_results.filter(col("performance.is_starter")).select(
    *PERFORMANCE_SIDE_KEYS, "performance.player_position",
    col("performance.player_name").alias("starter_player_name"),
    col("performance.player_points").alias("starter_player_points")
)

# Pair each benched top scorer with the starters at the same position on the
# same roster in the same league and week.
df_bench_better_than_starters = df_bench_better_than_starters.alias("bench").join(
    df_starters.alias("starters"),
    (col("bench._league_id") == col("starters._league_id")) &
    (col("bench._matchup_week") == col("starters._matchup_week")) &
    (col("bench.matchup_id") == col("starters.matchup_id")) &
    (col("bench.roster_id") == col("starters.roster_id")) &
    (col("bench.player_position") == col("starters.player_position"))
).select(
    col("bench._league_id"), col("bench._matchup_week"),
    col("bench.matchup_id"), col("bench.roster_id"),
    F.struct(
        col("bench.player_name").alias("benched_player_name"),
        col("bench.player_points").alias("benched_player_points"),
        col("starters.starter_player_name"), col("starters.starter_player_points"),
        (col("bench.player_points") - col("starters.starter_player_points")).alias("point_opportunity_cost")
    ).alias("bench_better_than_starter")
)

df_bench_better_than_starters_agg = df_bench_better_than_starters.groupBy(
    *ROSTER_WEEK_KEYS
).agg(
    F.collect_list("bench_better_than_starter").alias("bench_better_than_starters"),
    F.sum("bench_better_than_starter.point_opportunity_cost").alias("missed_starter_points")
)

# Left join: rosters without a better bench player keep NULLs in the added columns.
df_aggregated = df_aggregated.alias("aggregated").join(
    df_bench_better_than_starters_agg.alias("bench_agg"),
    ROSTER_WEEK_KEYS, "left"
)

# Top three scorers of each roster in each league and week.
window_spec_highest_scoring = Window.partitionBy(*ROSTER_SIDE_KEYS).orderBy(F.desc("performance.player_points"))
df_highest_scoring = df_joined_results.withColumn("rank", F.row_number().over(window_spec_highest_scoring)).filter(col("rank") <= 3)

df_highest_scoring_agg = df_highest_scoring.groupBy(*ROSTER_SIDE_KEYS).agg(
    F.collect_list(
        F.struct(
            col("performance.player_name").alias("highest_scoring_player_name"),
            col("performance.player_points").alias("highest_scoring_player_points")
        )
    ).alias("highest_scoring_players")
)

df_aggregated = df_aggregated.join(
    df_highest_scoring_agg.alias("high_score_agg"),
    ROSTER_WEEK_KEYS, "left"
)

# Opponent lookup; league and week are renamed so the self-join below can match
# on them without introducing ambiguous column names.
df_opponent_points = df_aggregated.select(
    col("_league_id").alias("opponent_league_id"),
    col("_matchup_week").alias("opponent_matchup_week"),
    col("matchup_id"), col("roster_id").alias("opponent_roster_id"), col("starter_points").alias("opponent_starter_points")
)

# couldve_won_with_missed_bench_points: NULL when the roster already won on
# starter points, TRUE when the missed bench points would have flipped the
# result, FALSE otherwise. manager_efficiency: share of achievable points started.
df_aggregated = df_aggregated.alias("aggregated").join(
    df_opponent_points.alias("opponent"),
    (col("aggregated._league_id") == col("opponent.opponent_league_id")) &
    (col("aggregated._matchup_week") == col("opponent.opponent_matchup_week")) &
    (col("aggregated.matchup_id") == col("opponent.matchup_id")) &
    (col("aggregated.roster_id") != col("opponent.opponent_roster_id")),
    "left"
).withColumn(
    "couldve_won_with_missed_bench_points",
    F.when(col("starter_points") > col("opponent_starter_points"), None)
    .when((col("starter_points") + col("missed_starter_points")) > col("opponent_starter_points"), True)
    .otherwise(False)
).withColumn(
    "manager_efficiency",
    col("starter_points") / (col("starter_points") + F.coalesce(col("missed_starter_points"), F.lit(0)))
)

# Percentile of roster_strength within each league and week, blended 80/20
# with manager_efficiency into the weekly power score.
window_spec_percentile = Window.partitionBy("aggregated._league_id", "aggregated._matchup_week").orderBy("roster_strength")
df_aggregated = df_aggregated.withColumn("roster_strength_percentile", F.percent_rank().over(window_spec_percentile)).withColumn(
    "week_power_points",
    (col("roster_strength_percentile") * 0.8) + (col("manager_efficiency") * 0.2)
)

# dropDuplicates is kept only as a safety net (e.g. a matchup_id shared by more
# than two rosters); with the league/week-scoped joins above it should not
# remove any rows in the normal two-roster case.
df_selected = df_aggregated.select(
    "aggregated._league_id", "aggregated.matchup_id", "aggregated.roster_id", "aggregated.owner_id",
    "aggregated.owner_name", "aggregated.is_commissioner", "aggregated.team_name", "aggregated.team_streak",
    "aggregated.team_record", "aggregated.team_total_wins", "aggregated.team_total_losses", "aggregated.team_total_ties",
    "aggregated.team_total_fpts", "aggregated.team_total_fpts_against", "aggregated.team_waiver_budget_used",
    "aggregated.matchup_points", "starter_points", "bench_points", "bench_better_than_starters",
    "missed_starter_points", "couldve_won_with_missed_bench_points", "highest_scoring_players",
    "roster_strength", "roster_strength_percentile", "manager_efficiency", "week_power_points",
    "aggregated._matchup_week", "aggregated._year"
).dropDuplicates(["_league_id", "roster_id", "_matchup_week"])

display(df_selected)

# Debug duplicate rows in the gold output

In [0]:
# Diagnostic copy of the gold weekly-performance pipeline: every intermediate
# DataFrame is tagged with a literal `source` label and displayed so the stage
# at which rows multiply can be located.
#
# NOTE(review): the joins and groupBys below match on matchup_id / roster_id
# (plus _matchup_week only in the first join) and never on _league_id; the
# bench, highest-scoring and opponent steps do not use _matchup_week at all.
# If those ids repeat across weeks or leagues this would multiply rows --
# presumably the duplicates being debugged here; confirm against the displays.
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col

df_roster_results = spark.read.table('sleeper.silver_model_roster_results').withColumn("source", F.lit("roster"))
df_player_performances = spark.read.table('sleeper.silver_model_player_performances').withColumn("source", F.lit("performance"))

# Stage "joined": roster rows paired with their player performance rows.
df_joined_results = df_roster_results.alias("roster").join(
    df_player_performances.alias("performance"),
    (col("roster.roster_id") == col("performance.roster_id")) &
    (col("roster.matchup_id") == col("performance.matchup_id")) &
    (col("roster._matchup_week") == col("performance._matchup_week"))
).withColumn("source", F.lit("joined"))

# Stage "aggregated": starter / bench point totals and roster_strength
# (starter points plus 20% of bench points).
df_aggregated = df_joined_results.groupBy(
    "roster.matchup_id", "roster.roster_id", "roster.owner_id", "roster.owner_name",
    "roster.is_commissioner", "roster.team_name", "roster.matchup_points", "roster.team_streak",
    "roster.team_record", "roster.team_total_wins", "roster.team_total_losses", "roster.team_total_ties",
    "roster.team_total_fpts", "roster.team_total_fpts_against", "roster.team_waiver_budget_used",
    "roster.team_waiver_position", "roster._league_id", "roster._matchup_week", "roster._year"
).agg(
    F.sum(F.when(col("performance.is_starter"), col("performance.player_points")).otherwise(0)).alias("starter_points"),
    F.sum(F.when(~col("performance.is_starter"), col("performance.player_points")).otherwise(0)).alias("bench_points")
).withColumn(
    "roster_strength", col("starter_points") + (0.2 * col("bench_points"))
).withColumn("source", F.lit("aggregated"))

display(df_aggregated)

# Stage "ranked": rank players by points within matchup_id / roster_id / position.
window_spec = Window.partitionBy("performance.matchup_id", "performance.roster_id", "performance.player_position").orderBy(F.desc("performance.player_points"))
df_ranked = df_joined_results.withColumn("rank", F.rank().over(window_spec)).withColumn("source", F.lit("ranked"))

display(df_ranked)

# Stage "bench_better_than_starters": top-ranked players who were not starters.
df_bench_better_than_starters = df_ranked.filter(
    (col("rank") == 1) & (~col("performance.is_starter"))
).select(
    "performance.matchup_id", "performance.roster_id", "performance.player_id",
    "performance.player_name", "performance.player_position", "performance.player_points"
).withColumn("source", F.lit("bench_better_than_starters"))

display(df_bench_better_than_starters)

# Stage "starters": every starter with position, name and points.
df_starters = df_joined_results.filter(col("performance.is_starter")).select(
    "performance.matchup_id", "performance.roster_id", "performance.player_position",
    col("performance.player_name").alias("starter_player_name"),
    col("performance.player_points").alias("starter_player_points")
).withColumn("source", F.lit("starters"))

display(df_starters)

# Stage "bench_better_than_starters_joined": pair each benched top scorer with
# the starters sharing matchup_id, roster_id and position.
df_bench_better_than_starters = df_bench_better_than_starters.alias("bench").join(
    df_starters.alias("starters"),
    (col("bench.matchup_id") == col("starters.matchup_id")) &
    (col("bench.roster_id") == col("starters.roster_id")) &
    (col("bench.player_position") == col("starters.player_position"))
).select(
    col("bench.matchup_id"), col("bench.roster_id"),
    F.struct(
        col("bench.player_name").alias("benched_player_name"),
        col("bench.player_points").alias("benched_player_points"),
        col("starters.starter_player_name"), col("starters.starter_player_points"),
        (col("bench.player_points") - col("starters.starter_player_points")).alias("point_opportunity_cost")
    ).alias("bench_better_than_starter")
).withColumn("source", F.lit("bench_better_than_starters_joined"))

# Stage "bench_better_than_starters_agg": list of pairings and summed
# opportunity cost per matchup_id / roster_id.
df_bench_better_than_starters_agg = df_bench_better_than_starters.groupBy(
    "matchup_id", "roster_id"
).agg(
    F.collect_list("bench_better_than_starter").alias("bench_better_than_starters"),
    F.sum("bench_better_than_starter.point_opportunity_cost").alias("missed_starter_points")
).withColumn("source", F.lit("bench_better_than_starters_agg"))

display(df_bench_better_than_starters_agg)

# Stage "aggregated_with_bench": left join of the bench aggregate onto the totals.
df_aggregated = df_aggregated.alias("aggregated").join(
    df_bench_better_than_starters_agg.alias("bench_agg"),
    ["matchup_id", "roster_id"], "left"
).withColumn("source", F.lit("aggregated_with_bench"))

display(df_aggregated)

# Stage "highest_scoring": top three rows by points per matchup_id / roster_id.
window_spec_highest_scoring = Window.partitionBy("roster.matchup_id", "roster.roster_id").orderBy(F.desc("performance.player_points"))
df_highest_scoring = df_joined_results.withColumn("rank", F.row_number().over(window_spec_highest_scoring)).filter(col("rank") <= 3).withColumn("source", F.lit("highest_scoring"))

display(df_highest_scoring)

# Stage "highest_scoring_agg": collect those rows into a list of structs.
df_highest_scoring_agg = df_highest_scoring.groupBy("roster.matchup_id", "roster.roster_id").agg(
    F.collect_list(
        F.struct(
            col("performance.player_name").alias("highest_scoring_player_name"),
            col("performance.player_points").alias("highest_scoring_player_points")
        )
    ).alias("highest_scoring_players")
).withColumn("source", F.lit("highest_scoring_agg"))

display(df_highest_scoring_agg)

# Stage "aggregated_with_high_score": left join of the top-scorer list.
df_aggregated = df_aggregated.join(
    df_highest_scoring_agg.alias("high_score_agg"),
    ["matchup_id", "roster_id"], "left"
).withColumn("source", F.lit("aggregated_with_high_score"))

display(df_aggregated)

# Stage "opponent_points": starter points of every roster, keyed by matchup_id.
df_opponent_points = df_aggregated.select(
    col("matchup_id"), col("roster_id").alias("opponent_roster_id"), col("starter_points").alias("opponent_starter_points")
).withColumn("source", F.lit("opponent_points"))

display(df_opponent_points)

# Stage "aggregated_with_opponent": self-join to the other roster(s) sharing
# the matchup_id, then derive couldve_won_with_missed_bench_points (NULL when
# starter points already exceed the opponent's) and manager_efficiency.
df_aggregated = df_aggregated.alias("aggregated").join(
    df_opponent_points.alias("opponent"),
    (col("aggregated.matchup_id") == col("opponent.matchup_id")) &
    (col("aggregated.roster_id") != col("opponent.opponent_roster_id")),
    "left"
).withColumn(
    "couldve_won_with_missed_bench_points",
    F.when(col("starter_points") > col("opponent_starter_points"), None)
    .when((col("starter_points") + col("missed_starter_points")) > col("opponent_starter_points"), True)
    .otherwise(False)
).withColumn(
    "manager_efficiency",
    col("starter_points") / (col("starter_points") + F.coalesce(col("missed_starter_points"), F.lit(0)))
).withColumn("source", F.lit("aggregated_with_opponent"))

display(df_aggregated)

# Stage "final_aggregated": percentile of roster_strength per league and week,
# blended 80/20 with manager_efficiency into week_power_points.
window_spec_percentile = Window.partitionBy("aggregated._league_id", "aggregated._matchup_week").orderBy("roster_strength")
df_aggregated = df_aggregated.withColumn("roster_strength_percentile", F.percent_rank().over(window_spec_percentile)).withColumn(
    "week_power_points",
    (col("roster_strength_percentile") * 0.8) + (col("manager_efficiency") * 0.2)
).withColumn("source", F.lit("final_aggregated"))

display(df_aggregated)

# Final projection; dropDuplicates keeps one arbitrary row per league, roster
# and week, which hides (but does not explain) any duplication introduced above.
df_selected = df_aggregated.select(
    "aggregated._league_id", "aggregated.matchup_id", "aggregated.roster_id", "aggregated.owner_id",
    "aggregated.owner_name", "aggregated.is_commissioner", "aggregated.team_name", "aggregated.team_streak",
    "aggregated.team_record", "aggregated.team_total_wins", "aggregated.team_total_losses", "aggregated.team_total_ties",
    "aggregated.team_total_fpts", "aggregated.team_total_fpts_against", "aggregated.team_waiver_budget_used",
    "aggregated.matchup_points", "starter_points", "bench_points", "bench_better_than_starters",
    "missed_starter_points", "couldve_won_with_missed_bench_points", "highest_scoring_players",
    "roster_strength", "roster_strength_percentile", "manager_efficiency", "week_power_points",
    "aggregated._matchup_week", "aggregated._year"
).dropDuplicates(["_league_id", "roster_id", "_matchup_week"])

display(df_selected)

In [0]:
%sql

-- Preview every column of sleeper.gold_weekly_performance_ranks.
SELECT * FROM sleeper.gold_weekly_performance_ranks

In [0]:
%sql

-- Preview every column of sleeper.gold_weekly_performance_ranks (repeat of the previous cell).
SELECT * FROM sleeper.gold_weekly_performance_ranks

In [0]:
%sql
-- Preview every column of sleeper.gold_power_rankings.
SELECT * FROM sleeper.gold_power_rankings

In [0]:
%sql
-- Starter points, bench points and roster strength per roster result row.
-- Performances are matched on roster, league and week; the LEFT JOIN keeps
-- rosters with no performance rows (their sums are 0 via the ELSE branches).
WITH roster_points AS (
  SELECT
    r.matchup_id,
    r.roster_id,
    r.owner_id,
    r.owner_name,
    r.is_commissioner,
    r.team_name,
    r.matchup_points,
    r.team_streak,
    r.team_record,
    r.team_total_wins,
    r.team_total_losses,
    r.team_total_ties,
    r.team_total_fpts,
    r.team_total_fpts_against,
    r.team_waiver_budget_used,
    r.team_waiver_position,
    r._league_id,
    r._matchup_week,
    r._year,
    SUM(CASE WHEN p.is_starter THEN p.player_points ELSE 0 END) AS starter_points,
    SUM(CASE WHEN NOT p.is_starter THEN p.player_points ELSE 0 END) AS bench_points
  FROM sleeper.silver_model_roster_results r
    LEFT JOIN sleeper.silver_model_player_performances p
      ON r.roster_id = p.roster_id
      AND r._league_id = p._league_id
      AND r._matchup_week = p._matchup_week
  GROUP BY
    r.matchup_id,
    r.roster_id,
    r.owner_id,
    r.owner_name,
    r.is_commissioner,
    r.team_name,
    r.matchup_points,
    r.team_streak,
    r.team_record,
    r.team_total_wins,
    r.team_total_losses,
    r.team_total_ties,
    r.team_total_fpts,
    r.team_total_fpts_against,
    r.team_waiver_budget_used,
    r.team_waiver_position,
    r._league_id,
    r._matchup_week,
    r._year
)
-- Roster strength counts starters fully and the bench at 20%.
SELECT
  roster_points.*,
  starter_points + (0.2 * bench_points) AS roster_strength
FROM roster_points

In [0]:
%sql

-- Preview every column of sleeper.gold_weekly_performance_ranks (input of the power-ranking cell below).
SELECT * FROM sleeper.gold_weekly_performance_ranks

# Build the exponential-decay power ranking method

In [0]:
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import coalesce, col, lit, rank
from pyspark.sql.window import Window

# Exponential-decay blend: each week's power score is a weighted mix of that
# week's performance and the running score carried over from the previous week.
CURRENT_WEEK_WEIGHT = 0.3
PREVIOUS_WEEKS_WEIGHT = 0.7

src = spark.read.table('sleeper.gold_weekly_performance_ranks')

# Add the three output columns as typed NULLs so `src` has the target schema
# even when the target table does not exist yet.
src = src.withColumn('power_rank_points', lit(None).cast(DoubleType())) \
    .withColumn('power_ranking', lit(None).cast(IntegerType())) \
    .withColumn('power_rank_change', lit(None).cast(IntegerType()))

if spark.catalog.tableExists('sleeper.gold_power_rankings'):
    target = spark.read.table('sleeper.gold_power_rankings')
else:
    target = spark.createDataFrame([], src.schema)

# Alias columns in target so the anti-join condition is unambiguous
target_alias = target.select(
    col("_league_id").alias("target_league_id"),
    col("_matchup_week").alias("target_matchup_week"),
    col("roster_id").alias("target_roster_id")
)

# Keep only the (league, week, roster) rows that are not in the target yet
src = src.join(
    target_alias,
    (src["_league_id"] == target_alias["target_league_id"]) &
    (src["_matchup_week"] == target_alias["target_matchup_week"]) &
    (src["roster_id"] == target_alias["target_roster_id"]),
    how="left_anti"
).drop("target_league_id", "target_matchup_week", "target_roster_id")

# Weeks must be processed in ascending order: each week reads the previous
# week's result from `target`, which grows inside the loop.
distinct_league_week = src.select("_league_id", "_matchup_week").distinct().orderBy("_league_id", "_matchup_week")

for row in distinct_league_week.collect():
    league_id = row["_league_id"]
    week = row["_matchup_week"]
    print(f"Processing league {league_id} week {week}")

    temp = src.filter((src["_league_id"] == league_id) & (src["_matchup_week"] == week))

    previous_week_data = target.filter(
        (target["_league_id"] == league_id) & (target["_matchup_week"] == week - 1)
    )
    # Evaluate the existence check once (it is a Spark action) and only fetch a
    # single row instead of counting the whole previous week twice.
    has_previous_week = len(previous_week_data.head(1)) > 0

    if has_previous_week:
        previous_week_data = previous_week_data.select(
            col('roster_id').alias('previous_roster_id'),
            col("power_rank_points").alias('previous_power_rank_points'),
            col("power_ranking").alias('previous_power_ranking')
        )
        temp = temp.join(previous_week_data, temp["roster_id"] == previous_week_data["previous_roster_id"], how="left")
        # A roster with no row in the previous week has no running score to
        # decay; seed it with this week's score (as done for a league's first
        # week) instead of leaving its points NULL.
        temp = temp.withColumn(
            "power_rank_points",
            coalesce(
                (col("week_power_points") * CURRENT_WEEK_WEIGHT)
                + (col("previous_power_rank_points") * PREVIOUS_WEEKS_WEIGHT),
                col("week_power_points")
            )
        )
    else:
        # First processed week for this league: no history to blend with.
        temp = temp \
            .withColumn("power_rank_points", col("week_power_points")) \
            .withColumn("power_rank_change", lit(None))

    # Calculate power_ranking (1 = highest power_rank_points)
    window_spec = Window.partitionBy("_league_id").orderBy(col("power_rank_points").desc())
    temp = temp.withColumn("power_ranking", rank().over(window_spec))

    if has_previous_week:
        # Positive change = moved up the ranking since the previous week.
        temp = temp.withColumn(
            "power_rank_change",
            col('previous_power_ranking') - col('power_ranking')
        )

        temp = temp.drop('previous_power_rank_points', 'previous_roster_id', 'previous_power_ranking')

    # Ensure schemas are compatible: cast shared columns to the target's types
    # in a single projection.
    target_types = dict(target.dtypes)
    temp = temp.select([
        col(name).cast(target_types[name]).alias(name) if name in target_types else col(name)
        for name in temp.columns
    ])

    # Match columns by name: an existing target table may store its columns in
    # a different order than `src`, which a positional union would misalign.
    target = target.unionByName(temp)

display(target)

In [0]:
%sql

-- Destructive reset of the power rankings table. IF EXISTS keeps the cell from
-- failing when the table has already been dropped. NOTE: cells that read
-- sleeper.gold_power_rankings will fail until the table is rebuilt.
DROP TABLE IF EXISTS sleeper.gold_power_rankings

In [0]:
%sql

-- Report columns of the power rankings for matchup week 14.
SELECT
  owner_name, is_commissioner, team_name, team_streak,
  team_total_wins, team_total_losses, team_total_ties,
  team_total_fpts, team_total_fpts_against,
  matchup_points, missed_starter_points,
  bench_better_than_starters,
  couldve_won_with_missed_bench_points,
  highest_scoring_players,
  power_ranking, power_rank_change
FROM sleeper.gold_power_rankings
WHERE _matchup_week = 14

In [0]:
# Run the week-14 power rankings report query through the DataFrame API so its
# result schema can be inspected. The week number is hard-coded in the query.
df = spark.sql("""
SELECT 
  owner_name,
  is_commissioner,
  team_name,
  team_streak,
  team_total_wins,
  team_total_losses,
  team_total_ties,
  team_total_fpts,
  team_total_fpts_against,
  matchup_points,
  missed_starter_points,
  bench_better_than_starters,
  couldve_won_with_missed_bench_points,
  highest_scoring_players,
  power_ranking,
  power_rank_change
FROM sleeper.gold_power_rankings WHERE `_matchup_week` = 14            
""")

# Print the column names and types of the report, including the nested
# struct/array columns.
df.printSchema()