# Min Points Analysis with Percentiles

This notebook tests the enhanced min points analysis that:
1. Uses correct slate_id from draft_groups table
2. Uses month-based path filtering to minimize data loading
3. Calculates percentiles based on lineup_rank (not points directly)

After validation, this logic will be integrated into `min_points_processor.py`

In [6]:
import duckdb
import os
import pandas as pd
from datetime import datetime

# Get Wasabi credentials from environment
wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
wasabi_access_key = os.getenv('WASABI_ACCESS_KEY')
wasabi_secret_key = os.getenv('WASABI_SECRET_KEY')
bucket_name = os.getenv('WASABI_BUCKET_NAME')

# Create DuckDB connection
con = duckdb.connect()

# Configure S3 settings
con.execute(f"""
    SET s3_endpoint='{wasabi_endpoint}';
    SET s3_access_key_id='{wasabi_access_key}';
    SET s3_secret_access_key='{wasabi_secret_key}';
    SET s3_url_style='path';
    SET preserve_insertion_order = false;
    SET enable_progress_bar = true;
""")

print("✓ DuckDB configured with S3 credentials")
print(f"Endpoint: {wasabi_endpoint}")
print(f"Bucket: {bucket_name}")

✓ DuckDB configured with S3 credentials
Endpoint: s3.us-east-2.wasabisys.com
Bucket: dfscrunch-data-lake


In [7]:
# Parameters
sport = 'NFL'
# Test with a specific date range (uses month-based path filtering)
start_date = '2023-09-07'
end_date = '2023-09-10'

# Build date pattern for path filtering (like in min_points_processor.py)
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")

# Since both dates in same month, use simple pattern
if start.year == end.year and start.month == end.month:
    date_pattern = f"{start.year}-{start.month:02d}-*"
else:
    # For multiple months, would need list of patterns
    date_pattern = "*"

# Convert to date_id format for SQL filter
date_id_start = start_date.replace("-", "")
date_id_end = end_date.replace("-", "")

print(f"Sport: {sport}")
print(f"Date range: {start_date} to {end_date}")
print(f"Path pattern: {date_pattern}")
print(f"Date ID range: {date_id_start} to {date_id_end}")

Sport: NFL
Date range: 2023-09-07 to 2023-09-10
Path pattern: 2023-09-*
Date ID range: 20230907 to 20230910


In [8]:
# Build S3 paths with month-based filtering
dds_contests_path = f"s3://{bucket_name}/dds/{sport}/contests/*/{date_pattern}/data.parquet"
dds_draft_groups_path = f"s3://{bucket_name}/dds/{sport}/draft_groups/*/{date_pattern}/data.parquet"
lineups_path = f"s3://{bucket_name}/dds/{sport}/lineups/*/{date_pattern}/data.parquet"

print(f"Contests path: {dds_contests_path}")
print(f"Draft groups path: {dds_draft_groups_path}")
print(f"Lineups path: {lineups_path}")

Contests path: s3://dfscrunch-data-lake/dds/NFL/contests/*/2023-09-*/data.parquet
Draft groups path: s3://dfscrunch-data-lake/dds/NFL/draft_groups/*/2023-09-*/data.parquet
Lineups path: s3://dfscrunch-data-lake/dds/NFL/lineups/*/2023-09-*/data.parquet


## Main Query: Enhanced Min Points with Rank-Based Percentiles

**Key Logic:**
- Percentiles are calculated based on **lineup_rank** (position in contest)
- For example, if a contest has 1000 entries:
  - Top 5% = rank 50 (1000 * 0.05)
  - Top 10% = rank 100 (1000 * 0.10)
  - Top 15% = rank 150 (1000 * 0.15)
- We find the points scored by the lineup at each percentile rank

In [9]:
query = f"""
WITH contests AS (
    SELECT
        contest_id,
        contest_group_id AS draft_group_id,
        cash_line,
        date_id
    FROM read_parquet('{dds_contests_path}', union_by_name=true)
    WHERE is_largest_by_size = TRUE
        AND date_id >= '{date_id_start}'
        AND date_id <= '{date_id_end}'
),
draft_groups AS (
    SELECT
        draft_group_id,
        draft_group_reference_id AS slate_id
    FROM read_parquet('{dds_draft_groups_path}', union_by_name=true)
),
lineups_cashing AS (
    SELECT
        contest_id,
        MIN(points) AS min_points,
        MAX(points) AS max_points,
        MAX(lineup_rank) AS max_lineup_rank
    FROM read_parquet('{lineups_path}', union_by_name=true)
    WHERE is_cashing = TRUE
    GROUP BY contest_id
),
percentile_ranks AS (
    SELECT
        contest_id,
        MAX(lineup_rank) AS total_entries,
        -- Calculate the lineup_rank cutoffs for each percentile
        CAST(CEIL(MAX(lineup_rank) * 0.05) AS INTEGER) AS rank_5_percentile,
        CAST(CEIL(MAX(lineup_rank) * 0.10) AS INTEGER) AS rank_10_percentile,
        CAST(CEIL(MAX(lineup_rank) * 0.15) AS INTEGER) AS rank_15_percentile
    FROM read_parquet('{lineups_path}', union_by_name=true)
    GROUP BY contest_id
),
percentile_points AS (
    SELECT
        l.contest_id,
        -- Get points for lineup at 5th percentile rank
        MAX(CASE WHEN l.lineup_rank = pr.rank_5_percentile THEN l.points END) AS top_5_percentile_points,
        -- Get points for lineup at 10th percentile rank
        MAX(CASE WHEN l.lineup_rank = pr.rank_10_percentile THEN l.points END) AS top_10_percentile_points,
        -- Get points for lineup at 15th percentile rank
        MAX(CASE WHEN l.lineup_rank = pr.rank_15_percentile THEN l.points END) AS top_15_percentile_points
    FROM read_parquet('{lineups_path}', union_by_name=true) l
    INNER JOIN percentile_ranks pr ON l.contest_id = pr.contest_id
    WHERE l.lineup_rank IN (pr.rank_5_percentile, pr.rank_10_percentile, pr.rank_15_percentile)
    GROUP BY l.contest_id
)
SELECT DISTINCT
    draft_groups.slate_id,
    contests.contest_id,
    contests.cash_line,
    contests.date_id,
    lineups_cashing.min_points,
    lineups_cashing.max_points,
    lineups_cashing.max_lineup_rank,
    percentile_points.top_5_percentile_points,
    percentile_points.top_10_percentile_points,
    percentile_points.top_15_percentile_points
FROM contests
LEFT JOIN lineups_cashing ON contests.contest_id = lineups_cashing.contest_id
JOIN draft_groups ON contests.draft_group_id = draft_groups.draft_group_id
LEFT JOIN percentile_points ON contests.contest_id = percentile_points.contest_id
ORDER BY contests.date_id DESC, draft_groups.slate_id, contests.contest_id
"""

print("Executing query...")
result_df = con.execute(query).df()
print("✓ Query completed")

Executing query...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ Query completed


## Display and Validate Results

In [10]:
print(f"Total records: {len(result_df)}")
print(f"\nRecords with min_points: {result_df['min_points'].notna().sum()}")
print(f"Records with percentile data: {result_df['top_10_percentile_points'].notna().sum()}")
print(f"\nSample data:")
result_df.head(10)

Total records: 16

Records with min_points: 16
Records with percentile data: 9

Sample data:


Unnamed: 0,slate_id,contest_id,cash_line,date_id,min_points,max_points,max_lineup_rank,top_5_percentile_points,top_10_percentile_points,top_15_percentile_points
0,89525,147325032,64208,20230910,135.12,229.94,64207,161.94,152.84,146.38
1,89525,147325137,145,20230910,147.76,200.04,145,165.02,156.54,150.92
2,89525,147325139,5620,20230910,144.36,217.72,5620,163.72,155.42,149.34
3,89940,149838753,11535,20230910,146.92,219.34,11535,167.28,158.48,152.1
4,89958,149962988,5805,20230910,62.3,115.66,5803,,87.07,
5,89958,149962989,3,20230910,59.39,77.52,3,77.52,61.78,59.39
6,89958,149962990,32,20230910,61.78,110.06,31,95.32,,65.96
7,89958,149962993,760,20230910,60.18,119.2,760,,,69.02
8,89958,149962995,19175,20230910,61.37,115.8,19173,,,
9,89958,149963015,51,20230910,60.38,110.06,50,93.07,82.18,67.12


In [None]:
# Show full dataframe with better formatting
result_df

## Data Quality Checks

In [11]:
# Check for any issues
print("=" * 60)
print("DATA QUALITY CHECKS")
print("=" * 60)
print(f"1. Null slate_ids: {result_df['slate_id'].isna().sum()}")
print(f"2. Null contest_ids: {result_df['contest_id'].isna().sum()}")
print(f"3. Missing min_points: {result_df['min_points'].isna().sum()}")
print(f"4. Missing percentile data: {result_df['top_10_percentile_points'].isna().sum()}")

# Verify percentile logic (top_5 should be >= top_10 >= top_15 >= min_points)
# Note: min_points is from cashing lineups only, so may be higher than percentiles
invalid_percentiles = result_df[
    (result_df['top_5_percentile_points'].notna()) &
    (result_df['top_10_percentile_points'].notna()) &
    (result_df['top_15_percentile_points'].notna()) &
    ~(
        (result_df['top_5_percentile_points'] >= result_df['top_10_percentile_points']) &
        (result_df['top_10_percentile_points'] >= result_df['top_15_percentile_points'])
    )
]
print(f"\n5. Invalid percentile ordering (5% < 10% or 10% < 15%): {len(invalid_percentiles)}")
if len(invalid_percentiles) > 0:
    print("   WARNING: Some records have invalid percentile ordering!")
    print(invalid_percentiles[['contest_id', 'top_5_percentile_points', 
                                'top_10_percentile_points', 'top_15_percentile_points']])

print("\n" + "=" * 60)

DATA QUALITY CHECKS
1. Null slate_ids: 0
2. Null contest_ids: 0
3. Missing min_points: 0
4. Missing percentile data: 7

5. Invalid percentile ordering (5% < 10% or 10% < 15%): 0

6. Max points should be >= top 5%: 8 / 16



In [12]:
# Show sample of percentile data with rank calculations
print("Percentile Distribution Sample (showing how ranks map to points):")
print("=" * 100)
sample_df = result_df.copy()
sample_df['rank_5pct'] = (sample_df['max_lineup_rank'] * 0.05).apply(lambda x: int(x) if pd.notna(x) else None)
sample_df['rank_10pct'] = (sample_df['max_lineup_rank'] * 0.10).apply(lambda x: int(x) if pd.notna(x) else None)
sample_df['rank_15pct'] = (sample_df['max_lineup_rank'] * 0.15).apply(lambda x: int(x) if pd.notna(x) else None)

sample_cols = ['contest_id', 'max_lineup_rank', 
               'rank_5pct', 'top_5_percentile_points',
               'rank_10pct', 'top_10_percentile_points', 
               'rank_15pct', 'top_15_percentile_points',
               'min_points', 'max_points']
sample_df[sample_cols].head(15)

Percentile Distribution Sample (showing how ranks map to points):


Unnamed: 0,contest_id,max_lineup_rank,rank_5pct,top_5_percentile_points,rank_10pct,top_10_percentile_points,rank_15pct,top_15_percentile_points,min_points,max_points
0,147325032,64207,3210,161.94,6420,152.84,9631,146.38,135.12,229.94
1,147325137,145,7,165.02,14,156.54,21,150.92,147.76,200.04
2,147325139,5620,281,163.72,562,155.42,843,149.34,144.36,217.72
3,149838753,11535,576,167.28,1153,158.48,1730,152.1,146.92,219.34
4,149962988,5803,290,,580,87.07,870,,62.3,115.66
5,149962989,3,0,77.52,0,61.78,0,59.39,59.39,77.52
6,149962990,31,1,95.32,3,,4,65.96,61.78,110.06
7,149962993,760,38,,76,,114,69.02,60.18,119.2
8,149962995,19173,958,,1917,,2875,,61.37,115.8
9,149963015,50,2,93.07,5,82.18,7,67.12,60.38,110.06
