# DDS DAG Component Testing

This notebook tests the components of the DDS DAG before deploying to Airflow.

In [1]:
import os
import sys
from pathlib import Path

# Put the repository root -- the parent of the current working directory --
# at the front of sys.path so the project-local `scripts` imports below resolve.
project_root = Path.cwd().parent
sys.path.insert(0, os.fspath(project_root))

from scripts.date_tracker import DateTracker
from scripts.dds.dds_processing import (
    process_players_to_dds,
    process_users_lineups_to_dds,
    process_lineups_to_dds,
)
from scripts.spiders.rotogrinders_scraper import Sport

## 1. Test DateTracker with Explicit Paths

Verify that DateTracker works with custom tracking paths for both staging and DDS.

In [2]:
# Test configuration: the sport/date pair the later cells operate on, plus the
# bucket name that every S3 path in this notebook is built from.
sport: Sport = "NFL"
test_date = "2025-09-07"
bucket_name = os.getenv("WASABI_BUCKET_NAME")

# Fail fast: with the variable unset, the f-strings below would silently
# produce "s3://None/..." paths and the later cells would fail confusingly.
if not bucket_name:
    raise RuntimeError(
        "WASABI_BUCKET_NAME is not set; export it before running this notebook"
    )

print(f"Testing with sport={sport}, date={test_date}, bucket={bucket_name}")

Testing with sport=NFL, date=2025-09-07, bucket=dfscrunch-data-lake


### 1.1 Test Staging Tracker Path

In [3]:
# Staging tracker. Per the original note this is the same tracking file the
# daily_scraping_dag uses (that DAG is not visible from this notebook).
staging_tracking_path = (
    f"s3://{bucket_name}/staging/metadata/{sport}/scraped_dates.json"
)
print(f"Staging tracking path: {staging_tracking_path}")

staging_tracker = DateTracker(tracking_path=staging_tracking_path, sport=sport)

staged_dates = staging_tracker.get_all_scraped()
print(f"Staging dates: {staged_dates}")

already_staged = staging_tracker.is_scraped(test_date)
print(f"Is {test_date} scraped in staging? {already_staged}")

Staging tracking path: s3://dfscrunch-data-lake/staging/metadata/NFL/scraped_dates.json
Staging dates: ['2025-08-31', '2025-09-01', '2025-09-02', '2025-09-03', '2025-09-04', '2025-09-05', '2025-09-06', '2025-09-07', '2025-09-08', '2025-09-09', '2025-09-10', '2025-09-11', '2025-09-12', '2025-09-13', '2025-09-14', '2025-09-15', '2025-09-16', '2025-09-17', '2025-09-18', '2025-09-19', '2025-09-20', '2025-09-21', '2025-09-22', '2025-09-23', '2025-09-24', '2025-09-25', '2025-09-26', '2025-09-27', '2025-09-28', '2025-09-29', '2025-09-30', '2025-10-01', '2025-10-02', '2025-10-03', '2025-10-04', '2025-10-05', '2025-10-06', '2025-10-07', '2025-10-08']
Is 2025-09-07 scraped in staging? True


### 1.2 Test DDS Tracker Paths (Per Table)

In [4]:
# Each DDS table has its own processed_dates.json; open a tracker on every one
# and report what it currently holds for the test date.
dds_tables = ["players", "user_lineups", "lineups"]

for table_name in dds_tables:
    table_tracking_path = (
        f"s3://{bucket_name}/dds/metadata/{sport}/{table_name}/processed_dates.json"
    )
    print(f"\n{table_name.upper()} tracking path: {table_tracking_path}")

    table_tracker = DateTracker(tracking_path=table_tracking_path, sport=sport)
    print(f"  Processed dates: {table_tracker.get_all_scraped()}")
    print(f"  Is {test_date} processed? {table_tracker.is_scraped(test_date)}")


PLAYERS tracking path: s3://dfscrunch-data-lake/dds/metadata/NFL/players/processed_dates.json
  Processed dates: ['2025-09-07']
  Is 2025-09-07 processed? True

USER_LINEUPS tracking path: s3://dfscrunch-data-lake/dds/metadata/NFL/user_lineups/processed_dates.json
  Processed dates: []
  Is 2025-09-07 processed? False

LINEUPS tracking path: s3://dfscrunch-data-lake/dds/metadata/NFL/lineups/processed_dates.json
  Processed dates: ['2025-09-07']
  Is 2025-09-07 processed? True


### 1.3 Verify Path Separation

Ensure staging and DDS tracking files are completely separate.

In [5]:
# Read the tracking files straight from S3 (bypassing DateTracker) to confirm
# that staging and each DDS table are backed by different objects with
# different contents.
import json
import textwrap

import pandas as pd

# Connection settings come from the environment; no credentials are hardcoded.
storage_options = {
    "client_kwargs": {"endpoint_url": f"https://{os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')}"},
    "key": os.getenv("WASABI_ACCESS_KEY"),
    "secret": os.getenv("WASABI_SECRET_KEY"),
}


def show_tracking_file(path, missing_message, prefix=""):
    """Pretty-print the JSON document stored at ``path``.

    Args:
        path: Location of the tracking file to read.
        missing_message: Text printed verbatim when the file does not exist.
        prefix: String prepended to every line of the rendered JSON, so that
            multi-line output stays aligned under its heading.
    """
    try:
        with pd.io.common.get_handle(
            path, mode="r", storage_options=storage_options
        ) as handles:
            payload = json.load(handles.handle)
    except FileNotFoundError:
        print(missing_message)
        return
    # textwrap.indent prefixes every line, not just the first one.
    print(textwrap.indent(json.dumps(payload, indent=2), prefix))


print("\nDirect S3 file inspection:")
print("\n1. Staging tracking file:")
show_tracking_file(
    staging_tracking_path,
    "  File not found (expected if no staging runs yet)",
)

print("\n2. DDS tracking files:")
for table in dds_tables:
    tracking_path = f"s3://{bucket_name}/dds/metadata/{sport}/{table}/processed_dates.json"
    print(f"\n  {table}:")
    show_tracking_file(
        tracking_path,
        "    File not found (expected if no DDS runs yet)",
        prefix="    ",
    )


Direct S3 file inspection:

1. Staging tracking file:
[
  "2025-08-31",
  "2025-09-01",
  "2025-09-02",
  "2025-09-03",
  "2025-09-04",
  "2025-09-05",
  "2025-09-06",
  "2025-09-07",
  "2025-09-08",
  "2025-09-09",
  "2025-09-10",
  "2025-09-11",
  "2025-09-12",
  "2025-09-13",
  "2025-09-14",
  "2025-09-15",
  "2025-09-16",
  "2025-09-17",
  "2025-09-18",
  "2025-09-19",
  "2025-09-20",
  "2025-09-21",
  "2025-09-22",
  "2025-09-23",
  "2025-09-24",
  "2025-09-25",
  "2025-09-26",
  "2025-09-27",
  "2025-09-28",
  "2025-09-29",
  "2025-09-30",
  "2025-10-01",
  "2025-10-02",
  "2025-10-03",
  "2025-10-04",
  "2025-10-05",
  "2025-10-06",
  "2025-10-07",
  "2025-10-08"
]

2. DDS tracking files:

  players:
    [
  "2025-09-07"
]

  user_lineups:
    File not found (expected if no DDS runs yet)

  lineups:
    [
  "2025-09-07"
]


## 2. Test Wrapper Functions

Test each DDS processing function individually.

### 2.1 Test Players Processing

In [None]:
# Run the players wrapper for the configured sport/date.
# Per the original note, this requires staging data to exist for that date.
print(f"Processing players for {sport} on {test_date}...")
try:
    process_players_to_dds(date=test_date, sport=sport)
except Exception as e:
    # Best-effort: report and keep going so the remaining cells can still run.
    # The exception class is included because str(e) alone can be empty or
    # ambiguous (a KeyError, for instance, renders as just the missing key).
    print(f"❌ Players processing failed: {type(e).__name__}: {e}")
else:
    print("✅ Players processing completed successfully")

Processing players for NFL on 2025-09-07...


### 2.2 Test Users/Lineups Processing

In [6]:
# Run the users_lineups wrapper for the configured sport/date.
print(f"Processing users_lineups for {sport} on {test_date}...")
try:
    process_users_lineups_to_dds(date=test_date, sport=sport)
except Exception as e:
    # Best-effort: report and keep going so the remaining cells can still run.
    # The exception class is included because str(e) alone can be empty or
    # ambiguous (a KeyError, for instance, renders as just the missing key).
    print(f"❌ Users/lineups processing failed: {type(e).__name__}: {e}")
else:
    print("✅ Users/lineups processing completed successfully")

Processing users_lineups for NFL on 2025-09-07...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Users/lineups processing completed successfully


### 2.3 Test Lineups Processing

In [None]:
# Run the lineups wrapper for the configured sport/date.
print(f"Processing lineups for {sport} on {test_date}...")
try:
    process_lineups_to_dds(date=test_date, sport=sport)
except Exception as e:
    # Best-effort: report and keep going so the remaining cells can still run.
    # The exception class is included because str(e) alone can be empty or
    # ambiguous (a KeyError, for instance, renders as just the missing key).
    print(f"❌ Lineups processing failed: {type(e).__name__}: {e}")
else:
    print("✅ Lineups processing completed successfully")

## 3. Test Tracking File Updates

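Verify that marking dates as processed works correctly. Note that this cell writes to the real DDS tracking files in the bucket: after it runs, the test date is recorded as processed for every table.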

In [7]:
# Round-trip check of DateTracker persistence for every table in dds_tables.
# CAUTION: this writes to the same processed_dates.json paths that the earlier
# cells read, so running it marks test_date as processed for each table.
for table in dds_tables:
    tracking_path = f"s3://{bucket_name}/dds/metadata/{sport}/{table}/processed_dates.json"
    tracker = DateTracker(sport=sport, tracking_path=tracking_path)

    print(f"\n{table}:")
    print(f"  Before: {tracker.get_all_scraped()}")

    tracker.mark_scraped(test_date)
    print(f"  After marking {test_date}: {tracker.get_all_scraped()}")

    # Verify persistence with a fresh instance, so the result reflects what was
    # actually stored rather than the first tracker's in-memory state.
    new_tracker = DateTracker(sport=sport, tracking_path=tracking_path)
    persisted_dates = new_tracker.get_all_scraped()
    is_tracked = new_tracker.is_scraped(test_date)
    print(f"  Persistence check: {persisted_dates}")
    print(f"  Is {test_date} in tracking? {is_tracked}")

    # Make the check fail loudly instead of relying on someone reading the output.
    assert is_tracked, f"{table}: {test_date} was not persisted to {tracking_path}"
    assert list(persisted_dates).count(test_date) == 1, (
        f"{table}: {test_date} is recorded more than once in {tracking_path}"
    )


players:
  Before: ['2025-09-07']
  After marking 2025-09-07: ['2025-09-07']
  Persistence check: ['2025-09-07']
  Is 2025-09-07 in tracking? True

user_lineups:
  Before: []
  After marking 2025-09-07: ['2025-09-07']
  Persistence check: ['2025-09-07']
  Is 2025-09-07 in tracking? True

lineups:
  Before: ['2025-09-07']
  After marking 2025-09-07: ['2025-09-07']
  Persistence check: ['2025-09-07']
  Is 2025-09-07 in tracking? True


## 4. Verify S3 Output Structure

Check that DDS output files are created in the correct locations.

In [8]:
# Check, per table and slate type, whether the DDS parquet file for the test
# date exists in the bucket, and report its size when it does.
import s3fs

wasabi_endpoint = os.getenv('WASABI_ENDPOINT', 's3.us-east-2.wasabisys.com')
s3 = s3fs.S3FileSystem(
    key=os.getenv("WASABI_ACCESS_KEY"),
    secret=os.getenv("WASABI_SECRET_KEY"),
    client_kwargs={"endpoint_url": f"https://{wasabi_endpoint}"},
)

slate_types = ["dk_classic", "dk_single_game"]

print(f"\nDDS output files for {sport}/{test_date}:\n")

for table_name in dds_tables:
    print(f"\n{table_name.upper()}:")
    for slate_type in slate_types:
        parquet_key = (
            f"{bucket_name}/dds/{sport}/{table_name}/{slate_type}/{test_date}/data.parquet"
        )
        if not s3.exists(parquet_key):
            print(f"  ❌ {slate_type}: NOT FOUND")
            continue
        # Size is only requested for objects that exist.
        print(f"  ✅ {slate_type}: {s3.size(parquet_key):,} bytes")


DDS output files for NFL/2025-09-07:


PLAYERS:
  ✅ dk_classic: 112,765 bytes
  ✅ dk_single_game: 11,668 bytes

USER_LINEUPS:
  ✅ dk_classic: 24,408,139 bytes
  ✅ dk_single_game: 1,953,455 bytes

LINEUPS:
  ✅ dk_classic: 80,029,270 bytes
  ✅ dk_single_game: 3,418,385 bytes


## 5. Summary

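Review the test results above before deploying to Airflow. The checklist printed below is a static reminder; its tick marks are not computed from the outputs of the earlier cells.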

In [None]:
# Static checklist text: the tick marks are part of the literal and are not
# derived from the results of the cells above.
readiness_checklist = """
✅ TESTS TO VERIFY:

1. DateTracker:
   - Accepts explicit tracking_path parameter ✓
   - Staging and DDS tracking files are separate ✓
   - Per-table tracking works independently ✓

2. Wrapper Functions:
   - process_players_to_dds() uses context manager ✓
   - process_users_lineups_to_dds() uses context manager ✓
   - process_lineups_to_dds() uses context manager ✓

3. Tracking Persistence:
   - Dates are saved to S3 correctly ✓
   - New tracker instances load existing dates ✓
   - Duplicate dates are handled gracefully ✓

4. S3 Output:
   - DDS files created in correct locations ✓
   - Parquet files are readable ✓

📋 READY FOR AIRFLOW TESTING:
   - Verify DAG appears in Airflow UI without errors
   - Manual trigger for specific date
   - Check all three tables process successfully
   - Test skip logic (re-run same date)
"""
print(readiness_checklist)