In [22]:
pip install cfbd

Collecting cfbd
  Using cached cfbd-5.13.2-py3-none-any.whl.metadata (737 bytes)
Collecting pydantic<2,>=1.10.5 (from cfbd)
  Using cached pydantic-1.10.24-cp313-cp313-macosx_11_0_arm64.whl.metadata (154 kB)
Collecting aenum (from cfbd)
  Using cached aenum-3.1.16-py3-none-any.whl.metadata (3.8 kB)
Using cached cfbd-5.13.2-py3-none-any.whl (245 kB)
Using cached pydantic-1.10.24-cp313-cp313-macosx_11_0_arm64.whl (2.4 MB)
Using cached aenum-3.1.16-py3-none-any.whl (165 kB)
Installing collected packages: aenum, pydantic, cfbd
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [cfbd][32m2/3[0m [cfbd]tic]
[1A[2KSuccessfully installed aenum-3.1.16 cfbd-5.13.2 pydantic-1.10.24
Note: you may need to restart the kernel to use updated packages.


In [109]:
import cfbd
import pandas as pd
import numpy as np
import time

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("✓ Imports complete")

✓ Imports complete


In [110]:

class CompleteCFBDataPipeline:
    """
    Complete pipeline to download ALL CFB data and create one comprehensive dataset.
    One row per team per game with all available statistics.
    """
    
    def __init__(self, api_key):
        """Initialize with API key."""
        if not api_key or api_key == "YOUR_API_KEY_HERE":
            raise ValueError(
                "Invalid API key! Please set your actual CFBD API key.\n"
                "Get your key at: https://collegefootballdata.com/key"
            )
        
        self.api_key = api_key
        self.api_call_count = 0
        
        # Configure API
        configuration = cfbd.Configuration()
        configuration.access_token = api_key
        self.api_client = cfbd.ApiClient(configuration)
        
        # Initialize API instances
        self.games_api = cfbd.GamesApi(self.api_client)
        self.stats_api = cfbd.StatsApi(self.api_client)
        self.teams_api = cfbd.TeamsApi(self.api_client)
        self.ratings_api = cfbd.RatingsApi(self.api_client)
        self.recruiting_api = cfbd.RecruitingApi(self.api_client)
        self.betting_api = cfbd.BettingApi(self.api_client)
        self.metrics_api = cfbd.MetricsApi(self.api_client)
        self.plays_api = cfbd.PlaysApi(self.api_client)
        
        print("✓ CFB Data Pipeline initialized")
        print("\nTesting API connection...")
        self._test_api_connection()
    
    def _test_api_connection(self):
        """Test that API key is valid."""
        try:
            # Simple test call
            test = self.teams_api.get_fbs_teams(year=2024)
            print(f"✓ API connection successful! Found {len(test)} FBS teams for 2024")
        except Exception as e:
            if "401" in str(e) or "Unauthorized" in str(e):
                raise ValueError(
                    "\n" + "=" * 70 + "\n"
                    "ERROR: API Key is Invalid or Unauthorized!\n"
                    "=" * 70 + "\n"
                    "Please check:\n"
                    "1. Your API key is correct\n"
                    "2. You have an active subscription at collegefootballdata.com\n"
                    "3. Your API key is properly copied (no extra spaces)\n\n"
                    "Get your API key at: https://collegefootballdata.com/key\n"
                    "=" * 70
                )
            else:
                raise ValueError(f"API connection test failed: {str(e)}")
    
    def track_call(self, name):
        """Track API calls."""
        self.api_call_count += 1
        print(f"  API Call #{self.api_call_count}: {name}")
    
    def get_all_data(self, years=[2024], season_type='regular'):
        """
        Download ALL data and create one comprehensive dataset.
        
        Args:
            years: List of years to download (e.g., [2024] or list(range(2015, 2025)))
            season_type: 'regular', 'postseason', or 'both'
        
        Returns:
            DataFrame with one row per team per game with all stats
        """
        print("\n" + "=" * 70)
        print("COMPLETE CFB DATA DOWNLOAD")
        print("=" * 70)
        print(f"\nYears: {years}")
        print(f"Season type: {season_type}")
        print(f"\nThis will download:")
        print("  ✓ Game results & details")
        print("  ✓ Advanced game statistics (EPA, explosiveness, etc.)")
        print("  ✓ Play-by-play aggregated stats (sacks, turnovers, etc.)")
        print("  ✓ SP+ ratings")
        print("  ✓ FPI ratings")
        print("  ✓ Team talent composite")
        print("  ✓ Recruiting rankings")
        print("  ✓ Betting lines")
        print("  ✓ Pregame win probabilities")
        
        season_types = ['regular', 'postseason'] if season_type == 'both' else [season_type]
        
        # Download all components
        games_df = self._download_games(years, season_types)
        advanced_stats_df = self._download_advanced_stats(years, season_types)
        pbp_stats_df = self._download_play_stats(years, season_types)
        drive_stats_df = self._download_drive_stats(years, season_types)
        adjusted_metrics_df = self._download_adjusted_metrics(years)
        sp_ratings_df = self._download_sp_ratings(years)
        fpi_ratings_df = self._download_fpi_ratings(years)
        talent_df = self._download_talent(years)
        recruiting_df = self._download_recruiting(years)
        betting_df = self._download_betting(years, season_types)
        win_prob_df = self._download_win_probs(years, season_types)
        
        # Merge everything
        final_df = self._merge_all_data(
            games_df, advanced_stats_df, pbp_stats_df, drive_stats_df, 
            adjusted_metrics_df, sp_ratings_df, fpi_ratings_df, talent_df, 
            recruiting_df, betting_df, win_prob_df
        )
        
        print(f"\n" + "=" * 70)
        print("DOWNLOAD COMPLETE!")
        print("=" * 70)
        print(f"✓ Total API calls used: {self.api_call_count}")
        print(f"✓ Final dataset shape: {final_df.shape}")
        print(f"✓ Total games: {len(final_df) / 2:.0f}")
        print(f"✓ Total features: {len(final_df.columns)}")
        
        return final_df
    
    def _download_games(self, years, season_types):
        """Download basic game data."""
        print("\n" + "=" * 70)
        print("1. DOWNLOADING GAME RESULTS")
        print("=" * 70)
        
        all_games = []
        
        for year in years:
            for stype in season_types:
                self.track_call(f"get_games({year}, {stype})")
                games = self.games_api.get_games(year=year, season_type=stype)
                
                for g in games:
                    all_games.append({
                        'game_id': g.id,
                        'season': g.season,
                        'week': g.week,
                        'season_type': g.season_type,
                        'start_date': g.start_date,
                        'neutral_site': g.neutral_site,
                        'conference_game': g.conference_game,
                        'attendance': g.attendance,
                        'venue_id': g.venue_id,
                        'venue': g.venue,
                        'home_id': g.home_id,
                        'home_team': g.home_team,
                        'home_conference': g.home_conference,
                        'home_points': g.home_points,
                        'away_id': g.away_id,
                        'away_team': g.away_team,
                        'away_conference': g.away_conference,
                        'away_points': g.away_points,
                        'excitement_index': g.excitement_index
                    })
                
                print(f"  {year} {stype}: {len(games)} games")
                time.sleep(0.1)
        
        df = pd.DataFrame(all_games)
        print(f"\n✓ Downloaded {len(df)} games")
        return df
    
    def _download_advanced_stats(self, years, season_types):
        """Download advanced game statistics."""
        print("\n" + "=" * 70)
        print("2. DOWNLOADING ADVANCED GAME STATS")
        print("=" * 70)
        
        all_stats = []
        
        for year in years:
            for stype in season_types:
                self.track_call(f"get_advanced_game_stats({year}, {stype})")
                stats = self.stats_api.get_advanced_game_stats(year=year, season_type=stype)
                
                for s in stats:
                    data = {
                        'game_id': s.game_id,
                        'team': s.team,
                        'opponent': s.opponent
                    }
                    
                    # Flatten offense
                    if s.offense:
                        data['off_plays'] = s.offense.plays
                        data['off_drives'] = s.offense.drives
                        data['off_ppa'] = s.offense.ppa
                        data['off_total_ppa'] = s.offense.total_ppa
                        data['off_success_rate'] = s.offense.success_rate
                        data['off_explosiveness'] = s.offense.explosiveness
                        data['off_power_success'] = s.offense.power_success
                        data['off_stuff_rate'] = s.offense.stuff_rate
                        data['off_line_yards'] = s.offense.line_yards
                        data['off_line_yards_total'] = s.offense.line_yards_total
                        data['off_second_level_yards'] = s.offense.second_level_yards
                        data['off_second_level_yards_total'] = s.offense.second_level_yards_total
                        data['off_open_field_yards'] = s.offense.open_field_yards
                        data['off_open_field_yards_total'] = s.offense.open_field_yards_total
                        
                        # Passing
                        if s.offense.passing_plays:
                            data['off_pass_ppa'] = s.offense.passing_plays.ppa
                            data['off_pass_total_ppa'] = s.offense.passing_plays.total_ppa
                            data['off_pass_success_rate'] = s.offense.passing_plays.success_rate
                            data['off_pass_explosiveness'] = s.offense.passing_plays.explosiveness
                        
                        # Rushing
                        if s.offense.rushing_plays:
                            data['off_rush_ppa'] = s.offense.rushing_plays.ppa
                            data['off_rush_total_ppa'] = s.offense.rushing_plays.total_ppa
                            data['off_rush_success_rate'] = s.offense.rushing_plays.success_rate
                            data['off_rush_explosiveness'] = s.offense.rushing_plays.explosiveness
                        
                        # Standard downs
                        if s.offense.standard_downs:
                            data['off_standard_downs_ppa'] = s.offense.standard_downs.ppa
                            data['off_standard_downs_success_rate'] = s.offense.standard_downs.success_rate
                            data['off_standard_downs_explosiveness'] = s.offense.standard_downs.explosiveness
                        
                        # Passing downs
                        if s.offense.passing_downs:
                            data['off_passing_downs_ppa'] = s.offense.passing_downs.ppa
                            data['off_passing_downs_success_rate'] = s.offense.passing_downs.success_rate
                            data['off_passing_downs_explosiveness'] = s.offense.passing_downs.explosiveness
                    
                    # Flatten defense
                    if s.defense:
                        data['def_plays'] = s.defense.plays
                        data['def_drives'] = s.defense.drives
                        data['def_ppa'] = s.defense.ppa
                        data['def_total_ppa'] = s.defense.total_ppa
                        data['def_success_rate'] = s.defense.success_rate
                        data['def_explosiveness'] = s.defense.explosiveness
                        data['def_power_success'] = s.defense.power_success
                        data['def_stuff_rate'] = s.defense.stuff_rate
                        data['def_line_yards'] = s.defense.line_yards
                        data['def_line_yards_total'] = s.defense.line_yards_total
                        data['def_second_level_yards'] = s.defense.second_level_yards
                        data['def_second_level_yards_total'] = s.defense.second_level_yards_total
                        data['def_open_field_yards'] = s.defense.open_field_yards
                        data['def_open_field_yards_total'] = s.defense.open_field_yards_total
                        
                        if s.defense.passing_plays:
                            data['def_pass_ppa'] = s.defense.passing_plays.ppa
                            data['def_pass_total_ppa'] = s.defense.passing_plays.total_ppa
                            data['def_pass_success_rate'] = s.defense.passing_plays.success_rate
                            data['def_pass_explosiveness'] = s.defense.passing_plays.explosiveness
                        
                        if s.defense.rushing_plays:
                            data['def_rush_ppa'] = s.defense.rushing_plays.ppa
                            data['def_rush_total_ppa'] = s.defense.rushing_plays.total_ppa
                            data['def_rush_success_rate'] = s.defense.rushing_plays.success_rate
                            data['def_rush_explosiveness'] = s.defense.rushing_plays.explosiveness
                        
                        if s.defense.standard_downs:
                            data['def_standard_downs_ppa'] = s.defense.standard_downs.ppa
                            data['def_standard_downs_success_rate'] = s.defense.standard_downs.success_rate
                            data['def_standard_downs_explosiveness'] = s.defense.standard_downs.explosiveness
                        
                        if s.defense.passing_downs:
                            data['def_passing_downs_ppa'] = s.defense.passing_downs.ppa
                            data['def_passing_downs_success_rate'] = s.defense.passing_downs.success_rate
                            data['def_passing_downs_explosiveness'] = s.defense.passing_downs.explosiveness
                    
                    all_stats.append(data)
                
                print(f"  {year} {stype}: {len(stats)} team-games")
                time.sleep(0.1)
        
        df = pd.DataFrame(all_stats)
        print(f"\n✓ Downloaded {len(df)} advanced stat records")
        return df
    
    def _download_play_stats(self, years, season_types):
        """Download individual GAME team stats (passing yards, rushing yards, sacks, turnovers per game)."""
        print("\n" + "=" * 70)
        print("3. DOWNLOADING GAME-LEVEL TEAM STATS (Passing, Rushing, Sacks, Turnovers)")
        print("=" * 70)
        
        all_stats = []
        
        for year in years:
            for stype in season_types:
                # Need to loop through weeks (1-16 for regular season)
                max_week = 16 if stype == 'regular' else 5
                
                for week in range(1, max_week + 1):
                    self.track_call(f"get_game_team_stats({year}, week={week}, {stype})")
                    
                    try:
                        game_stats = self.games_api.get_game_team_stats(
                            year=year,
                            week=week,
                            season_type=stype
                        )
                        
                        # Each game_stat is a GameTeamStats object
                        for game in game_stats:
                            # game.teams is a list of GameTeamStatsTeam objects (one per team)
                            if game.teams:
                                for team_data in game.teams:
                                    stat_dict = {
                                        'game_id': game.id,
                                        'team': team_data.team,
                                        'conference': team_data.conference,
                                        'home_away': team_data.home_away,
                                        'points': team_data.points
                                    }
                                    
                                    # Extract all stats from the stats list
                                    if team_data.stats:
                                        for stat in team_data.stats:
                                            # stat.category is like "totalYards", stat.stat is the value
                                            stat_dict[f'game_{stat.category}'] = stat.stat
                                    
                                    all_stats.append(stat_dict)
                        
                        if len(game_stats) > 0:
                            print(f"  {year} {stype} Week {week}: {len(game_stats)} games ({len(game_stats)*2} team-games)")
                        
                    except Exception as e:
                        if "404" not in str(e):  # Don't print for weeks that don't exist
                            print(f"  {year} {stype} Week {week}: Error - {str(e)}")
                    
                    time.sleep(0.1)
        
        if len(all_stats) == 0:
            print(f"\n⚠ No game team stats available")
            return pd.DataFrame()
        
        df = pd.DataFrame(all_stats)
        
        # Drop unnecessary columns for merging
        df = df.drop(columns=['conference', 'home_away', 'points'], errors='ignore')
        
        # Show what stats are available
        stat_cols = [c for c in df.columns if c.startswith('game_')]
        print(f"\n✓ Downloaded individual game team stats")
        print(f"  Team-games: {len(df)}")
        print(f"  Stat categories: {len(stat_cols)}")
        print(f"\n  Sample stats available:")
        for col in sorted(stat_cols)[:25]:
            print(f"    • {col}")
        if len(stat_cols) > 25:
            print(f"    ... and {len(stat_cols) - 25} more")
        
        return df
    
    def _download_drive_stats(self, years, season_types):
        """Download drive-level statistics for field position and drive outcomes."""
        print("\n" + "=" * 70)
        print("4. DOWNLOADING DRIVE-LEVEL DATA (Field Position, Drive Results)")
        print("=" * 70)
        
        drives_api = cfbd.DrivesApi(self.api_client)
        all_drives = []
        
        for year in years:
            for stype in season_types:
                self.track_call(f"get_drives({year}, {stype})")
                
                try:
                    drives = drives_api.get_drives(
                        year=year,
                        season_type=stype
                    )
                    
                    for drive in drives:
                        all_drives.append({
                            'game_id': int(drive.game_id) if hasattr(drive, 'game_id') and drive.game_id else None,  # Use game_id, not id
                            'team': drive.offense,
                            'opponent': drive.defense,
                            'drive_number': drive.drive_number,
                            'start_period': drive.start_period,
                            'start_yardline': drive.start_yardline,
                            'start_yards_to_goal': drive.start_yards_to_goal,
                            'end_period': drive.end_period,
                            'end_yardline': drive.end_yardline,
                            'end_yards_to_goal': drive.end_yards_to_goal,
                            'plays': drive.plays,
                            'yards': drive.yards,
                            'drive_result': drive.drive_result,
                            'elapsed_time': drive.elapsed
                        })
                    
                    print(f"  {year} {stype}: {len(drives)} drives")
                    time.sleep(0.1)
                    
                except Exception as e:
                    print(f"  {year} {stype}: Error - {str(e)}")
        
        if len(all_drives) == 0:
            print(f"\n⚠ No drive data available")
            return pd.DataFrame()
        
        df = pd.DataFrame(all_drives)
        
        # Ensure game_id is integer type
        df['game_id'] = pd.to_numeric(df['game_id'], errors='coerce').astype('Int64')
        
        # Remove rows with null game_id
        df = df[df['game_id'].notna()]
        
        print(f"\nTotal drives downloaded: {len(df)}")
        print(f"Unique games: {df['game_id'].nunique()}")
        print(f"Unique teams: {df['team'].nunique()}")
        
        # Aggregate drive data by game and team
        print(f"Aggregating drive data by game and team...")
        
        # Calculate drive success metrics first (before aggregation)
        df['reached_redzone'] = df['end_yards_to_goal'] <= 20
        df['reached_scoring_zone'] = df['end_yards_to_goal'] <= 40
        df['reached_plus_territory'] = df['end_yards_to_goal'] <= 50
        df['successful_drive'] = df['drive_result'].isin(['TD', 'FG', 'TOUCHDOWN', 'FIELD GOAL'])
        df['scored_td'] = df['drive_result'].isin(['TD', 'TOUCHDOWN'])
        
        # Now aggregate by game_id and team
        drive_agg = df.groupby(['game_id', 'team'], as_index=False).agg({
            # Field position
            'start_yards_to_goal': 'mean',
            
            # Drive outcomes  
            'scored_td': 'sum',
            'plays': 'mean',
            'yards': 'mean',
            'drive_number': 'count',
            
            # Success metrics
            'reached_redzone': 'sum',
            'reached_scoring_zone': 'sum',
            'reached_plus_territory': 'sum',
            'successful_drive': 'sum'
        })
        
        # Rename columns
        drive_agg.columns = [
            'game_id', 'team',
            'avg_start_field_position',
            'touchdowns_scored',
            'avg_plays_per_drive',
            'avg_yards_per_drive',
            'total_drives',
            'drives_to_redzone',
            'drives_to_scoring_zone',
            'drives_to_plus_territory',
            'successful_drives'
        ]
        
        # Calculate rates
        drive_agg['scoring_drive_rate'] = drive_agg['touchdowns_scored'] / drive_agg['total_drives']
        drive_agg['redzone_rate'] = drive_agg['drives_to_redzone'] / drive_agg['total_drives']
        drive_agg['scoring_zone_rate'] = drive_agg['drives_to_scoring_zone'] / drive_agg['total_drives']
        drive_agg['plus_territory_rate'] = drive_agg['drives_to_plus_territory'] / drive_agg['total_drives']
        drive_agg['drive_success_rate'] = drive_agg['successful_drives'] / drive_agg['total_drives']
        
        # Ensure game_id is integer type in final output
        drive_agg['game_id'] = drive_agg['game_id'].astype('Int64')
        
        print(f"\n✓ Aggregated drive data")
        print(f"  Team-games: {len(drive_agg)}")
        print(f"  Games covered: {drive_agg['game_id'].nunique()}")
        print(f"  Teams covered: {drive_agg['team'].nunique()}")
        print(f"  Metrics: {len(drive_agg.columns) - 2}")
        print(f"\n  Sample game_ids: {drive_agg['game_id'].head(3).tolist()}")
        print(f"  Sample teams: {drive_agg['team'].head(3).tolist()}")
        
        return drive_agg
    
    def _download_adjusted_metrics(self, years):
        """Download opponent-adjusted team season metrics (success rates)."""
        print("\n" + "=" * 70)
        print("5. DOWNLOADING ADJUSTED METRICS (Opponent-Adjusted Success Rates)")
        print("=" * 70)
        
        all_metrics = []
        
        for year in years:
            self.track_call(f"get_adjusted_team_season_stats({year})")
            
            try:
                # Use the adjusted_metrics API endpoint
                metrics = self.stats_api.get_advanced_team_season_stats(
                    year=year,
                    exclude_garbage_time=True  # Exclude garbage time for better metrics
                )
                
                for m in metrics:
                    data = {
                        'year': m.season,
                        'team': m.team,
                        'conference': m.conference
                    }
                    
                    # Extract offense metrics
                    if m.offense:
                        data['adj_off_success_rate'] = m.offense.success_rate if hasattr(m.offense, 'success_rate') else None
                        data['adj_off_explosiveness'] = m.offense.explosiveness if hasattr(m.offense, 'explosiveness') else None
                        data['adj_off_ppa'] = m.offense.ppa if hasattr(m.offense, 'ppa') else None
                    
                    # Extract defense metrics  
                    if m.defense:
                        data['adj_def_success_rate'] = m.defense.success_rate if hasattr(m.defense, 'success_rate') else None
                        data['adj_def_explosiveness'] = m.defense.explosiveness if hasattr(m.defense, 'explosiveness') else None
                        data['adj_def_ppa'] = m.defense.ppa if hasattr(m.defense, 'ppa') else None
                    
                    all_metrics.append(data)
                
                print(f"  {year}: {len(metrics)} teams")
                time.sleep(0.1)
                
            except Exception as e:
                print(f"  {year}: Error - {str(e)}")
        
        if len(all_metrics) == 0:
            print(f"\n⚠ No adjusted metrics available")
            return pd.DataFrame()
        
        df = pd.DataFrame(all_metrics)
        print(f"\n✓ Downloaded adjusted metrics for {len(df)} teams")
        return df
    
    def _download_sp_ratings(self, years):
        """Download SP+ ratings."""
        print("\n" + "=" * 70)
        print("4. DOWNLOADING SP+ RATINGS")
        print("=" * 70)
        
        all_ratings = []
        
        for year in years:
            self.track_call(f"get_sp({year})")
            ratings = self.ratings_api.get_sp(year=year)
            
            for r in ratings:
                data = {
                    'year': r.year,
                    'team': r.team,
                    'sp_rating': r.rating,
                    'sp_ranking': r.ranking
                }
                
                if r.offense:
                    data['sp_offense'] = r.offense.rating
                    data['sp_offense_ranking'] = r.offense.ranking
                
                if r.defense:
                    data['sp_defense'] = r.defense.rating
                    data['sp_defense_ranking'] = r.defense.ranking
                
                if r.special_teams:
                    data['sp_special_teams'] = r.special_teams.rating
                
                all_ratings.append(data)
            
            print(f"  {year}: {len(ratings)} teams")
            time.sleep(0.1)
        
        df = pd.DataFrame(all_ratings)
        print(f"\n✓ Downloaded {len(df)} SP+ ratings")
        return df
    
    def _download_fpi_ratings(self, years):
        """Download FPI ratings."""
        print("\n" + "=" * 70)
        print("5. DOWNLOADING FPI RATINGS")
        print("=" * 70)
        
        all_ratings = []
        
        for year in years:
            self.track_call(f"get_fpi({year})")
            
            try:
                ratings = self.ratings_api.get_fpi(year=year)
                
                for r in ratings:
                    data = {
                        'year': r.year,
                        'team': r.team,
                        'fpi': r.fpi
                    }
                    
                    if r.efficiencies:
                        data['fpi_offense'] = r.efficiencies.offense
                        data['fpi_defense'] = r.efficiencies.defense
                        data['fpi_special_teams'] = r.efficiencies.special_teams
                    
                    all_ratings.append(data)
                
                print(f"  {year}: {len(ratings)} teams")
            except Exception as e:
                print(f"  {year}: Not available")
            
            time.sleep(0.1)
        
        df = pd.DataFrame(all_ratings)
        if len(df) > 0:
            print(f"\n✓ Downloaded {len(df)} FPI ratings")
        else:
            print(f"\n⚠ No FPI ratings available")
        return df
    
    def _download_talent(self, years):
        """Download team talent composite."""
        print("\n" + "=" * 70)
        print("6. DOWNLOADING TEAM TALENT RATINGS")
        print("=" * 70)
        
        all_talent = []
        
        for year in years:
            self.track_call(f"get_talent({year})")
            
            try:
                talent = self.teams_api.get_talent(year=year)
                
                for t in talent:
                    all_talent.append({
                        'year': t.year,
                        'team': t.school,
                        'talent': t.talent
                    })
                
                print(f"  {year}: {len(talent)} teams")
            except Exception as e:
                print(f"  {year}: Not available")
            
            time.sleep(0.1)
        
        df = pd.DataFrame(all_talent)
        if len(df) > 0:
            print(f"\n✓ Downloaded {len(df)} talent ratings")
        else:
            print(f"\n⚠ No talent ratings available")
        return df
    
    def _download_recruiting(self, years):
        """Download recruiting rankings."""
        print("\n" + "=" * 70)
        print("7. DOWNLOADING RECRUITING RANKINGS")
        print("=" * 70)
        
        all_recruiting = []
        
        for year in years:
            self.track_call(f"get_team_recruiting_rankings({year})")
            rankings = self.recruiting_api.get_team_recruiting_rankings(year=year)
            
            for r in rankings:
                all_recruiting.append({
                    'year': r.year,
                    'team': r.team,
                    'recruiting_rank': r.rank,
                    'recruiting_points': r.points
                })
            
            print(f"  {year}: {len(rankings)} teams")
            time.sleep(0.1)
        
        df = pd.DataFrame(all_recruiting)
        print(f"\n✓ Downloaded {len(df)} recruiting rankings")
        return df
    
    def _download_betting(self, years, season_types):
        """Download betting lines."""
        print("\n" + "=" * 70)
        print("8. DOWNLOADING BETTING LINES")
        print("=" * 70)
        
        all_lines = []
        
        for year in years:
            for stype in season_types:
                self.track_call(f"get_lines({year}, {stype})")
                lines = self.betting_api.get_lines(year=year, season_type=stype)
                
                for game in lines:
                    if game.lines:
                        # Average across all providers
                        spreads = [l.spread for l in game.lines if l.spread is not None]
                        totals = [l.over_under for l in game.lines if l.over_under is not None]
                        
                        all_lines.append({
                            'game_id': game.id,
                            'betting_spread': np.mean(spreads) if spreads else None,
                            'betting_total': np.mean(totals) if totals else None
                        })
                
                print(f"  {year} {stype}: {len(lines)} games")
                time.sleep(0.1)
        
        df = pd.DataFrame(all_lines)
        print(f"\n✓ Downloaded betting lines for {len(df)} games")
        return df
    
    def _download_win_probs(self, years, season_types):
        """Download pregame win probabilities."""
        print("\n" + "=" * 70)
        print("9. DOWNLOADING PREGAME WIN PROBABILITIES")
        print("=" * 70)
        
        all_probs = []
        
        for year in years:
            for stype in season_types:
                self.track_call(f"get_pregame_win_probabilities({year}, {stype})")
                
                try:
                    probs = self.metrics_api.get_pregame_win_probabilities(
                        year=year, season_type=stype
                    )
                    
                    for p in probs:
                        all_probs.append({
                            'game_id': p.game_id,
                            'home_win_prob': p.home_win_prob,
                            'spread': p.spread
                        })
                    
                    print(f"  {year} {stype}: {len(probs)} games")
                except Exception as e:
                    print(f"  {year} {stype}: Not available")
                
                time.sleep(0.1)
        
        df = pd.DataFrame(all_probs)
        if len(df) > 0:
            print(f"\n✓ Downloaded win probabilities for {len(df)} games")
        else:
            print(f"\n⚠ No win probabilities available")
        return df
    
    def _merge_all_data(self, games_df, advanced_stats_df, pbp_stats_df, drive_stats_df,
                       adjusted_metrics_df, sp_ratings_df, fpi_ratings_df, talent_df, 
                       recruiting_df, betting_df, win_prob_df):
        """Merge all datasets into one comprehensive dataset."""
        print("\n" + "=" * 70)
        print("MERGING ALL DATA")
        print("=" * 70)
        
        # Create base dataset (2 rows per game - home and away)
        print("\n1. Creating base dataset...")
        
        home_df = games_df.copy()
        home_df['team'] = home_df['home_team']
        home_df['opponent'] = home_df['away_team']
        home_df['team_points'] = home_df['home_points']
        home_df['opponent_points'] = home_df['away_points']
        home_df['is_home'] = 1
        
        away_df = games_df.copy()
        away_df['team'] = away_df['away_team']
        away_df['opponent'] = away_df['home_team']
        away_df['team_points'] = away_df['away_points']
        away_df['opponent_points'] = away_df['home_points']
        away_df['is_home'] = 0
        
        df = pd.concat([home_df, away_df], ignore_index=True)
        
        # Add derived columns
        df['point_differential'] = df['team_points'] - df['opponent_points']
        df['win'] = (df['point_differential'] > 0).astype(int)
        df['date'] = pd.to_datetime(df['start_date'])
        df['month'] = df['date'].dt.month
        df['day_of_week'] = df['date'].dt.dayofweek
        
        print(f"  Base: {len(df)} team-game records")
        
        # Merge advanced stats
        print("\n2. Merging advanced stats...")
        if len(advanced_stats_df) > 0:
            # Drop opponent column to avoid conflicts
            advanced_stats_df = advanced_stats_df.drop(columns=['opponent'], errors='ignore')
            df = df.merge(advanced_stats_df, on=['game_id', 'team'], how='left')
            print(f"  ✓ Added {len([c for c in advanced_stats_df.columns if c not in ['game_id', 'team']])} columns")
        
        # Merge play-by-play stats
        print("\n3. Merging individual game stats...")
        if len(pbp_stats_df) > 0:
            # These are game-level stats, merge on game_id and team
            pbp_stats_df = pbp_stats_df.drop(columns=['opponent'], errors='ignore')
            df = df.merge(pbp_stats_df, on=['game_id', 'team'], how='left')
            
            stat_cols = [c for c in pbp_stats_df.columns if c.startswith('game_')]
            print(f"  ✓ Added {len(stat_cols)} game stat columns")
        
        # Merge drive stats
        print("\n4. Merging drive-level stats...")
        if len(drive_stats_df) > 0:
            # Print diagnostic info
            print(f"  Drive stats before merge: {len(drive_stats_df)} rows")
            print(f"  Base data: {len(df)} rows")
            print(f"  Sample drive teams: {drive_stats_df['team'].head(3).tolist()}")
            print(f"  Sample base teams: {df['team'].head(3).tolist()}")
            
            df = df.merge(drive_stats_df, on=['game_id', 'team'], how='left')
            
            drive_cols = [c for c in drive_stats_df.columns if c not in ['game_id', 'team']]
            
            # Check how many merged successfully
            merged_count = df[drive_cols[0]].notna().sum() if len(drive_cols) > 0 else 0
            print(f"  ✓ Added {len(drive_cols)} drive stat columns")
            print(f"  ✓ Matched {merged_count}/{len(df)} rows")
        
        # Merge adjusted metrics
        print("\n5. Merging adjusted metrics...")
        if len(adjusted_metrics_df) > 0:
            # Team adjusted metrics
            team_adj = adjusted_metrics_df.drop(columns=['year', 'conference'], errors='ignore').copy()
            team_adj.columns = ['team'] + [f'team_{col}' for col in team_adj.columns[1:]]
            df = df.merge(team_adj, on='team', how='left')
            
            # Opponent adjusted metrics
            opp_adj = adjusted_metrics_df.drop(columns=['year', 'conference'], errors='ignore').copy()
            opp_adj.columns = ['opponent'] + [f'opp_{col}' for col in opp_adj.columns[1:]]
            df = df.merge(opp_adj, on='opponent', how='left')
            
            print(f"  ✓ Added opponent-adjusted success rates")
        
        # Merge SP+ ratings
        print("\n6. Merging SP+ ratings...")
        if len(sp_ratings_df) > 0:
            # Team SP+
            team_sp = sp_ratings_df.drop(columns=['year'], errors='ignore').copy()
            team_sp.columns = ['team'] + [f'team_{col}' for col in team_sp.columns[1:]]
            df = df.merge(team_sp, on='team', how='left')
            
            # Opponent SP+
            opp_sp = sp_ratings_df.drop(columns=['year'], errors='ignore').copy()
            opp_sp.columns = ['opponent'] + [f'opp_{col}' for col in opp_sp.columns[1:]]
            df = df.merge(opp_sp, on='opponent', how='left')
            
            # SP+ differential
            if 'team_sp_rating' in df.columns and 'opp_sp_rating' in df.columns:
                df['sp_rating_diff'] = df['team_sp_rating'] - df['opp_sp_rating']
            
            print(f"  ✓ Added SP+ ratings")
        
        # Merge FPI ratings
        print("\n7. Merging FPI ratings...")
        if len(fpi_ratings_df) > 0:
            team_fpi = fpi_ratings_df.drop(columns=['year'], errors='ignore').copy()
            team_fpi.columns = ['team'] + [f'team_{col}' for col in team_fpi.columns[1:]]
            df = df.merge(team_fpi, on='team', how='left')
            
            opp_fpi = fpi_ratings_df.drop(columns=['year'], errors='ignore').copy()
            opp_fpi.columns = ['opponent'] + [f'opp_{col}' for col in opp_fpi.columns[1:]]
            df = df.merge(opp_fpi, on='opponent', how='left')
            
            print(f"  ✓ Added FPI ratings")
        
        # Merge talent
        print("\n8. Merging talent ratings...")
        if len(talent_df) > 0:
            team_talent = talent_df.drop(columns=['year'], errors='ignore').copy()
            team_talent.columns = ['team', 'team_talent']
            df = df.merge(team_talent, on='team', how='left')
            
            opp_talent = talent_df.drop(columns=['year'], errors='ignore').copy()
            opp_talent.columns = ['opponent', 'opp_talent']
            df = df.merge(opp_talent, on='opponent', how='left')
            
            print(f"  ✓ Added talent ratings")
        
        # Merge recruiting
        print("\n9. Merging recruiting rankings...")
        if len(recruiting_df) > 0:
            team_rec = recruiting_df.drop(columns=['year'], errors='ignore').copy()
            team_rec.columns = ['team'] + [f'team_{col}' for col in team_rec.columns[1:]]
            df = df.merge(team_rec, on='team', how='left')
            
            opp_rec = recruiting_df.drop(columns=['year'], errors='ignore').copy()
            opp_rec.columns = ['opponent'] + [f'opp_{col}' for col in opp_rec.columns[1:]]
            df = df.merge(opp_rec, on='opponent', how='left')
            
            if 'team_recruiting_rank' in df.columns and 'opp_recruiting_rank' in df.columns:
                df['recruiting_rank_diff'] = df['opp_recruiting_rank'] - df['team_recruiting_rank']
            
            print(f"  ✓ Added recruiting rankings")
        
        # Merge betting lines
        print("\n10. Merging betting lines...")
        if len(betting_df) > 0:
            df = df.merge(betting_df, on='game_id', how='left')
            
            # Adjust spread for away teams
            df['team_spread'] = np.where(
                df['is_home'] == 1,
                df['betting_spread'],
                -df['betting_spread']
            )
            
            # Implied win probability from spread
            df['implied_win_prob'] = 1 / (1 + np.exp(df['team_spread'] * 0.04))
            
            print(f"  ✓ Added betting lines")
        
        # Merge win probabilities
        print("\n11. Merging win probabilities...")
        if len(win_prob_df) > 0:
            df = df.merge(win_prob_df[['game_id', 'home_win_prob']], on='game_id', how='left')
            
            # Adjust for away teams
            df['pregame_win_prob'] = np.where(
                df['is_home'] == 1,
                df['home_win_prob'],
                1 - df['home_win_prob']
            )
            df = df.drop(columns=['home_win_prob'], errors='ignore')
            
            print(f"  ✓ Added win probabilities")
        
        # Sort by date
        df = df.sort_values(['season', 'week', 'date']).reset_index(drop=True)
        
        print(f"\n✓ Merge complete!")
        print(f"  Final shape: {df.shape}")
        
        return df


# ============================================================
# USAGE
# ============================================================

def run_pipeline(api_key, years=[2024], season_type='regular', save_to_csv=True):
    """
    Run the complete pipeline.
    
    Args:
        api_key: Your CFBD API key
        years: List of years (e.g., [2024] or list(range(2015, 2025)))
        season_type: 'regular', 'postseason', or 'both'
        save_to_csv: Whether to save output to CSV
    
    Returns:
        Complete DataFrame
    """
    pipeline = CompleteCFBDataPipeline(api_key)
    
    df = pipeline.get_all_data(years=years, season_type=season_type)
    
    # Show summary
    print("\n" + "=" * 70)
    print("FINAL DATASET SUMMARY")
    print("=" * 70)
    
    print(f"\nDataset shape: {df.shape}")
    print(f"  Rows (team-games): {len(df)}")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Actual games: {len(df) / 2:.0f}")
    print(f"  Seasons: {sorted(df['season'].unique())}")
    print(f"  Weeks: {sorted(df['week'].unique())}")
    print(f"  Teams: {df['team'].nunique()}")
    
    print("\n" + "Feature breakdown:")
    
    # Count feature types
    game_info = ['game_id', 'season', 'week', 'date', 'team', 'opponent', 'venue', 'month', 'day_of_week']
    outcomes = ['team_points', 'opponent_points', 'point_differential', 'win']
    context = ['is_home', 'neutral_site', 'conference_game', 'attendance']
    adv_stats = [c for c in df.columns if c.startswith('off_') or c.startswith('def_')]
    game_stats = [c for c in df.columns if c.startswith('game_')]
    drive_stats = [c for c in df.columns if any(x in c for x in ['drive', 'redzone', 'scoring_zone', 'field_position'])]
    adjusted_metrics = [c for c in df.columns if 'adj_' in c]
    ratings = [c for c in df.columns if any(x in c for x in ['sp_', 'fpi', 'talent'])]
    recruiting = [c for c in df.columns if 'recruiting' in c]
    betting = [c for c in df.columns if any(x in c for x in ['spread', 'total', 'prob'])]
    
    print(f"  Game info: {len([c for c in game_info if c in df.columns])} columns")
    print(f"  Outcomes: {len([c for c in outcomes if c in df.columns])} columns")
    print(f"  Context: {len([c for c in context if c in df.columns])} columns")
    print(f"  Advanced stats (EPA, success rates): {len(adv_stats)} columns")
    print(f"  Game stats (passing yards, rushing yards, sacks, etc.): {len(game_stats)} columns")
    print(f"  Drive stats (field position, drive success): {len(drive_stats)} columns")
    print(f"  Adjusted metrics (opponent-adjusted): {len(adjusted_metrics)} columns")
    print(f"  Ratings (SP+, FPI, Talent): {len(ratings)} columns")
    print(f"  Recruiting: {len(recruiting)} columns")
    print(f"  Betting lines: {len(betting)} columns")
    
    # Show sample
    print("\n" + "Sample data (first 5 rows):")
    print("=" * 70)
    sample_cols = ['team', 'opponent', 'week', 'team_points', 'opponent_points', 
                   'win', 'is_home', 'team_sp_rating', 'opp_sp_rating']
    sample_cols = [c for c in sample_cols if c in df.columns]
    print(df[sample_cols].head())
    
    # Show what play-by-play stats are available
    if len(game_stats) > 0:
        print("\n" + "Available individual game statistics:")
        print("=" * 70)
        for stat in sorted(game_stats)[:40]:  # Show first 40
            print(f"  • {stat}")
        if len(game_stats) > 40:
            print(f"  ... and {len(game_stats) - 40} more")
    
    # Check for missing data
    print("\n" + "Data completeness:")
    print("=" * 70)
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).sort_values(ascending=False)
    high_missing = missing_pct[missing_pct > 10].head(10)
    
    if len(high_missing) > 0:
        print("  Columns with >10% missing:")
        for col, pct in high_missing.items():
            print(f"    • {col}: {pct:.1f}% missing")
    else:
        print("  ✓ All major columns have <10% missing data")
    
    # Save to CSV
    if save_to_csv:
        filename = f"cfb_complete_data_{min(years)}_to_{max(years)}.csv"
        df.to_csv(filename, index=False)
        print(f"\n✓ Saved to {filename}")
    
    return df



In [111]:
# Replace with your actual API key
API_KEY = "OPHcuzA3Pay3vlBQ+FjanV5j0/XHbJmYCrYF5tPW5yKLFXmpbzs0Ug5BbKwQATJ6"

print("✓ API key set")

✓ API key set


In [112]:
# This will test your API connection
pipeline = CompleteCFBDataPipeline(api_key=API_KEY)

✓ CFB Data Pipeline initialized

Testing API connection...
✓ API connection successful! Found 134 FBS teams for 2024


In [117]:
df = run_pipeline(
    api_key=API_KEY,
    years=[2024],
    season_type='regular',
    save_to_csv=True
)

✓ CFB Data Pipeline initialized

Testing API connection...
✓ API connection successful! Found 134 FBS teams for 2024

COMPLETE CFB DATA DOWNLOAD

Years: [2024]
Season type: regular

This will download:
  ✓ Game results & details
  ✓ Advanced game statistics (EPA, explosiveness, etc.)
  ✓ Play-by-play aggregated stats (sacks, turnovers, etc.)
  ✓ SP+ ratings
  ✓ FPI ratings
  ✓ Team talent composite
  ✓ Recruiting rankings
  ✓ Betting lines
  ✓ Pregame win probabilities

1. DOWNLOADING GAME RESULTS
  API Call #1: get_games(2024, regular)
  2024 regular: 3747 games

✓ Downloaded 3747 games

2. DOWNLOADING ADVANCED GAME STATS
  API Call #2: get_advanced_game_stats(2024, regular)
  2024 regular: 3114 team-games

✓ Downloaded 3114 advanced stat records

3. DOWNLOADING GAME-LEVEL TEAM STATS (Passing, Rushing, Sacks, Turnovers)
  API Call #3: get_game_team_stats(2024, week=1, regular)
  2024 regular Week 1: 137 games (274 team-games)
  API Call #4: get_game_team_stats(2024, week=2, regular)
 

In [130]:
print(df.head())

     game_id  season  week season_type                 start_date  \
0  401693677    2024     1     regular  2024-08-24 04:00:00+00:00   
1  401693677    2024     1     regular  2024-08-24 04:00:00+00:00   
2  401635525    2024     1     regular  2024-08-24 16:00:00+00:00   
3  401635525    2024     1     regular  2024-08-24 16:00:00+00:00   
4  401654665    2024     1     regular  2024-08-24 19:30:00+00:00   

   neutral_site  conference_game  attendance  venue_id  \
0          True            False         NaN       NaN   
1          True            False         NaN       NaN   
2          True             True     47998.0    3504.0   
3          True             True     47998.0    3504.0   
4         False            False     16125.0    3827.0   

                                 venue  home_id       home_team  \
0                                  NaN   124179    Lincoln (CA)   
1                                  NaN   124179    Lincoln (CA)   
2                        Aviva Stad

In [127]:
%pip install seaborn
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [131]:
"""
CFB Power Rankings & Win Prediction Framework

Phase 1: Identify Most Important Variables
Phase 2: Create Preseason Rankings  
Phase 3: Build Season-Cumulative Features
Phase 4: Quantify SOS and Team Rankings
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================================================
# PHASE 0: DATA PREPARATION
# ============================================================================

def filter_games(df):
    """
    Filter to keep FBS vs FBS and FBS vs FCS games.
    Drop FCS vs FCS, and games vs lower divisions.
    """
    print("=" * 70)
    print("FILTERING GAMES")
    print("=" * 70)
    
    # Identify FBS teams (those with SP+ ratings)
    fbs_teams = df[df['team_sp_rating'].notna()]['team'].unique()
    
    print(f"\nFBS teams identified: {len(fbs_teams)}")
    
    # Keep games where at least one team is FBS
    df_filtered = df[
        (df['team'].isin(fbs_teams)) | 
        (df['opponent'].isin(fbs_teams))
    ].copy()
    
    # Classify game types
    df_filtered['game_type'] = 'Unknown'
    df_filtered.loc[
        (df_filtered['team'].isin(fbs_teams)) & 
        (df_filtered['opponent'].isin(fbs_teams)), 
        'game_type'
    ] = 'FBS vs FBS'
    
    df_filtered.loc[
        ((df_filtered['team'].isin(fbs_teams)) & (~df_filtered['opponent'].isin(fbs_teams))) |
        ((~df_filtered['team'].isin(fbs_teams)) & (df_filtered['opponent'].isin(fbs_teams))),
        'game_type'
    ] = 'FBS vs FCS'
    
    print(f"\nOriginal games: {len(df)}")
    print(f"After filter: {len(df_filtered)}")
    print(f"\nGame type breakdown:")
    print(df_filtered['game_type'].value_counts())
    
    return df_filtered, fbs_teams


def create_fcs_rankings(df, fbs_teams):
    """
    Create simple power rankings for FCS teams based on performance vs FBS.
    """
    print("\n" + "=" * 70)
    print("CREATING FCS RANKINGS")
    print("=" * 70)
    
    # Get FCS teams
    fcs_teams = df[~df['team'].isin(fbs_teams)]['team'].unique()
    
    print(f"\nFCS teams: {len(fcs_teams)}")
    
    fcs_rankings = []
    
    for team in fcs_teams:
        # Get games where FCS team played FBS opponent
        team_games = df[
            (df['team'] == team) & 
            (df['opponent'].isin(fbs_teams))
        ]
        
        if len(team_games) == 0:
            continue
        
        # Calculate metrics
        games_played = len(team_games)
        wins = team_games['win'].sum()
        win_pct = wins / games_played if games_played > 0 else 0
        avg_point_diff = team_games['point_differential'].mean()
        avg_opp_sp_rating = team_games['opp_sp_rating'].mean()
        
        # Simple FCS rating formula
        # Base on performance vs FBS, adjusted for opponent quality
        fcs_rating = (
            (win_pct * 100) +  # Win % weight
            (avg_point_diff / 2) +  # Point differential
            (avg_opp_sp_rating * 0.1)  # Opponent strength
        )
        
        fcs_rankings.append({
            'team': team,
            'fcs_rating': fcs_rating,
            'fcs_games_vs_fbs': games_played,
            'fcs_wins_vs_fbs': wins,
            'fcs_win_pct_vs_fbs': win_pct,
            'fcs_avg_point_diff': avg_point_diff
        })
    
    fcs_df = pd.DataFrame(fcs_rankings)
    fcs_df['fcs_rank'] = fcs_df['fcs_rating'].rank(ascending=False)
    
    print(f"\nTop 10 FCS teams:")
    print(fcs_df.nsmallest(10, 'fcs_rank')[
        ['team', 'fcs_rank', 'fcs_rating', 'fcs_games_vs_fbs', 'fcs_wins_vs_fbs']
    ])
    
    return fcs_df


# ============================================================================
# PHASE 1: IDENTIFY MOST IMPORTANT VARIABLES FOR WINNING
# ============================================================================

def analyze_win_predictors(df):
    """
    Use multiple methods to identify which stats are most predictive of wins.
    """
    print("\n" + "=" * 70)
    print("PHASE 1: ANALYZING WIN PREDICTORS")
    print("=" * 70)
    
    # Filter to complete cases (FBS vs FBS with all stats)
    df_analysis = df[
        (df['game_type'] == 'FBS vs FBS') &
        (df['team_points'].notna()) &
        (df['off_ppa'].notna())
    ].copy()
    
    print(f"\nAnalyzing {len(df_analysis)} FBS vs FBS games")
    
    # Define potential predictor variables
    predictor_categories = {
        'efficiency': [
            'off_ppa', 'def_ppa', 'off_success_rate', 'def_success_rate',
            'off_explosiveness', 'def_explosiveness'
        ],
        'traditional_stats': [
            'game_totalYards', 'game_netPassingYards', 'game_rushingYards',
            'game_turnovers', 'game_sacks', 'game_thirdDownEff'
        ],
        'situational': [
            'off_standard_downs_success_rate', 'off_passing_downs_success_rate',
            'off_stuff_rate', 'off_power_success'
        ],
        'field_position': [
            'avg_start_field_position', 'redzone_rate', 'scoring_zone_rate',
            'drive_success_rate'
        ],
        'ratings': [
            'team_sp_rating', 'team_sp_offense', 'team_sp_defense',
            'team_fpi', 'team_recruiting_rank'
        ]
    }
    
    # Flatten all predictors
    all_predictors = []
    for category in predictor_categories.values():
        all_predictors.extend(category)
    
    # Filter to available columns
    available_predictors = [col for col in all_predictors if col in df_analysis.columns]
    
    print(f"Available predictors: {len(available_predictors)}")
    
    # Prepare data
    X = df_analysis[available_predictors].fillna(df_analysis[available_predictors].median())
    y = df_analysis['win']
    
    # Method 1: Random Forest Feature Importance
    print("\n1. Random Forest Feature Importance:")
    print("-" * 60)
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
    rf.fit(X, y)
    
    feature_importance = pd.DataFrame({
        'feature': available_predictors,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 15 Most Important Features (Random Forest):")
    for idx, row in feature_importance.head(15).iterrows():
        print(f"  {row['feature']:40s} {row['importance']:.4f}")
    
    # Method 2: Correlation with winning
    print("\n\n2. Correlation with Winning:")
    print("-" * 60)
    
    correlations = pd.DataFrame({
        'feature': available_predictors,
        'correlation': [X[col].corr(y) for col in available_predictors]
    }).sort_values('correlation', ascending=False, key=abs)
    
    print("\nTop 15 Features by Correlation:")
    for idx, row in correlations.head(15).iterrows():
        print(f"  {row['feature']:40s} {row['correlation']:+.4f}")
    
    # Method 3: Model Performance by Category
    print("\n\n3. Model Performance by Feature Category:")
    print("-" * 60)
    
    category_performance = {}
    
    for category_name, features in predictor_categories.items():
        available_features = [f for f in features if f in available_predictors]
        
        if len(available_features) == 0:
            continue
        
        X_cat = X[available_features]
        
        lr = LogisticRegression(max_iter=1000, random_state=42)
        lr.fit(X_cat, y)
        
        y_pred = lr.predict(X_cat)
        y_pred_proba = lr.predict_proba(X_cat)[:, 1]
        
        accuracy = accuracy_score(y, y_pred)
        auc = roc_auc_score(y, y_pred_proba)
        
        category_performance[category_name] = {
            'accuracy': accuracy,
            'auc': auc,
            'n_features': len(available_features)
        }
        
        print(f"\n  {category_name:20s} Accuracy: {accuracy:.3f}  AUC: {auc:.3f}  ({len(available_features)} features)")
    
    # Summary recommendations
    print("\n" + "=" * 70)
    print("RECOMMENDATIONS FOR WIN PREDICTION")
    print("=" * 70)
    
    top_features = feature_importance.head(10)['feature'].tolist()
    
    print("\n✓ Top 10 Features for Win Prediction:")
    for i, feat in enumerate(top_features, 1):
        print(f"  {i:2d}. {feat}")
    
    return {
        'feature_importance': feature_importance,
        'correlations': correlations,
        'category_performance': category_performance,
        'top_features': top_features
    }


# ============================================================================
# PHASE 2: CREATE PRESEASON RANKINGS
# ============================================================================

def create_preseason_rankings(df, year=2024):
    """
    Create preseason rankings using:
    - Previous season performance
    - Recruiting rankings
    - Returning production (when available)
    - SP+ ratings
    """
    print("\n" + "=" * 70)
    print(f"PHASE 2: CREATING PRESEASON RANKINGS FOR {year}")
    print("=" * 70)
    
    # Get previous season final rankings
    prev_year = year - 1
    prev_season = df[df['season'] == prev_year].copy()
    
    # Calculate end-of-season metrics for each team
    team_metrics = prev_season.groupby('team').agg({
        'win': 'sum',
        'game_id': 'count',
        'point_differential': 'sum',
        'off_ppa': 'mean',
        'def_ppa': 'mean',
        'team_sp_rating': 'last',
        'team_recruiting_rank': 'last'
    }).reset_index()
    
    team_metrics.columns = [
        'team', 'prev_wins', 'prev_games', 'prev_point_diff',
        'prev_off_ppa', 'prev_def_ppa', 'prev_sp_rating', 'recruiting_rank'
    ]
    
    # Calculate win percentage
    team_metrics['prev_win_pct'] = team_metrics['prev_wins'] / team_metrics['prev_games']
    
    # Create composite preseason rating
    # Normalize each component
    team_metrics['win_score'] = (team_metrics['prev_win_pct'] - 0.5) * 100
    team_metrics['sp_score'] = team_metrics['prev_sp_rating'].fillna(0)
    team_metrics['recruiting_score'] = -(team_metrics['recruiting_rank'].fillna(100) - 50)
    
    # Weighted preseason score
    team_metrics['preseason_rating'] = (
        team_metrics['win_score'] * 0.3 +
        team_metrics['sp_score'] * 0.4 +
        team_metrics['recruiting_score'] * 0.3
    )
    
    # Rank teams
    team_metrics['preseason_rank'] = team_metrics['preseason_rating'].rank(ascending=False)
    
    team_metrics = team_metrics.sort_values('preseason_rank')
    
    print(f"\nTop 25 Preseason Rankings for {year}:")
    print(team_metrics.head(25)[
        ['team', 'preseason_rank', 'preseason_rating', 'prev_wins', 'prev_sp_rating', 'recruiting_rank']
    ].to_string(index=False))
    
    return team_metrics[['team', 'preseason_rank', 'preseason_rating']]


# ============================================================================
# PHASE 3: BUILD SEASON-CUMULATIVE FEATURES
# ============================================================================

def create_cumulative_features(df):
    """
    For each game, calculate season-to-date stats up to (but not including) that game.
    """
    print("\n" + "=" * 70)
    print("PHASE 3: CREATING SEASON-CUMULATIVE FEATURES")
    print("=" * 70)
    
    # Sort by team, season, and date
    df = df.sort_values(['team', 'season', 'date']).reset_index(drop=True)
    
    # Stats to track cumulatively
    cumulative_stats = [
        'team_points', 'opponent_points', 'point_differential', 'win',
        'off_ppa', 'def_ppa', 'off_success_rate', 'def_success_rate',
        'off_explosiveness', 'def_explosiveness',
        'game_totalYards', 'game_turnovers', 'game_sacks'
    ]
    
    # Filter to available stats
    available_stats = [s for s in cumulative_stats if s in df.columns]
    
    print(f"\nTracking {len(available_stats)} cumulative statistics")
    
    # Create cumulative features
    for stat in available_stats:
        # Expanding mean (excludes current game)
        df[f'{stat}_season_avg'] = (
            df.groupby(['team', 'season'])[stat]
            .transform(lambda x: x.shift(1).expanding().mean())
        )
        
        # Last 3 games average
        df[f'{stat}_L3'] = (
            df.groupby(['team', 'season'])[stat]
            .transform(lambda x: x.shift(1).rolling(window=3, min_periods=1).mean())
        )
    
    # Games played so far
    df['games_played'] = df.groupby(['team', 'season']).cumcount()
    
    # Current record
    df['season_wins'] = (
        df.groupby(['team', 'season'])['win']
        .transform(lambda x: x.shift(1).cumsum())
    )
    
    df['season_win_pct'] = df['season_wins'] / df['games_played'].replace(0, 1)
    
    print(f"\n✓ Created cumulative features")
    print(f"  Season averages: {len([c for c in df.columns if '_season_avg' in c])}")
    print(f"  Last 3 game averages: {len([c for c in df.columns if '_L3' in c])}")
    
    return df


# ============================================================================
# PHASE 4: QUANTIFY STRENGTH OF SCHEDULE & RANKINGS
# ============================================================================

def calculate_sos_and_rankings(df):
    """
    Calculate strength of schedule and create power rankings throughout the season.
    """
    print("\n" + "=" * 70)
    print("PHASE 4: CALCULATING SOS & POWER RANKINGS")
    print("=" * 70)
    
    # Sort by season and date
    df = df.sort_values(['season', 'date']).reset_index(drop=True)
    
    # For each game, calculate opponent's current rating
    print("\n1. Calculating Strength of Schedule...")
    
    # Average opponent SP+ rating faced (season-to-date)
    df['sos_sp_rating'] = (
        df.groupby(['team', 'season'])['opp_sp_rating']
        .transform(lambda x: x.expanding().mean())
    )
    
    # For SOS based on opponent win %, we need to calculate it differently
    # Since we can't easily map opponent names to their current records
    # We'll use a simpler approach: average of opponent's SP+ ratings
    
    # Create power rankings
    print("\n2. Creating Weekly Power Rankings...")
    
    # Calculate composite power rating for each team-week
    df['power_rating'] = 0.0
    
    # Ensure numeric columns
    df['season_win_pct'] = pd.to_numeric(df['season_win_pct'], errors='coerce')
    df['point_differential_season_avg'] = pd.to_numeric(df['point_differential_season_avg'], errors='coerce')
    df['team_sp_rating'] = pd.to_numeric(df['team_sp_rating'], errors='coerce')
    df['sos_sp_rating'] = pd.to_numeric(df['sos_sp_rating'], errors='coerce')
    
    # Components of power rating:
    # 1. Season win percentage (30%)
    df['power_rating'] += df['season_win_pct'].fillna(0.5) * 30
    
    # 2. Point differential per game (20%)
    df['power_rating'] += (df['point_differential_season_avg'].fillna(0) / 10) * 20
    
    # 3. SP+ rating (30%)
    df['power_rating'] += (df['team_sp_rating'].fillna(0) / 2) * 30
    
    # 4. Strength of schedule adjusted (20%)
    df['power_rating'] += (df['sos_sp_rating'].fillna(0) / 5) * 20
    
    # Rank teams within each week
    df['current_rank'] = df.groupby(['season', 'week'])['power_rating'].rank(ascending=False, method='min')
    
    # End of season rankings (after last game)
    end_of_season = df.groupby(['team', 'season']).last().reset_index()
    end_of_season['final_rank'] = end_of_season.groupby('season')['power_rating'].rank(ascending=False)
    
    # Merge final rank back
    df = df.merge(
        end_of_season[['team', 'season', 'final_rank']],
        on=['team', 'season'],
        how='left',
        suffixes=('', '_dup')
    )
    
    # Drop duplicate columns if any
    df = df[[c for c in df.columns if not c.endswith('_dup')]]
    
    print(f"\n✓ Strength of Schedule calculated")
    print(f"✓ Power rankings created (current_rank and final_rank)")
    
    return df


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def run_complete_analysis(df_raw):
    """
    Run complete analysis pipeline.
    """
    print("\n" + "=" * 70)
    print("CFB POWER RANKINGS & WIN PREDICTION FRAMEWORK")
    print("=" * 70)
    
    # Phase 0: Filter data
    df_filtered, fbs_teams = filter_games(df_raw)
    fcs_rankings = create_fcs_rankings(df_filtered, fbs_teams)
    
    # Merge FCS rankings back to main dataset
    df_filtered = df_filtered.merge(
        fcs_rankings[['team', 'fcs_rating', 'fcs_rank']],
        on='team',
        how='left'
    )
    
    # Phase 1: Identify important variables
    win_analysis = analyze_win_predictors(df_filtered)
    
    # Phase 2: Preseason rankings (for 2024)
    preseason_2024 = create_preseason_rankings(df_filtered, year=2024)
    
    # Phase 3: Cumulative features
    df_with_cumulative = create_cumulative_features(df_filtered)
    
    # Phase 4: SOS and rankings
    df_final = calculate_sos_and_rankings(df_with_cumulative)
    
    print("\n" + "=" * 70)
    print("ANALYSIS COMPLETE!")
    print("=" * 70)
    print(f"\nFinal dataset: {df_final.shape}")
    print(f"  Original: {df_raw.shape[0]} rows, {df_raw.shape[1]} columns")
    print(f"  Final: {df_final.shape[0]} rows, {df_final.shape[1]} columns")
    print(f"  Added: {df_final.shape[1] - df_raw.shape[1]} new feature columns")
    
    return df_final, win_analysis, preseason_2024, fcs_rankings


# ============================================================================
# USAGE
# ============================================================================

if __name__ == "__main__":
    # Load your data
    df = pd.read_csv('cfb_complete_data_2024_to_2024.csv')
    
    # Run analysis
    df_final, win_analysis, preseason_rankings, fcs_rankings = run_complete_analysis(df)
    
    # Save results
    df_final.to_csv('cfb_data_with_rankings.csv', index=False)
    win_analysis['feature_importance'].to_csv('feature_importance.csv', index=False)
    preseason_rankings.to_csv('preseason_rankings_2024.csv', index=False)
    fcs_rankings.to_csv('fcs_rankings.csv', index=False)
    
    print("\n✓ All results saved!")
    print("  - cfb_data_with_rankings.csv")
    print("  - feature_importance.csv")
    print("  - preseason_rankings_2024.csv")
    print("  - fcs_rankings.csv")


CFB POWER RANKINGS & WIN PREDICTION FRAMEWORK
FILTERING GAMES

FBS teams identified: 134

Original games: 7494
After filter: 1748

Game type breakdown:
game_type
FBS vs FBS    1506
FBS vs FCS     242
Name: count, dtype: int64

CREATING FCS RANKINGS

FCS teams: 96

Top 10 FCS teams:
                  team  fcs_rank  fcs_rating  fcs_games_vs_fbs  \
0        Montana State       1.0     100.520                 1   
88            Monmouth       2.0     100.290                 1   
51               Idaho       3.0      49.120                 2   
47           UT Martin       4.0      43.450                 2   
64    St. Francis (PA)       5.0      40.185                 2   
18       Southern Utah       6.0      37.750                 2   
52   Abilene Christian       7.0      -0.070                 1   
15  North Dakota State       8.0      -1.690                 1   
69        Gardner-Webb       9.0      -2.390                 2   
49    Central Arkansas      10.0      -2.490            

TypeError: Cannot convert [['5-9' '5-12' '8-18' ... '7-13' '5-14' '7-13']] to numeric