# 🏆 FPL-AI Data Collection Notebook

## Overview
This notebook collects comprehensive Fantasy Premier League data from multiple free sources:
- **Official FPL API**: Player stats, fixtures, gameweek data
- **Injury Data**: Player availability and fitness status
- **Advanced Stats**: Team strength and performance metrics

## Expected Runtime: 10-15 minutes
## Data Sources: 100% Free APIs and Web Scraping

In [None]:
# Cell 1: Environment Setup and Google Drive Integration
print("🚀 Setting up FPL-AI Data Collection Environment...")

# Install required packages
!pip install -q requests beautifulsoup4 pandas numpy tqdm lxml fake-useragent
!pip install -q pyyaml joblib

# Mount Google Drive for data persistence
from google.colab import drive
drive.mount('/content/drive')

# Create project directory in Drive
import os
project_dir = '/content/drive/MyDrive/FPL_AI_Project'
os.makedirs(project_dir, exist_ok=True)
os.makedirs(f'{project_dir}/data/raw', exist_ok=True)
os.makedirs(f'{project_dir}/data/processed', exist_ok=True)
os.makedirs(f'{project_dir}/models', exist_ok=True)

print("✅ Environment setup complete!")
print(f"📁 Project directory: {project_dir}")

In [None]:
# Cell 2: Import Libraries and Configure APIs
import requests
import pandas as pd
import numpy as np
import time
import json
from datetime import datetime, timedelta
import logging
from pathlib import Path
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

# Set up logging
# Root logging at INFO level plus a module-level logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# API Configuration
# Base URL that FPLDataCollector prefixes to every endpoint path
# (endpoints are appended directly, hence the trailing slash).
FPL_API_BASE = "https://fantasy.premierleague.com/api/"
# Page that InjuryDataScraper downloads and parses for the injury table.
PREMIER_INJURIES_URL = "https://www.premierinjuries.com/injury-table.php"
# Both FPLDataCollector and InjuryDataScraper sleep this long before each HTTP request.
RATE_LIMIT_DELAY = 1.0  # seconds between requests

# Headers for web scraping
# Browser-like User-Agent installed on the requests sessions of both collectors.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

print("📚 Libraries imported successfully!")
print(f"🌐 FPL API Base: {FPL_API_BASE}")

In [None]:
# Cell 3: FPL API Data Collection Functions

class FPLDataCollector:
    """Rate-limited client for the Fantasy Premier League API.

    Every call goes through :meth:`_make_request`, which sleeps for
    ``RATE_LIMIT_DELAY`` seconds before the request and returns ``None``
    (after printing the error) when the request or JSON decoding fails,
    so callers only need a truthiness check on the result.
    """

    def __init__(self, base_url=FPL_API_BASE):
        """Create a collector.

        Args:
            base_url: URL prefix that endpoint paths are appended to.
                A trailing slash is optional.
        """
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

    def _make_request(self, endpoint):
        """Fetch ``endpoint`` relative to the base URL and decode it as JSON.

        Args:
            endpoint: Path below the base URL, e.g. ``"fixtures/"``.

        Returns:
            The decoded JSON payload, or ``None`` if the HTTP request failed,
            returned an error status, or the body was not valid JSON.
        """
        time.sleep(RATE_LIMIT_DELAY)

        # Normalise the slash between the two parts so a base URL given
        # without a trailing slash still yields a valid URL.
        url = f"{self.base_url.rstrip('/')}/{endpoint.lstrip('/')}"

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException: connection, timeout and HTTP-status errors.
            # ValueError: undecodable JSON bodies. Anything else is a
            # programming error and is allowed to surface.
            print(f"❌ Error fetching {endpoint}: {e}")
            return None

    def get_bootstrap_data(self):
        """Return the ``bootstrap-static/`` payload, or ``None`` on failure."""
        print("📊 Fetching bootstrap static data...")
        return self._make_request("bootstrap-static/")

    def get_fixtures(self):
        """Return the ``fixtures/`` payload, or ``None`` on failure."""
        print("⚽ Fetching fixtures data...")
        return self._make_request("fixtures/")

    def get_gameweek_live(self, gameweek):
        """Return the ``event/<gameweek>/live/`` payload, or ``None`` on failure.

        Args:
            gameweek: Gameweek number inserted into the endpoint path.
        """
        print(f"🔴 Fetching live data for gameweek {gameweek}...")
        return self._make_request(f"event/{gameweek}/live/")

    def collect_historical_gameweeks(self, start_gw=1, end_gw=10):
        """Collect per-player stats for an inclusive range of gameweeks.

        Gameweeks whose request failed, or whose payload has no
        ``'elements'`` key, are skipped.

        Args:
            start_gw: First gameweek to fetch.
            end_gw: Last gameweek to fetch (inclusive).

        Returns:
            A DataFrame with one row per element per gameweek: the keys of
            the element's ``'stats'`` dict plus ``player_id`` and
            ``gameweek``. Empty when nothing was collected.
        """
        print(f"📈 Collecting historical data from GW{start_gw} to GW{end_gw}...")

        rows = []
        for gw in tqdm(range(start_gw, end_gw + 1), desc="Gameweeks"):
            gw_data = self.get_gameweek_live(gw)
            if not gw_data or 'elements' not in gw_data:
                continue
            for element in gw_data['elements']:
                # Build a fresh dict so the parsed API payload is not mutated.
                rows.append({
                    **element['stats'],
                    'player_id': element['id'],
                    'gameweek': gw,
                })

        return pd.DataFrame(rows)

# Initialize collector
# Module-level instance using the default base URL (FPL_API_BASE);
# the data collection cell calls its methods.
collector = FPLDataCollector()
print("✅ FPL Data Collector initialized!")

In [None]:
# Cell 4: Injury Data Scraping Functions

class InjuryDataScraper:
    """Download the Premier Injuries table and derive availability scores."""

    def __init__(self):
        """Open an HTTP session and build the lookup tables used for scoring."""
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

        # Short club names as scraped -> full club names
        self.team_mapping = {
            "Man City": "Manchester City",
            "Man Utd": "Manchester United",
            "Newcastle": "Newcastle United",
            "Brighton": "Brighton & Hove Albion",
            "Nott'm Forest": "Nottingham Forest",
            "Sheffield Utd": "Sheffield United",
            "West Ham": "West Ham United",
            "Wolves": "Wolverhampton Wanderers",
        }

        # Keywords grouped by severity level, flattened to keyword -> level
        severity_tiers = {
            1: ('knock', 'minor', 'fatigue', 'slight'),
            2: ('muscle', 'strain', 'sprain', 'bruise'),
            4: ('fracture', 'break', 'torn'),
            5: ('rupture', 'surgery', 'operation', 'long-term'),
        }
        self.severity_mapping = {
            keyword: level
            for level, keywords in severity_tiers.items()
            for keyword in keywords
        }

    def scrape_premier_injuries(self):
        """Fetch and parse the injury table.

        The first table matching class ``injury-table``, then id
        ``injury-table``, then any ``<table>`` is used. The header row is
        skipped, and rows with fewer than four cells are ignored. Cells are
        read positionally as player, team, injury type, status and
        (optionally) expected return.

        Returns:
            The processed injury DataFrame, or an empty DataFrame when no
            table is found or any error occurs.
        """
        print("🏥 Scraping injury data from Premier Injuries...")

        try:
            time.sleep(RATE_LIMIT_DELAY)
            page = self.session.get(PREMIER_INJURIES_URL, timeout=30)
            page.raise_for_status()

            soup = BeautifulSoup(page.content, 'html.parser')

            # Try progressively looser selectors until one matches
            table = soup.find('table', class_='injury-table')
            if not table:
                table = soup.find('table', id='injury-table')
            if not table:
                table = soup.find('table')

            if not table:
                print("⚠️ Could not find injury table")
                return pd.DataFrame()

            injuries = []
            for row in table.find_all('tr')[1:]:  # first row is the header
                cells = row.find_all(['td', 'th'])
                if len(cells) < 4:
                    continue
                texts = [cell.get_text(strip=True) for cell in cells[:5]]
                expected_return = texts[4] if len(texts) > 4 else ''
                injuries.append({
                    'player_name': texts[0],
                    'team': texts[1],
                    'injury_type': texts[2],
                    'status': texts[3],
                    'expected_return': expected_return,
                    'last_updated': datetime.now().strftime('%Y-%m-%d'),
                })

            df = pd.DataFrame(injuries)
            if not df.empty:
                df = self._process_injury_data(df)

            print(f"✅ Scraped {len(df)} injury records")
            return df

        except Exception as e:
            print(f"❌ Error scraping injury data: {e}")
            return pd.DataFrame()

    def _process_injury_data(self, df):
        """Add standardized team, severity and availability columns to ``df``.

        Args:
            df: Frame with at least ``team`` and ``injury_type`` columns;
                it is modified in place.

        Returns:
            The same frame with ``team_standardized``, ``injury_severity``
            and ``availability_probability`` added.
        """
        # Unmapped team names fall back to the scraped value
        mapped_teams = df['team'].map(self.team_mapping)
        df['team_standardized'] = mapped_teams.fillna(df['team'])

        def severity_of(description):
            # Highest level among matching keywords; 2 when none match
            text = str(description)
            levels = [
                level
                for keyword, level in self.severity_mapping.items()
                if keyword in text
            ]
            return max(levels) if levels else 2

        df['injury_severity'] = df['injury_type'].str.lower().apply(severity_of)

        # Each severity point removes 15% availability, bounded to [0, 1]
        raw_probability = 1.0 - (df['injury_severity'] * 0.15)
        df['availability_probability'] = raw_probability.clip(0, 1)

        return df

# Initialize injury scraper
# Module-level instance; the data collection cell calls
# injury_scraper.scrape_premier_injuries().
injury_scraper = InjuryDataScraper()
print("✅ Injury Data Scraper initialized!")

In [None]:
# Cell 5: Execute Comprehensive Data Collection
#
# Runs the four collection steps in order and stores each resulting
# DataFrame in `collected_data` under a dataset name. A failed step prints
# a message and leaves its key out of `collected_data`; later steps and
# cells check for the key before using it.

print("🎯 Starting comprehensive FPL data collection...")
print("⏱️ Estimated time: 5-10 minutes")
print("="*50)

# Dictionary to store all collected data
collected_data = {}

# 1. Get bootstrap static data (players, teams, gameweeks)
print("\n📊 Step 1: Collecting bootstrap data...")
bootstrap_data = collector.get_bootstrap_data()

if bootstrap_data:
    # Extract individual datasets
    players_df = pd.DataFrame(bootstrap_data['elements'])
    teams_df = pd.DataFrame(bootstrap_data['teams'])
    gameweeks_df = pd.DataFrame(bootstrap_data['events'])
    positions_df = pd.DataFrame(bootstrap_data['element_types'])

    # Enhance players data with team and position info: the lookup tables
    # are renamed so their id column matches the players' foreign key.
    players_df = players_df.merge(
        teams_df[['id', 'name', 'short_name']].rename(columns={
            'id': 'team', 'name': 'team_name', 'short_name': 'team_short'
        }), on='team'
    )

    players_df = players_df.merge(
        positions_df[['id', 'singular_name']].rename(columns={
            'id': 'element_type', 'singular_name': 'position'
        }), on='element_type'
    )

    collected_data['players'] = players_df
    collected_data['teams'] = teams_df
    collected_data['gameweeks'] = gameweeks_df

    print(f"✅ Players: {len(players_df)}, Teams: {len(teams_df)}, Gameweeks: {len(gameweeks_df)}")
else:
    print("❌ Failed to collect bootstrap data")

# 2. Get fixtures data
print("\n⚽ Step 2: Collecting fixtures data...")
fixtures_data = collector.get_fixtures()

if fixtures_data:
    fixtures_df = pd.DataFrame(fixtures_data)
    collected_data['fixtures'] = fixtures_df
    print(f"✅ Fixtures: {len(fixtures_df)}")
else:
    print("❌ Failed to collect fixtures data")

# 3. Get historical gameweek data
print("\n📈 Step 3: Collecting historical gameweek data...")
if 'gameweeks' not in collected_data:
    # Without the bootstrap gameweek list the current gameweek is unknown;
    # report that instead of skipping the step silently.
    print("⚠️ Skipping historical data: gameweek list unavailable (bootstrap step failed)")
else:
    # Determine current gameweek
    gw_table = collected_data['gameweeks']
    current_gw = gw_table[gw_table['is_current'].eq(True)]
    if current_gw.empty:
        print("⚠️ Could not determine current gameweek")
    else:
        # Plain int rather than a numpy scalar, for range() and later reuse
        current_gameweek = int(current_gw['id'].iloc[0])
        # Only gameweeks before the current one are fetched
        end_gw = min(current_gameweek - 1, 15)  # Don't go beyond GW15 for demo

        if end_gw < 1:
            # Current gameweek is the first one: nothing to collect, which
            # is not a failure.
            print("ℹ️ No completed gameweeks yet; nothing historical to collect")
        else:
            historical_df = collector.collect_historical_gameweeks(1, end_gw)
            if not historical_df.empty:
                collected_data['historical_gameweeks'] = historical_df
                print(f"✅ Historical data: {len(historical_df)} player-gameweek records")
            else:
                print("❌ No historical gameweek data collected")

# 4. Collect injury data
print("\n🏥 Step 4: Collecting injury data...")
injury_df = injury_scraper.scrape_premier_injuries()

if not injury_df.empty:
    collected_data['injuries'] = injury_df
    print(f"✅ Injury data: {len(injury_df)} records")
else:
    print("⚠️ No injury data collected (this is optional)")

print("\n" + "="*50)
print("🎉 Data collection completed!")
print(f"📊 Total datasets collected: {len(collected_data)}")

# Display summary
for dataset_name, dataset in collected_data.items():
    print(f"  📁 {dataset_name}: {len(dataset)} records")

In [None]:
# Cell 6: Save Data to Google Drive and Validation
#
# Writes every non-empty DataFrame in `collected_data` to a timestamped CSV
# under <project_dir>/data/raw, records what was written in a JSON metadata
# file, then prints validation figures and a collection summary.
import json

print("💾 Saving collected data to Google Drive...")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
raw_dir = f"{project_dir}/data/raw"

# Save each dataset
saved_files = []
for dataset_name, dataset in collected_data.items():
    if not isinstance(dataset, pd.DataFrame) or dataset.empty:
        continue
    filename = f"{raw_dir}/{timestamp}_{dataset_name}.csv"
    dataset.to_csv(filename, index=False)
    saved_files.append(filename)
    print(f"✅ Saved {dataset_name}: {filename}")

# Save metadata
metadata = dict(
    collection_timestamp=timestamp,
    datasets_collected=list(collected_data),
    total_files=len(saved_files),
    file_paths=saved_files,
)

metadata_file = f"{raw_dir}/{timestamp}_metadata.json"
with open(metadata_file, 'w') as f:
    f.write(json.dumps(metadata, indent=2))

print(f"\n📋 Metadata saved: {metadata_file}")

# Data validation and summary
print("\n🔍 Data Validation Summary:")
print("="*40)

if 'players' in collected_data:
    players = collected_data['players']
    # now_cost is divided by 10 to display prices in £m
    cheapest = players['now_cost'].min() / 10
    priciest = players['now_cost'].max() / 10
    print("👥 Players by position:")
    print(players['position'].value_counts())
    print(f"\n💰 Price range: £{cheapest:.1f}m - £{priciest:.1f}m")

if 'historical_gameweeks' in collected_data:
    historical = collected_data['historical_gameweeks']
    gw_col = historical['gameweek']
    points_col = historical['total_points']
    print("\n📊 Historical data:")
    print(f"  Gameweeks: {gw_col.min()} - {gw_col.max()}")
    print(f"  Total points range: {points_col.min()} - {points_col.max()}")

if 'injuries' in collected_data:
    injuries = collected_data['injuries']
    teams_hit = injuries['team_standardized'].nunique()
    mean_availability = injuries['availability_probability'].mean()
    print("\n🏥 Injury summary:")
    print(f"  Teams with injuries: {teams_hit}")
    print(f"  Avg availability probability: {mean_availability:.2f}")

print("\n" + "="*50)
print("🎯 DATA COLLECTION COMPLETE!")
print("🚀 Ready for feature engineering and model training!")
print(f"📁 All data saved in: {raw_dir}/")

# Build the in-memory summary: record counts per dataset (0 when a dataset
# was not collected) followed by the suggested next steps.
summary_data = {'timestamp': timestamp}
for label, source_key in (
    ('total_players', 'players'),
    ('total_fixtures', 'fixtures'),
    ('historical_records', 'historical_gameweeks'),
    ('injury_records', 'injuries'),
):
    summary_data[label] = len(collected_data.get(source_key, []))
summary_data['next_steps'] = [
    "Run FPL_Feature_Engineering.ipynb",
    "Train position-specific models",
    "Create ensemble predictor",
    "Build dashboard",
]

print("\n📋 Collection Summary:")
for key, value in summary_data.items():
    if key == 'next_steps':
        continue
    print(f"  {key}: {value}")