# 🏆 FPL-AI Enhanced Data Collection

## Overview
Comprehensive data collection for Fantasy Premier League with historical seasons:
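- **Current Season**: Player stats, fixtures, and gameweeks, fetched from the FPL API
- **Historical Data**: Previous 3-5 seasons for robust training (not downloaded by this notebook; obtain them separately, e.g. from Kaggle FPL datasets)
- **Manual Injury Data**: Template for manual injury data entry
- **Enhanced Features**: Weather, fixture congestion, manager changes (not collected by the cells in this notebook)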

## Expected Data Volume:
- 3,000+ player-season records
- 1,900+ historical fixtures
- 150+ historical gameweeks
- Current season: 687 players, 380 fixtures

## Runtime: 15-25 minutes

In [None]:
# Cell 1: Enhanced Environment Setup
print("🏆 Setting up FPL-AI Enhanced Data Collection...")

# Install packages
!pip install -q requests beautifulsoup4 pandas numpy tqdm
!pip install -q pyyaml joblib

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create enhanced project structure
import os
project_dir = '/content/drive/MyDrive/FPL_AI_Project'
os.makedirs(project_dir, exist_ok=True)
os.makedirs(f'{project_dir}/data/raw/historical', exist_ok=True)
os.makedirs(f'{project_dir}/data/raw/current', exist_ok=True)
os.makedirs(f'{project_dir}/data/manual', exist_ok=True)
os.makedirs(f'{project_dir}/models', exist_ok=True)

print("✅ Enhanced environment setup complete!")
print(f"📁 Project directory: {project_dir}")

In [None]:
# Cell 2: Import Libraries and Configure
import requests
import pandas as pd
import numpy as np
import time
import json
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Enhanced API Configuration
FPL_API_BASE = "https://fantasy.premierleague.com/api/"

# Season labels '2019-20' .. '2023-24', built from the starting year
HISTORICAL_SEASONS = [
    f"{start_year}-{(start_year + 1) % 100:02d}" for start_year in range(2019, 2024)
]

# Pause before each request; slower for historical data
RATE_LIMIT_DELAY = 1.5

# Headers for requests: a desktop-browser User-Agent string
_BROWSER_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)
HEADERS = {'User-Agent': _BROWSER_USER_AGENT}

print("📚 Enhanced libraries and configuration loaded!")
print(f"🗄️ Target seasons: {HISTORICAL_SEASONS}")
print(f"🌐 FPL API Base: {FPL_API_BASE}")

In [None]:
# Cell 3: Enhanced FPL Data Collector

class EnhancedFPLCollector:
    """Collect current-season Fantasy Premier League data from the FPL API.

    Only the current season is fetched over HTTP. ``historical_seasons`` is
    stored on the instance for reference, but no historical download is
    implemented in this class: ``collect_all_data`` adds a note pointing at
    Kaggle datasets instead.

    Args:
        base_url: Root URL that endpoint paths are appended to.
        current_season: Label written to the ``season`` column of the
            per-gameweek records. Defaults to ``'2024-25'``, the value that
            used to be hard-coded.
    """

    def __init__(self, base_url=FPL_API_BASE, current_season='2024-25'):
        self.base_url = base_url
        self.current_season = current_season
        # One shared session so every request carries the configured headers.
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.historical_seasons = HISTORICAL_SEASONS

    def _make_request(self, endpoint, delay=True):
        """Fetch ``base_url + endpoint`` and return the decoded JSON body.

        Args:
            endpoint: Path appended verbatim to ``self.base_url``.
            delay: When true, sleep ``RATE_LIMIT_DELAY`` seconds before the
                request (rate limiting).

        Returns:
            The parsed JSON payload, or ``None`` when the request fails, the
            server answers with an error status, or the body is not valid
            JSON. Failures are reported with a printed warning, not raised.
        """
        if delay:
            time.sleep(RATE_LIMIT_DELAY)

        try:
            response = self.session.get(f"{self.base_url}{endpoint}", timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException covers transport/HTTP-status errors; ValueError
            # covers a body that cannot be decoded as JSON. Anything else is a
            # programming error and is allowed to propagate.
            print(f"⚠️ Error fetching {endpoint}: {e}")
            return None

    @staticmethod
    def _build_bootstrap_frames(bootstrap):
        """Convert the ``bootstrap-static/`` payload into DataFrames.

        Returns a dict with keys ``players``, ``teams``, ``gameweeks`` and
        ``positions`` (in that order). ``players`` is enriched with
        ``team_name``, ``team_short`` and ``position`` columns.
        """
        teams = pd.DataFrame(bootstrap['teams'])
        positions = pd.DataFrame(bootstrap['element_types'])

        # Rename the lookup keys so they line up with the player columns.
        team_lookup = teams[['id', 'name', 'short_name']].rename(columns={
            'id': 'team', 'name': 'team_name', 'short_name': 'team_short'
        })
        position_lookup = positions[['id', 'singular_name']].rename(columns={
            'id': 'element_type', 'singular_name': 'position'
        })

        # NOTE: these are pandas' default (inner) merges, so a player whose
        # team or element_type has no match in the lookup is dropped.
        players = (
            pd.DataFrame(bootstrap['elements'])
            .merge(team_lookup, on='team')
            .merge(position_lookup, on='element_type')
        )

        return {
            'players': players,
            'teams': teams,
            'gameweeks': pd.DataFrame(bootstrap['events']),
            'positions': positions,
        }

    def _collect_live_gameweeks(self, gameweeks):
        """Fetch per-player live stats for every finished gameweek.

        Args:
            gameweeks: DataFrame built from the bootstrap ``events`` list.

        Returns:
            A list of dicts, one per player per finished gameweek, holding
            the API ``stats`` fields plus ``player_id``, ``gameweek`` and
            ``season``. Empty when nothing is finished or nothing was fetched.
        """
        # An empty events list yields a frame without a 'finished' column.
        if 'finished' not in gameweeks.columns:
            return []

        finished_ids = gameweeks.loc[gameweeks['finished'].eq(True), 'id']
        if finished_ids.empty:
            return []

        print("📈 Collecting current season gameweek data...")
        records = []

        for raw_id in tqdm(finished_ids, desc="Current Season GWs", total=len(finished_ids)):
            # Cast explicitly so the URL never ends up as e.g. 'event/1.0/live/'.
            gw_id = int(raw_id)
            gw_data = self._make_request(f"event/{gw_id}/live/")
            if not gw_data or 'elements' not in gw_data:
                continue
            # Copy the stats instead of mutating the API payload in place.
            records.extend(
                {
                    **element['stats'],
                    'player_id': element['id'],
                    'gameweek': gw_id,
                    'season': self.current_season,
                }
                for element in gw_data['elements']
            )

        return records

    def get_current_season_data(self):
        """Download the current season's datasets.

        Returns:
            A dict of DataFrames. Depending on which requests succeed it may
            contain ``players``, ``teams``, ``gameweeks``, ``positions``,
            ``fixtures`` and ``current_gameweeks``; a dataset whose request
            failed is simply absent.
        """
        print("📊 Fetching current season data...")

        data = {}

        # Bootstrap static
        bootstrap = self._make_request("bootstrap-static/")
        if bootstrap:
            data.update(self._build_bootstrap_frames(bootstrap))

        # Fixtures
        fixtures = self._make_request("fixtures/")
        if fixtures:
            data['fixtures'] = pd.DataFrame(fixtures)

        # Per-gameweek live stats, only for gameweeks flagged as finished
        if 'gameweeks' in data:
            live_records = self._collect_live_gameweeks(data['gameweeks'])
            if live_records:
                data['current_gameweeks'] = pd.DataFrame(live_records)

        return data

    def collect_all_data(self):
        """Collect current season data and attach a note about historical data.

        Returns:
            The dict from ``get_current_season_data`` with an extra
            ``historical_info`` entry, which is a plain string rather than a
            DataFrame.
        """
        print("🎯 Starting enhanced data collection...")

        # Current season
        all_data = self.get_current_season_data()

        # Add placeholder for historical data
        all_data['historical_info'] = "Use Kaggle FPL datasets for historical data"

        return all_data

# Initialize enhanced collector
# Module-level instance built with no arguments, so the constructor defaults
# apply; no request is made at construction time.
enhanced_collector = EnhancedFPLCollector()
print("✅ Enhanced FPL Data Collector initialized!")

In [None]:
# Cell 4: Manual Injury Data Template

def create_injury_data_template(output_dir=None):
    """Write a sample injury-data CSV and return it as a DataFrame.

    The template holds three example rows showing the expected columns, then
    prints instructions for filling it in by hand.

    Args:
        output_dir: Directory to write ``injury_data_template.csv`` into. It
            is created if missing. When ``None`` (the default) the file goes
            to ``{project_dir}/data/manual``, using the module-level
            ``project_dir``.

    Returns:
        The template DataFrame that was written to disk.
    """
    import os  # local import keeps the function self-contained

    if output_dir is None:
        output_dir = f'{project_dir}/data/manual'
        # Default location is reported relative to the project directory.
        shown_path = 'data/manual/injury_data_template.csv'
    else:
        shown_path = os.path.join(output_dir, 'injury_data_template.csv')

    # Do not rely on an earlier setup step having created the directory.
    os.makedirs(output_dir, exist_ok=True)

    player_names = ['Mohamed Salah', 'Erling Haaland', 'Bruno Fernandes']
    # Computed once so every row carries the same date.
    today = datetime.now().strftime('%Y-%m-%d')

    # Create manual injury data template
    injury_template = pd.DataFrame({
        'player_name': player_names,
        'team': ['Liverpool', 'Manchester City', 'Manchester United'],
        'injury_type': ['Hamstring', 'Ankle', 'Knee'],
        'status': ['Doubt', 'Out', 'Training'],
        'expected_return': ['2024-08-20', '2024-08-25', '2024-08-18'],
        'severity': [2, 4, 1],
        'availability_probability': [0.7, 0.1, 0.9],
        'last_updated': [today] * len(player_names),
    })

    # Save template
    template_file = os.path.join(output_dir, 'injury_data_template.csv')
    injury_template.to_csv(template_file, index=False)

    print("📋 Manual Injury Data Collection Setup:")
    print("=" * 50)
    print(f"1. ✅ Template created at: {shown_path}")
    print("2. 🌐 Visit: https://www.skysports.com/football/injuries")
    print("3. 📝 Copy injury data and paste into template")
    print("4. 💾 Save as: injury_data_YYYYMMDD.csv")

    print("\n📊 Template structure:")
    print(injury_template.to_string(index=False))

    return injury_template

# Create injury data template
# Runs when the cell executes: the call writes the CSV and prints the
# instructions; the returned DataFrame is kept in `injury_template`.
injury_template = create_injury_data_template()
print("\n✅ Manual injury data collection system ready!")

In [None]:
# Cell 5: Execute Enhanced Data Collection

print("🚀 Starting Enhanced FPL Data Collection...")
print("⏱️ Estimated time: 10-15 minutes")
print("=" * 60)

# Run the collector; the result maps dataset names to DataFrames (or notes)
all_data = enhanced_collector.collect_all_data()

print("\n📊 Data Collection Results:")
print("=" * 40)

# Report one line per entry, plus extra detail for selected datasets
for dataset_name, dataset in all_data.items():
    # Non-tabular entries are echoed verbatim
    if not isinstance(dataset, pd.DataFrame):
        print(f"✅ {dataset_name}: {dataset}")
        continue

    print(f"✅ {dataset_name}: {len(dataset)} records")
    if dataset.empty:
        continue

    if dataset_name == 'players':
        position_counts = dataset['position'].value_counts().to_dict()
        print(f"   📋 Positions: {position_counts}")
        # now_cost is divided by 10 to display the price in £m
        cheapest = dataset['now_cost'].min() / 10
        priciest = dataset['now_cost'].max() / 10
        print(f"   💰 Price range: £{cheapest:.1f}m - £{priciest:.1f}m")
    elif dataset_name == 'current_gameweeks':
        first_gw = dataset['gameweek'].min()
        last_gw = dataset['gameweek'].max()
        print(f"   📅 Gameweeks: {first_gw} - {last_gw}")
        lowest_points = dataset['total_points'].min()
        highest_points = dataset['total_points'].max()
        print(f"   ⚽ Total points range: {lowest_points} - {highest_points}")

# Persist every non-empty DataFrame as a timestamped CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"\n💾 Saving enhanced data to Google Drive...")
saved_files = []

raw_dir = f"{project_dir}/data/raw"
current_season_names = ('players', 'teams', 'gameweeks', 'fixtures')

for dataset_name, dataset in all_data.items():
    # Skip non-tabular entries and empty frames
    if not isinstance(dataset, pd.DataFrame) or dataset.empty:
        continue

    # Route each dataset to its sub-folder by name
    if dataset_name in current_season_names:
        target_dir = f"{raw_dir}/current"
    elif 'gameweek' in dataset_name:
        target_dir = f"{raw_dir}/historical"
    else:
        target_dir = raw_dir
    filename = f"{target_dir}/{timestamp}_{dataset_name}.csv"

    dataset.to_csv(filename, index=False)
    saved_files.append(filename)
    print(f"✅ Saved {dataset_name}: {filename}")

# Describe this run in a JSON sidecar file
summary_sources = {
    'current_season_players': 'players',
    'current_season_fixtures': 'fixtures',
    'current_gameweeks_data': 'current_gameweeks',
}
enhanced_metadata = {
    'collection_timestamp': timestamp,
    'collection_type': 'enhanced',
    'datasets_collected': list(all_data.keys()),
    'total_files': len(saved_files),
    # A dataset that was not collected counts as zero records
    'data_summary': {
        label: len(all_data.get(source_key, []))
        for label, source_key in summary_sources.items()
    },
    'file_paths': saved_files
}

metadata_file = f"{raw_dir}/{timestamp}_enhanced_metadata.json"
with open(metadata_file, 'w') as metadata_handle:
    metadata_handle.write(json.dumps(enhanced_metadata, indent=2))

print(f"\n📋 Enhanced metadata saved: {metadata_file}")

# Closing banner: where the data went and what to run next
banner = "=" * 60
completion_lines = (
    "\n" + banner,
    "🎉 ENHANCED DATA COLLECTION COMPLETE!",
    "📈 Ready for feature engineering with current season data!",
    "📋 Manual injury data template created for optional use",
    f"📁 All data saved in: {project_dir}/data/raw/",
    banner,
)
for line in completion_lines:
    print(line)

next_step_lines = (
    "\n🚀 NEXT STEPS:",
    "1. 🏥 [OPTIONAL] Manually collect injury data using template",
    "2. ⚙️ Run FPL_Feature_Engineering.ipynb with current data",
    "3. 🤖 Train models using FPL_Model_Training.ipynb",
    "4. 📊 Create predictions using dashboard",
)
for line in next_step_lines:
    print(line)