# 🤖 FPL-AI Model Training Notebook

## Overview
This notebook trains position-specific machine learning models for Fantasy Premier League predictions:
- **Goalkeeper Model**: Clean sheets, saves, bonus points
- **Defender Model**: Clean sheets + attacking returns
- **Midfielder Model**: Goals (5pts), assists, creativity
- **Forward Model**: Goals (4pts), assists, penalty likelihood

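## ML Techniques Used:
- XGBoost regressors, one per position and target (LightGBM and CatBoost are imported but not yet used in the cells below)
- Optuna hyperparameter optimization
- Time series cross-validation
- FPL scoring rules applied to the component predictions to derive expected points
- Feature-importance analysis from the fitted models (SHAP is imported for explainability but not yet used in the cells below)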

## Expected Runtime: 30-45 minutes with GPU

In [None]:
# Cell 1: Environment Setup and GPU Configuration
print("🚀 Setting up FPL-AI Model Training Environment...")

# Check GPU availability
import tensorflow as tf
print(f"🔥 GPU Available: {tf.config.list_physical_devices('GPU')}")

# Install ML packages
!pip install -q xgboost lightgbm catboost optuna
!pip install -q scikit-learn==1.3.0 shap lime
!pip install -q plotly seaborn matplotlib
!pip install -q tqdm ipywidgets

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Set project directory
import os
project_dir = '/content/drive/MyDrive/FPL_AI_Project'
os.chdir(project_dir)

# Create model directories
os.makedirs('models/trained_models/goalkeeper', exist_ok=True)
os.makedirs('models/trained_models/defender', exist_ok=True)
os.makedirs('models/trained_models/midfielder', exist_ok=True)
os.makedirs('models/trained_models/forward', exist_ok=True)
os.makedirs('models/ensemble', exist_ok=True)

print("✅ Environment setup complete!")
print(f"📁 Working directory: {os.getcwd()}")

In [None]:
# Cell 2: Import Libraries and Load Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ML libraries
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import optuna
import shap

# Utilities
import joblib
import json
from datetime import datetime
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in the notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("📚 All libraries imported successfully!")

# Load the most recent data
print("\n📂 Loading processed feature data...")
import glob

# Feature matrix: take the matching CSV with the greatest os.path.getctime,
# or fall back to an empty frame so later cells can detect the gap.
feature_files = glob.glob('data/processed/*features*.csv')
if not feature_files:
    print("❌ No feature files found. Please run FPL_Feature_Engineering.ipynb first.")
    features_df = pd.DataFrame()
else:
    latest_features = max(feature_files, key=os.path.getctime)
    features_df = pd.read_csv(latest_features)
    print(f"✅ Loaded features: {latest_features}")
    print(f"📊 Features shape: {features_df.shape}")

# Historical gameweek data supplies the training targets; same selection rule.
target_files = glob.glob('data/raw/*historical_gameweeks*.csv')
if not target_files:
    print("❌ No target files found. Please run FPL_Data_Collection.ipynb first.")
    targets_df = pd.DataFrame()
else:
    latest_targets = max(target_files, key=os.path.getctime)
    targets_df = pd.read_csv(latest_targets)
    print(f"✅ Loaded targets: {latest_targets}")
    print(f"📊 Targets shape: {targets_df.shape}")

In [None]:
# Cell 3: Base Model Class and Utilities

class FPLPositionModel:
    """Train and apply one XGBoost regressor per target for a single position.

    Per-target artefacts are kept in dictionaries keyed by target column name:
    ``models`` (fitted regressors), ``scalers`` (fitted ``RobustScaler``s),
    ``feature_names`` (the columns each model was fitted on),
    ``feature_importance`` and ``training_metrics``.
    """

    def __init__(self, position, target_columns, feature_columns):
        """Store the configuration; nothing is trained until asked.

        Args:
            position: Value matched against the ``position`` column.
            target_columns: Names of the target columns to model.
            feature_columns: Candidate feature columns; only those present in
                the supplied data are used.
        """
        self.position = position
        self.target_columns = target_columns
        self.feature_columns = feature_columns
        self.models = {}
        self.scalers = {}
        self.feature_names = {}
        self.feature_importance = {}
        self.training_metrics = {}
        self.is_trained = False

    def prepare_data(self, features_df, targets_df):
        """Build row-aligned feature and target frames for this position.

        Features are joined to targets on ``(player_id, gameweek)`` with an
        inner join. Both returned frames share a fresh ``RangeIndex`` so that
        row ``i`` of ``X`` always belongs to row ``i`` of ``y``.

        Args:
            features_df: Frame with ``position``, ``player_id``, ``gameweek``
                and feature columns.
            targets_df: Frame with ``player_id``, ``gameweek`` and target
                columns.

        Returns:
            ``(X, y)``, or ``(None, None)`` when there are no rows for this
            position or no feature row has a matching target row. Target
            columns absent from ``targets_df`` are left out of ``y``.
        """
        # Filter by position
        position_mask = features_df['position'] == self.position
        position_features = features_df[position_mask].copy()

        if position_features.empty:
            return None, None

        # Only use the configured columns that actually exist in the data
        available_features = [col for col in self.feature_columns if col in position_features.columns]
        available_targets = [col for col in self.target_columns if col in targets_df.columns]

        # merge() returns a brand-new RangeIndex, so the original index cannot
        # be used to line features up with targets. Carry each feature row's
        # position through the join instead.
        keys = position_features[['player_id', 'gameweek']].copy()
        keys['_feature_row'] = np.arange(len(keys))

        merged_data = keys.merge(
            targets_df[['player_id', 'gameweek'] + available_targets],
            on=['player_id', 'gameweek'],
            how='inner'
        )

        if merged_data.empty:
            return None, None

        row_positions = merged_data['_feature_row'].to_numpy()
        X = (
            position_features[available_features]
            .fillna(0)
            .iloc[row_positions]
            .reset_index(drop=True)
        )
        y = merged_data[available_targets].reset_index(drop=True)

        return X, y

    def optimize_hyperparameters(self, X, y, target, n_trials=50):
        """Search XGBoost hyperparameters with Optuna.

        Each trial is scored by the mean RMSE over a 3-fold
        ``TimeSeriesSplit``.

        Args:
            X: Feature frame.
            y: Target series aligned with ``X``.
            target: Target name (not used by the search itself).
            n_trials: Number of Optuna trials.

        Returns:
            The best parameter dictionary found by the study.
        """
        # NOTE(review): TimeSeriesSplit treats row order as time order; this
        # assumes the incoming rows are already chronological — confirm.

        def objective(trial):
            # Suggest hyperparameters
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
            }

            model = xgb.XGBRegressor(**params, random_state=42, n_jobs=-1)

            tscv = TimeSeriesSplit(n_splits=3)
            scores = []

            for train_idx, val_idx in tscv.split(X):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                model.fit(X_train, y_train)
                pred = model.predict(X_val)
                # Take the root explicitly rather than relying on the
                # `squared=False` keyword, which newer scikit-learn drops.
                score = float(np.sqrt(mean_squared_error(y_val, pred)))
                scores.append(score)

            return np.mean(scores)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

        return study.best_params

    def train_model(self, X, y, target, optimize=True):
        """Fit, evaluate and store the model for one target.

        Args:
            X: Feature frame aligned with ``y``.
            y: Frame containing the ``target`` column.
            target: Name of the target column to model.
            optimize: Run the Optuna search when true; otherwise use fixed
                default parameters.

        Returns:
            The fitted regressor, or ``None`` when fewer than 50 rows have a
            non-missing target.
        """
        # Remove missing targets
        valid_mask = y[target].notna()
        X_clean = X.loc[valid_mask]
        y_clean = y.loc[valid_mask, target]

        if len(y_clean) < 50:
            print(f"⚠️ Insufficient data for {target}: {len(y_clean)} samples")
            return None

        # Scale features, keeping the column names on the scaled frame
        scaler = RobustScaler()
        X_scaled = pd.DataFrame(
            scaler.fit_transform(X_clean),
            columns=X_clean.columns,
            index=X_clean.index
        )
        self.scalers[target] = scaler
        # Remember the exact columns so predict() can rebuild the same matrix
        self.feature_names[target] = list(X_clean.columns)

        # Optimize hyperparameters
        if optimize:
            print(f"🔧 Optimizing hyperparameters for {target}...")
            best_params = self.optimize_hyperparameters(X_scaled, y_clean, target)
        else:
            best_params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 6}

        # Train final model on all cleaned rows
        model = xgb.XGBRegressor(**best_params, random_state=42, n_jobs=-1)
        model.fit(X_scaled, y_clean)

        self.models[target] = model

        # Cross-validation metrics
        tscv = TimeSeriesSplit(n_splits=5)
        cv_scores = cross_val_score(model, X_scaled, y_clean, cv=tscv,
                                   scoring='neg_mean_squared_error')

        metrics = {
            'cv_rmse': np.sqrt(-cv_scores.mean()),
            # Spread of the per-fold negative-MSE scores (not of the RMSE)
            'cv_std': cv_scores.std(),
            'n_samples': len(y_clean),
            'best_params': best_params
        }

        self.training_metrics[target] = metrics

        # Feature importance, most important first
        importance_dict = dict(zip(X_scaled.columns, model.feature_importances_))
        self.feature_importance[target] = dict(
            sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
        )

        print(f"✅ {target} model trained - RMSE: {metrics['cv_rmse']:.3f}")
        return model

    def train_all_targets(self, features_df, targets_df, optimize=True):
        """Train one model per configured target that is present in the data.

        ``is_trained`` ends up true only if at least one model was fitted.
        """
        print(f"🎯 Training {self.position} models...")

        X, y = self.prepare_data(features_df, targets_df)
        if X is None:
            print(f"❌ No data available for {self.position}")
            return

        print(f"📊 Training data: {len(X)} samples, {len(X.columns)} features")

        for target in self.target_columns:
            if target in y.columns:
                self.train_model(X, y, target, optimize)

        # train_model() returns None without storing anything when a target
        # has too few rows, so only claim success if a model exists.
        self.is_trained = bool(self.models)
        print(f"🎉 {self.position} training completed!")

    def predict(self, features_df):
        """Predict every trained target for this position's rows.

        Args:
            features_df: Frame with ``position``, ``player_id`` and the
                feature columns the models were fitted on.

        Returns:
            Frame with ``player_id`` and one ``pred_<target>`` column per
            trained target; an empty frame when there are no rows for this
            position.

        Raises:
            ValueError: If no model has been trained, or if a column a model
                was fitted on is missing from ``features_df``.
        """
        if not self.is_trained:
            raise ValueError("Model not trained")

        position_mask = features_df['position'] == self.position
        position_features = features_df[position_mask].copy()

        if position_features.empty:
            return pd.DataFrame()

        available_features = [col for col in self.feature_columns if col in position_features.columns]
        trained_features = getattr(self, 'feature_names', {})

        predictions = {'player_id': position_features['player_id'].values}

        for target, model in self.models.items():
            # Use the columns this target was fitted on, in the same order
            columns = trained_features.get(target, available_features)
            missing = [col for col in columns if col not in position_features.columns]
            if missing:
                raise ValueError(
                    f"Missing feature columns for {target}: {missing}"
                )

            X = position_features[columns].fillna(0)
            # Keep column names, matching the frame used during training
            X_scaled = pd.DataFrame(
                self.scalers[target].transform(X),
                columns=columns,
                index=X.index
            )
            pred = model.predict(X_scaled)
            predictions[f'pred_{target}'] = pred

        return pd.DataFrame(predictions)

print("✅ Base model class defined!")

In [None]:
# Cell 4: Position-Specific Model Configurations

# Per-position configuration: which targets to model, which feature columns
# to try, and the FPL point value of each predicted component.
POSITION_CONFIGS = {
    'Goalkeeper': {
        'target_columns': ['total_points', 'clean_sheets', 'saves', 'bonus', 'minutes'],
        'feature_columns': [
            'form_points_5gw', 'form_points_3gw', 'form_minutes_5gw',
            'clean_sheet_probability', 'average_saves_per_game', 'penalty_save_rate',
            'fixture_difficulty', 'is_home', 'expected_goals_against',
            'price_rank_in_team', 'is_key_player', 'price', 'ownership_percentage'
        ],
        # The 'minutes' weight is not applied by calculate_fpl_points, which
        # scores minutes through its own appearance term.
        'point_weights': {'clean_sheets': 4, 'saves': 1/3, 'bonus': 1, 'minutes': 2/90}
    },

    'Defender': {
        'target_columns': ['total_points', 'clean_sheets', 'goals_scored', 'assists', 'bonus', 'minutes'],
        'feature_columns': [
            'form_points_5gw', 'form_goals_5gw', 'form_assists_5gw',
            'clean_sheet_probability', 'goal_scoring_rate', 'attacking_threat',
            'fixture_difficulty', 'is_home', 'expected_goals_against',
            'set_piece_likelihood', 'price_rank_in_team', 'price', 'ownership_percentage'
        ],
        'point_weights': {'clean_sheets': 4, 'goals_scored': 6, 'assists': 3, 'bonus': 1}
    },

    'Midfielder': {
        'target_columns': ['total_points', 'goals_scored', 'assists', 'bonus', 'minutes', 'clean_sheets'],
        'feature_columns': [
            'form_points_5gw', 'form_goals_5gw', 'form_assists_5gw',
            'goal_scoring_rate', 'assist_rate', 'creativity_index',
            'fixture_difficulty', 'is_home', 'expected_goals_for',
            'set_piece_likelihood', 'penalty_likelihood', 'price', 'ownership_percentage'
        ],
        'point_weights': {'goals_scored': 5, 'assists': 3, 'clean_sheets': 1, 'bonus': 1}
    },

    'Forward': {
        'target_columns': ['total_points', 'goals_scored', 'assists', 'bonus', 'minutes'],
        'feature_columns': [
            'form_points_5gw', 'form_goals_5gw', 'form_assists_5gw',
            'goal_scoring_rate', 'attacking_threat', 'ict_index',
            'fixture_difficulty', 'is_home', 'expected_goals_for',
            'penalty_likelihood', 'price_rank_in_team', 'price', 'ownership_percentage'
        ],
        'point_weights': {'goals_scored': 4, 'assists': 3, 'bonus': 1}
    }
}

def calculate_fpl_points(predictions, position):
    """Combine component predictions into expected FPL points.

    Args:
        predictions: Frame holding ``pred_<component>`` columns. Components
            without a matching column contribute nothing.
        position: Key into ``POSITION_CONFIGS`` selecting the point weights.

    Returns:
        A Series indexed like ``predictions`` with the expected points per
        row, floored at zero.

    Raises:
        KeyError: If ``position`` is not a key of ``POSITION_CONFIGS``.
    """
    weights = POSITION_CONFIGS[position]['point_weights']

    # Start from a Series so the result is per-row even when no component
    # column is present.
    total_points = pd.Series(0.0, index=predictions.index)

    # Appearance points: scaled linearly up to 2 points at 60 minutes
    if 'pred_minutes' in predictions.columns:
        appearance_prob = np.clip(predictions['pred_minutes'] / 60, 0, 1)
        total_points = total_points + appearance_prob * 2

    # Position-specific points
    for component, weight in weights.items():
        if component == 'minutes':
            # Minutes are already scored by the appearance term above;
            # applying a 'minutes' weight as well would count them twice.
            continue
        pred_col = f'pred_{component}'
        if pred_col in predictions.columns:
            total_points = total_points + predictions[pred_col] * weight

    return np.maximum(total_points, 0)  # Ensure non-negative

print("✅ Position configurations defined!")
print(f"📋 Positions: {list(POSITION_CONFIGS.keys())}")

# Display feature counts by position
for position, config in POSITION_CONFIGS.items():
    print(f"  {position}: {len(config['feature_columns'])} features, {len(config['target_columns'])} targets")

In [None]:
# Cell 5: Train All Position-Specific Models

print("🎯 Starting position-specific model training...")
print("⏱️ Estimated time: 20-30 minutes with optimization")
print("="*60)

# Defined before the data check so later cells can always evaluate
# `if not trained_models`, even when no training takes place.
trained_models = {}

# Check if data is available
if features_df.empty or targets_df.empty:
    print("❌ Required data not available. Please run previous notebooks first.")
else:
    # Train models for each position
    for position, config in POSITION_CONFIGS.items():
        print(f"\n🎯 Training {position} Model")
        print("-" * 40)

        # Check if position has data
        position_data = features_df[features_df['position'] == position]
        if position_data.empty:
            print(f"⚠️ No {position} data found, skipping...")
            continue

        # Initialize position model
        model = FPLPositionModel(
            position=position,
            target_columns=config['target_columns'],
            feature_columns=config['feature_columns']
        )

        # Train the model (with optimization for demonstration).
        # A failure in one position is reported and must not stop the others.
        try:
            model.train_all_targets(features_df, targets_df, optimize=True)
        except Exception as e:
            print(f"❌ Error training {position} model: {e}")
            continue

        # train_all_targets can finish without fitting anything (no matching
        # target rows, or too few samples); don't report that as a success.
        if not model.models:
            print(f"⚠️ No {position} target models were trained, skipping...")
            continue

        trained_models[position] = model

        # Display training results
        print(f"\n📊 {position} Training Results:")
        for target, metrics in model.training_metrics.items():
            print(f"  {target}: RMSE={metrics['cv_rmse']:.3f}, Samples={metrics['n_samples']}")

    print("\n" + "="*60)
    print(f"🎉 Model training completed!")
    print(f"✅ Successfully trained: {list(trained_models.keys())}")

    # Build the training summary (kept in memory; this cell does not write it)
    training_summary = {
        'timestamp': datetime.now().isoformat(),
        'trained_positions': list(trained_models.keys()),
        'total_models': sum(len(model.models) for model in trained_models.values()),
        'performance_summary': {}
    }

    for position, model in trained_models.items():
        training_summary['performance_summary'][position] = {
            target: metrics['cv_rmse'] for target, metrics in model.training_metrics.items()
        }

    # Display performance summary
    print("\n📈 Performance Summary:")
    for position, performance in training_summary['performance_summary'].items():
        avg_rmse = np.mean(list(performance.values()))
        print(f"  {position}: Avg RMSE = {avg_rmse:.3f}")

In [None]:
# Cell 6: Model Evaluation and Feature Importance Analysis

print("📊 Analyzing model performance and feature importance...")

if not trained_models:
    print("❌ No trained models available for evaluation")
else:
    # 2x2 dashboard: one RMSE panel plus three feature-importance panels
    bar_spec = {"type": "bar"}
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=['Model Performance by Position', 'Feature Importance - Goalkeeper',
                       'Feature Importance - Midfielder', 'Feature Importance - Forward'],
        specs=[[dict(bar_spec), dict(bar_spec)],
               [dict(bar_spec), dict(bar_spec)]]
    )

    # Average cross-validated RMSE per position, skipping positions that
    # recorded no metrics
    positions = []
    avg_rmse = []
    for position, model in trained_models.items():
        if not model.training_metrics:
            continue
        rmse_values = [metrics['cv_rmse'] for metrics in model.training_metrics.values()]
        positions.append(position)
        avg_rmse.append(np.mean(rmse_values))

    rmse_labels = [f"{val:.3f}" for val in avg_rmse]
    fig.add_trace(
        go.Bar(x=positions, y=avg_rmse, name="Average RMSE",
               text=rmse_labels, textposition="outside"),
        row=1, col=1
    )

    # Each plotted position is paired with the subplot cell it is drawn in
    plot_positions = ['Goalkeeper', 'Midfielder', 'Forward']
    plot_coords = [(1, 2), (2, 1), (2, 2)]

    for position, (row, col) in zip(plot_positions, plot_coords):
        if position not in trained_models:
            continue
        model = trained_models[position]

        # Only the primary target (total_points) is charted
        if 'total_points' not in model.feature_importance:
            continue
        importance = model.feature_importance['total_points']
        top_features = list(importance.keys())[:8]  # Top 8 features
        importance_values = [importance[feat] for feat in top_features]

        fig.add_trace(
            go.Bar(x=importance_values, y=top_features, orientation='h',
                   name=f"{position} Features", showlegend=False),
            row=row, col=col
        )

    fig.update_layout(height=800, title_text="FPL Model Evaluation Dashboard")
    fig.show()

    # One table row per (position, target) pair
    print("\n📋 Detailed Performance Metrics:")
    print("=" * 80)

    performance_data = [
        {
            'Position': position,
            'Target': target,
            'CV_RMSE': f"{metrics['cv_rmse']:.3f}",
            'Samples': metrics['n_samples'],
            'Features': len(model.feature_columns)
        }
        for position, model in trained_models.items()
        for target, metrics in model.training_metrics.items()
    ]

    performance_df = pd.DataFrame(performance_data)
    print(performance_df.to_string(index=False))

    # Five highest-ranked total_points features for every position
    print("\n🔝 Top Features by Position:")
    print("=" * 50)

    for position, model in trained_models.items():
        if 'total_points' not in model.feature_importance:
            continue
        ranked = model.feature_importance['total_points']
        top_features = list(ranked.keys())[:5]
        print(f"\n{position}:")
        for rank, feature in enumerate(top_features, 1):
            importance_score = ranked[feature]
            print(f"  {rank}. {feature}: {importance_score:.3f}")

In [None]:
# Cell 7: Save Trained Models and Create Ensemble


class FPLEnsemblePredictor:
    """Ensemble predictor combining all saved position models."""

    def __init__(self, model_files):
        """Load every model file.

        Args:
            model_files: Mapping of position name to the joblib file written
                for that position.
        """
        self.position_models = {}
        self.load_models(model_files)

    def load_models(self, model_files):
        """Load all position models into ``self.position_models``."""
        for position, file_path in model_files.items():
            # NOTE: joblib.load unpickles the file; only load trusted files.
            model_data = joblib.load(file_path)
            self.position_models[position] = model_data
            print(f"📂 Loaded {position} model")

    def predict_gameweek(self, features_df, gameweek=None):
        """Predict points for all players for a gameweek.

        Args:
            features_df: Frame with ``position``, ``player_id`` and feature
                columns.
            gameweek: Value written to the ``gameweek`` output column. When
                ``None``, each row keeps its own ``gameweek`` value from
                ``features_df`` (or 0 if that column is absent).

        Returns:
            Frame with one row per player containing the per-target
            predictions and ``predicted_points``; empty if no loaded position
            has rows in ``features_df``.

        Raises:
            ValueError: If a column a scaler was fitted on is missing.
        """
        all_predictions = []

        for position, model_data in self.position_models.items():
            # Filter features for this position
            position_features = features_df[features_df['position'] == position].copy()

            if position_features.empty:
                continue

            available_features = [col for col in model_data['feature_columns']
                                  if col in position_features.columns]

            # An explicit gameweek wins; otherwise label each row with its
            # own gameweek rather than reusing the first row's value.
            if gameweek is not None:
                gameweek_values = gameweek
            elif 'gameweek' in position_features.columns:
                gameweek_values = position_features['gameweek'].values
            else:
                gameweek_values = 0

            predictions = {
                'player_id': position_features['player_id'].values,
                'position': position,
                'gameweek': gameweek_values
            }

            # Make predictions for each target
            for target, model in model_data['models'].items():
                scaler = model_data['scalers'][target]

                # Prefer the columns the scaler was fitted on so the matrix
                # matches training even if the available columns differ.
                fitted_names = getattr(scaler, 'feature_names_in_', None)
                columns = list(fitted_names) if fitted_names is not None else available_features
                missing = [col for col in columns if col not in position_features.columns]
                if missing:
                    raise ValueError(
                        f"Missing feature columns for {position} {target}: {missing}"
                    )

                X = position_features[columns].fillna(0)
                X_scaled = pd.DataFrame(
                    scaler.transform(X), columns=columns, index=X.index
                )
                pred = model.predict(X_scaled)
                predictions[f'pred_{target}'] = pred

            # Calculate total FPL points
            pred_df = pd.DataFrame(predictions)
            pred_df['predicted_points'] = calculate_fpl_points(pred_df, position)

            all_predictions.append(pred_df)

        return pd.concat(all_predictions, ignore_index=True) if all_predictions else pd.DataFrame()

    def get_top_players(self, features_df, position=None, top_n=10):
        """Return the ``top_n`` rows with the highest ``predicted_points``.

        Args:
            features_df: Passed through to ``predict_gameweek``.
            position: Optional position name to restrict the ranking to.
            top_n: Number of rows to return.
        """
        predictions = self.predict_gameweek(features_df)

        # An empty frame has no 'position' / 'predicted_points' columns
        if predictions.empty:
            return predictions

        if position:
            predictions = predictions[predictions['position'] == position]

        return predictions.nlargest(top_n, 'predicted_points')


print("💾 Saving trained models to Google Drive...")

if not trained_models:
    print("❌ No trained models to save")
else:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save individual position models
    saved_models = {}
    for position, model in trained_models.items():
        model_file = f"models/trained_models/{position.lower()}/{timestamp}_{position.lower()}_model.joblib"
        # Make sure the target folder exists even if setup was skipped
        os.makedirs(os.path.dirname(model_file), exist_ok=True)

        # Plain dictionary so the file loads without the model class
        model_data = {
            'position': position,
            'models': model.models,
            'scalers': model.scalers,
            'feature_importance': model.feature_importance,
            'training_metrics': model.training_metrics,
            'target_columns': model.target_columns,
            'feature_columns': model.feature_columns,
            'timestamp': timestamp
        }

        joblib.dump(model_data, model_file)
        saved_models[position] = model_file
        print(f"✅ {position} model saved: {model_file}")

    # Create and save ensemble predictor
    ensemble = FPLEnsemblePredictor(saved_models)
    ensemble_file = f"models/ensemble/{timestamp}_fpl_ensemble.joblib"
    os.makedirs(os.path.dirname(ensemble_file), exist_ok=True)
    joblib.dump(saved_models, ensemble_file)  # Save model file paths

    print(f"\n🎯 Ensemble predictor created and saved: {ensemble_file}")

    # Test ensemble on sample data
    print("\n🧪 Testing ensemble predictor...")
    sample_features = features_df.head(20)  # Test on the first 20 feature rows
    sample_predictions = ensemble.predict_gameweek(sample_features, gameweek=1)

    if not sample_predictions.empty:
        print("✅ Ensemble test successful!")
        print(f"📊 Sample predictions shape: {sample_predictions.shape}")

        # Show top predictions by position
        print("\n🏆 Top Predicted Players by Position:")
        for position in ['Goalkeeper', 'Defender', 'Midfielder', 'Forward']:
            pos_preds = sample_predictions[sample_predictions['position'] == position]
            if not pos_preds.empty:
                top_player = pos_preds.nlargest(1, 'predicted_points')
                if not top_player.empty:
                    player_id = top_player['player_id'].iloc[0]
                    points = top_player['predicted_points'].iloc[0]
                    print(f"  {position}: Player {player_id} - {points:.2f} points")
    else:
        print("⚠️ Ensemble test returned no predictions")

    # Create final model summary
    model_summary = {
        'timestamp': timestamp,
        'positions_trained': list(saved_models.keys()),
        'total_individual_models': sum(len(model.models) for model in trained_models.values()),
        'ensemble_file': ensemble_file,
        'model_files': saved_models,
        'next_steps': [
            "Create prediction dashboard",
            "Implement backtesting",
            "Add real-time data updates",
            "Deploy to production"
        ]
    }

    # Save summary
    summary_file = f"models/{timestamp}_model_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(model_summary, f, indent=2)

    print(f"\n📋 Model summary saved: {summary_file}")

    print("\n" + "="*60)
    print("🎉 MODEL TRAINING COMPLETE!")
    print("🚀 Ready for prediction dashboard and backtesting!")
    print(f"📁 All models saved in: {os.getcwd()}/models/")
    print("="*60)