In [None]:
# Build the sequence frame from a copy of df_merged: add a single sortable
# integer key per (season, week) as season * 100 + week, then order rows
# chronologically within each player.
df_seq = (
    df_merged
    .assign(time_index=lambda frame: frame['season'] * 100 + frame['week'])
    .sort_values(['player_id', 'time_index'])
)

In [None]:
# Hold out the 2024 season rows; every earlier season is used for training.
is_2024_season = df_seq['season'] == 2024
is_earlier_season = df_seq['season'] < 2024
df_predict = df_seq.loc[is_2024_season]
df_train = df_seq.loc[is_earlier_season]

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

def create_sequences(df, sequence_length, feature_columns, target_column='fantasy_points'):
    """
    Build sliding-window sequences and next-step targets for each player.

    Rows are ordered by 'player_id' and 'time_index'. For every player, each
    run of `sequence_length` consecutive rows becomes one input window and
    the `target_column` value of the row immediately after the window
    becomes its target. Windows never span two players.

    Parameters:
    -----------
    df: DataFrame containing 'player_id', 'time_index', the feature columns
        and the target column
    sequence_length: number of consecutive rows per window (must be >= 1)
    feature_columns: list of column names used as model inputs
    target_column: column whose next-step value is predicted

    Returns:
    --------
    Tuple (X, y): X has shape (n_windows, sequence_length, n_features) and
    y has shape (n_windows,). When no player has more than `sequence_length`
    rows, X has shape (0, sequence_length, n_features) and y has shape (0,).

    Raises:
    -------
    ValueError if `sequence_length` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    feature_columns = list(feature_columns)
    sequences = []
    targets = []

    # Chronological order within each player is what makes a window meaningful.
    ordered = df.sort_values(by=['player_id', 'time_index'])

    for _, player_data in ordered.groupby('player_id'):
        player_features = player_data[feature_columns].values
        player_targets = player_data[target_column].values

        # A player with at most `sequence_length` rows has no "next" row to
        # predict, so the range below is empty and the player is skipped.
        for start in range(len(player_features) - sequence_length):
            end = start + sequence_length
            sequences.append(player_features[start:end])
            targets.append(player_targets[end])

    if not sequences:
        # np.array([]) would be 1-D; keep the 3-D layout so callers can rely
        # on X.shape[1:] even when there is nothing to train or evaluate on.
        return (
            np.empty((0, sequence_length, len(feature_columns))),
            np.empty((0,)),
        )

    return np.array(sequences), np.array(targets)


def prepare_data(df, feature_columns, target_column, sequence_length, train_cutoff):
    """
    Split by time, scale the features and build LSTM sequences.

    Rows with time_index <= train_cutoff form the training split and later
    rows form the test split. The StandardScaler is fitted on the training
    split only and then applied to both splits.

    Parameters:
    -----------
    df: DataFrame containing 'player_id', 'time_index', the feature columns
        and the target column
    feature_columns: list of column names fed to the model
    target_column: column holding the value to predict (kept unscaled)
    sequence_length: number of consecutive rows per input window
    train_cutoff: largest time_index that still belongs to the training split

    Returns:
    --------
    Tuple (X_train, X_test, y_train, y_test, scaler)
    """
    feature_columns = list(feature_columns)

    # Split into train and test
    train_df = df[df['time_index'] <= train_cutoff]
    test_df = df[df['time_index'] > train_cutoff]

    # Fit on the training split only so no test statistics leak into scaling.
    scaler = StandardScaler()
    scaler.fit(train_df[feature_columns])

    # If the target is also used as a feature, the scaled copy must stay in
    # the feature matrix while the raw values remain the prediction target,
    # so the raw target is stored under a separate column name.
    target_key = target_column
    if target_column in feature_columns:
        target_key = f'__raw_target_{target_column}__'

    def _scale_split(split_df):
        # Return the scaled features plus the grouping/ordering/target columns.
        if len(split_df) == 0:
            # scaler.transform rejects zero-row input; an empty split simply
            # yields an empty feature matrix.
            values = np.empty((0, len(feature_columns)))
        else:
            values = scaler.transform(split_df[feature_columns])
        scaled = pd.DataFrame(values, columns=feature_columns)

        # Re-attach the bookkeeping columns only when they are NOT features.
        # Writing the raw values back over a scaled feature (e.g. 'time_index'
        # when it is part of feature_columns) would hand the model unscaled
        # inputs during training, while inference applies the scaler to every
        # feature. Standard scaling preserves order, so sorting on a scaled
        # 'time_index' is still chronological.
        for key in ('player_id', 'time_index'):
            if key not in feature_columns:
                scaled[key] = split_df[key].values
        scaled[target_key] = split_df[target_column].values
        return scaled

    train_scaled = _scale_split(train_df)
    test_scaled = _scale_split(test_df)

    # Create sequences using scaled data
    X_train, y_train = create_sequences(
        train_scaled,
        sequence_length=sequence_length,
        feature_columns=feature_columns,
        target_column=target_key
    )

    X_test, y_test = create_sequences(
        test_scaled,
        sequence_length=sequence_length,
        feature_columns=feature_columns,
        target_column=target_key
    )

    return X_train, X_test, y_train, y_test, scaler


def create_model(sequence_length, n_features):
    """
    Build and compile the stacked-LSTM regression network.

    Two LSTM layers (256 then 128 units) are followed by two ReLU dense
    layers (64 then 32 units) and a single linear output unit, with a
    dropout layer after every hidden layer. The model is compiled with
    Adam (learning rate 0.001), MSE loss and MAE as a reported metric.
    """
    network = Sequential()

    # Recurrent part: the first LSTM returns the full sequence so that the
    # second LSTM can consume it.
    network.add(
        LSTM(256, input_shape=(sequence_length, n_features), return_sequences=True)
    )
    network.add(Dropout(0.4))
    network.add(LSTM(128))
    network.add(Dropout(0.4))

    # Dense head ending in one output value.
    network.add(Dense(64, activation='relu'))
    network.add(Dropout(0.3))
    network.add(Dense(32, activation='relu'))
    network.add(Dropout(0.2))
    network.add(Dense(1))

    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    network.compile(optimizer=adam, loss='mse', metrics=['mae'])

    return network

# Get feature columns
def get_feature_columns(df):
    """
    Return the columns of `df` that are used as model features.

    Every column is kept, in its original order, except the identifier,
    the raw time columns and the prediction target.
    """
    # TODO decide whether 'time_index' (or other time indicators) should
    # stay in the feature set; it is currently NOT excluded.
    non_features = ('player_id', 'season', 'week', 'fantasy_points')

    selected = []
    for column in df.columns:
        if column in non_features:
            continue
        selected.append(column)
    return selected

# Main execution
def train_lstm_model(df, target_column='fantasy_points', *, sequence_length=6,
                     train_cutoff=202218, epochs=100, batch_size=64,
                     validation_split=0.15):
    """
    Train the LSTM model end to end and report train/test metrics.

    Parameters:
    -----------
    df: DataFrame containing 'player_id', 'season', 'week', 'time_index',
        the target column and the feature columns
    target_column: column holding the value to predict
    sequence_length: number of consecutive rows per input window
    train_cutoff: largest time_index that still belongs to the training
        split; later rows are used for the test evaluation
    epochs: maximum number of training epochs (early stopping may end sooner)
    batch_size: mini-batch size passed to model.fit
    validation_split: fraction of the training sequences passed to
        model.fit as validation_split; drives early stopping

    Returns:
    --------
    Tuple (model, history, scaler, feature_columns)

    Raises:
    -------
    ValueError if no training sequence can be built from `df`.
    """
    # Get feature columns
    feature_columns = get_feature_columns(df)
    print(f"Number of features: {len(feature_columns)}")

    # Prepare data
    X_train, X_test, y_train, y_test, scaler = prepare_data(
        df,
        feature_columns,
        target_column,
        sequence_length,
        train_cutoff
    )

    # Fail early with a readable message instead of an opaque Keras error.
    if len(X_train) == 0:
        raise ValueError(
            "No training sequences could be built: no player has more than "
            f"{sequence_length} rows with time_index <= {train_cutoff}."
        )

    # Create model
    model = create_model(sequence_length, len(feature_columns))

    # Stop once validation loss has not improved by min_delta for `patience`
    # epochs, and roll back to the best weights seen.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        min_delta=0.001
    )

    # Train model
    history = model.fit(
        X_train, y_train,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate model; metrics are [loss, mae] as configured at compile time.
    train_metrics = model.evaluate(X_train, y_train, verbose=0)
    print("\nTraining Loss:", train_metrics[0])
    print("Training MAE:", train_metrics[1])

    if len(X_test) > 0:
        test_metrics = model.evaluate(X_test, y_test, verbose=0)
        print("Test Loss:", test_metrics[0])
        print("Test MAE:", test_metrics[1])
    else:
        # Nothing after the cutoff is long enough to form a window.
        print("No test sequences after the cutoff; skipping test evaluation.")

    return model, history, scaler, feature_columns

In [None]:
# Fit the LSTM on the pre-2024 rows; keep the fitted scaler and the feature
# list so the same preprocessing can be reused at prediction time.
(model, history, scaler, feature_columns) = train_lstm_model(df=df_train)

In [None]:
def predict_2024_season(df, model, scaler, feature_columns, sequence_length=6):
    """
    Make predictions for 2024 season using only historical data through 2023.

    For every player with at least one 2023 row, the last `sequence_length`
    rows of that player's history (ordered by 'time_index') are scaled and
    fed to the model. Players with fewer than `sequence_length` rows are
    skipped. All eligible players are scaled and predicted in one batch.

    Parameters:
    -----------
    df: DataFrame with historical data (expected to end with season 2023)
        containing 'player_id', 'season', 'time_index' and the feature columns
    model: trained LSTM model exposing predict(X, verbose=...)
    scaler: fitted StandardScaler exposing transform(...)
    feature_columns: list of feature columns, in the order used for training
    sequence_length: number of weeks to use for prediction (must be >= 1)

    Returns:
    --------
    DataFrame with predictions for all players for week 1 of 2024, with
    columns 'player_id', 'week', 'season' and 'predicted_fantasy_points'.
    The columns are present even when no player is eligible.

    Raises:
    -------
    ValueError if `sequence_length` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    feature_columns = list(feature_columns)
    output_columns = ['player_id', 'week', 'season', 'predicted_fantasy_points']

    # Get all unique players from 2023 (or who played in the last year of your data)
    recent_players = df.loc[df['season'] == 2023, 'player_id'].unique()

    # One chronological sort and one groupby replace a full-frame scan per
    # player. Each group keeps the time order, so tail() yields the most
    # recent rows.
    history = df[df['player_id'].isin(recent_players)].sort_values(
        'time_index', kind='stable'
    )
    last_windows = {
        player_id: player_rows.tail(sequence_length)
        for player_id, player_rows in history.groupby('player_id', sort=False)
    }

    # Keep the output in the order the players appear in the 2023 rows.
    eligible_players = []
    eligible_windows = []
    for player_id in recent_players:
        window = last_windows.get(player_id)
        # Only predict if we have enough historical data
        if window is not None and len(window) == sequence_length:
            eligible_players.append(player_id)
            eligible_windows.append(window[feature_columns])

    if not eligible_players:
        return pd.DataFrame(columns=output_columns)

    # Scale a DataFrame (not a bare array) so a scaler fitted on named
    # columns sees the same feature names it was fitted with.
    stacked_windows = pd.concat(eligible_windows)
    scaled = np.asarray(scaler.transform(stacked_windows))

    # Rows are stacked player by player, so this reshape restores one
    # (sequence_length, n_features) window per player.
    X = scaled.reshape(len(eligible_players), sequence_length, len(feature_columns))

    # A single batched call instead of one predict() call per player.
    predicted = np.asarray(model.predict(X, verbose=0)).reshape(-1)

    return pd.DataFrame(
        {
            'player_id': eligible_players,
            'week': 1,  # Predicting week 1 of 2024
            'season': 2024,
            'predicted_fantasy_points': predicted,
        },
        columns=output_columns,
    )