In [2]:
# data
import nfl_data_py as nfl

# data loading and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# models
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier, plot_importance

# interpretation
import shap
from interpret import show

# pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.feature_selection import RFECV, RFE
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, nan_euclidean_distances

pd.set_option('display.max_columns', None)  # None zeigt alle Spalten
pd.set_option('display.max_rows', None)  # Alle Zeilen anzeigen, vorsichtig bei großen DataFrames
pd.set_option('display.width', 1000)  # Breite anpassen


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
df_ids = nfl.import_ids()
df_weekly = nfl.import_weekly_data(list(range(2018, 2025)))
df_seasonal = nfl.import_seasonal_data(list(range(2017,2024)))
df_schedule = nfl.import_schedules(list(range(2018, 2025)))
df_pass_pfr = nfl.import_weekly_pfr('pass', list(range(2018, 2025)))
df_rush_pfr = nfl.import_weekly_pfr('rush', list(range(2018, 2025)))
df_rec_pfr = nfl.import_weekly_pfr('rec', list(range(2018, 2025)))
df_pass_ngs = nfl.import_ngs_data('passing',list(range(2018, 2025)))
df_rush_ngs = nfl.import_ngs_data('rushing',list(range(2018, 2025)))
df_rec_ngs = nfl.import_ngs_data('receiving',list(range(2018, 2025)))
df_snap_counts = nfl.import_snap_counts(list(range(2018, 2025)))

df_weekly = df_weekly[(df_weekly['season_type'] == 'REG') & (df_weekly['position'].isin(['QB', 'WR', 'RB', 'TE']))]

df_weekly['game_id_home_away'] = df_weekly['season'].astype(str) + '_' + df_weekly['week'].apply(lambda x: f"{x:02d}")+'_'+df_weekly['recent_team']+'_'+df_weekly['opponent_team']
df_weekly['game_id_away_home'] = df_weekly['season'].astype(str) + '_' + df_weekly['week'].apply(lambda x: f"{x:02d}")+'_'+df_weekly['opponent_team']+'_'+df_weekly['recent_team']

df_ids = df_ids.rename(columns={'gsis_id': 'player_id', 'pfr_id': 'pfr_player_id'})
df_pass_ngs = df_pass_ngs.rename(columns={'player_gsis_id': 'player_id'})
df_rush_ngs = df_rush_ngs.rename(columns={'player_gsis_id': 'player_id'})
df_rec_ngs = df_rec_ngs.rename(columns={'player_gsis_id': 'player_id'})

df_seasonal['season'] = df_seasonal['season'] + 1

df_merged = pd.melt(
    df_weekly,
    id_vars=['player_id', 'position', 'season', 'week', 'recent_team', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'passing_2pt_conversions', 'interceptions', 'sack_fumbles_lost', 'sacks', 'sack_yards', 'passing_air_yards', 'passing_epa', 'pacr', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_2pt_conversions', 'rushing_fumbles_lost', 'rushing_epa', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_2pt_conversions', 'receiving_fumbles_lost', 'racr', 'wopr', 'receiving_epa', 'fantasy_points'],
    value_vars=['game_id_home_away', 'game_id_away_home'],
    var_name='game_id_type',
    value_name='game_id'
)

df_merged = pd.merge(df_merged, df_ids[['player_id', 'pfr_player_id', 'draft_pick', 'draft_year']], on = 'player_id', how = 'inner')
df_merged = pd.merge(df_merged, df_seasonal[['player_id', 'season', 'dom']], on = ['player_id', 'season'], how = 'left')
df_merged = pd.merge(df_merged, df_schedule[['game_id', 'home_team']], on='game_id', how='inner')
df_merged = pd.merge(df_merged, df_pass_pfr[['pfr_player_id', 'season', 'week', 'passing_bad_throws', 'times_pressured']], on = ['pfr_player_id', 'season', 'week'], how = 'left')
df_merged = pd.merge(df_merged, df_rec_pfr[['pfr_player_id', 'season', 'week', 'receiving_rat']], on = ['pfr_player_id', 'season', 'week'], how = 'left')
df_merged = pd.merge(df_merged, df_rush_pfr[['pfr_player_id', 'season', 'week', 'rushing_broken_tackles']], on = ['pfr_player_id', 'season', 'week'], how = 'left')
df_merged = pd.merge(df_merged, df_pass_ngs[['player_id', 'season', 'week', 'passer_rating', 'aggressiveness']], on = ['player_id', 'season', 'week'], how = 'left')
df_merged = pd.merge(df_merged, df_rec_ngs[['player_id', 'season', 'week', 'catch_percentage']], on = ['player_id', 'season', 'week'], how = 'left')
df_merged = pd.merge(df_merged, df_rush_ngs[['player_id', 'season', 'week', 'efficiency']], on = ['player_id', 'season', 'week'], how = 'left')
df_merged = pd.merge(df_merged, df_snap_counts[['pfr_player_id', 'season', 'week', 'offense_snaps']], on = ['pfr_player_id', 'season', 'week'], how = 'left')

df_merged = df_merged.drop(columns=['game_id', 'game_id_type', 'pfr_player_id'])

df_merged.info()

Downcasting floats.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35195 entries, 0 to 35194
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   player_id                  35195 non-null  object 
 1   position                   35195 non-null  object 
 2   season                     35195 non-null  int32  
 3   week                       35195 non-null  int32  
 4   recent_team                35195 non-null  object 
 5   opponent_team              35195 non-null  object 
 6   completions                35195 non-null  int32  
 7   attempts                   35195 non-null  int32  
 8   passing_yards              35195 non-null  float32
 9   passing_tds                35195 non-null  int32  
 10  passing_2pt_conversions    35195 non-null  int32  
 11  interceptions              35195 non-null  float32
 12  sack_fumbles_lost          35195 non-null  int32  
 13  sacks                     

In [13]:
df_merged = df_merged.fillna(0)

df_merged['rookie_flag'] = (df_merged['season'] == df_merged['draft_year']).astype(int)
df_merged['last_season_data_flag'] = (df_merged['week'] < 6).astype(int)

recent_team_means = df_merged.groupby(['recent_team', 'season', 'week'])['fantasy_points'].mean().reset_index()
opponent_team_means = df_merged.groupby(['opponent_team', 'season', 'week'])['fantasy_points'].mean().reset_index()
position_means = df_merged.groupby(['position', 'season', 'week'])['fantasy_points'].mean().reset_index()

recent_team_means.rename(columns={'fantasy_points': 'recent_team_encoded'}, inplace=True)
opponent_team_means.rename(columns={'fantasy_points': 'opponent_team_encoded'}, inplace=True)
position_means.rename(columns={'fantasy_points': 'position_encoded'}, inplace=True)

df_merged = pd.merge(df_merged, recent_team_means, on=['recent_team', 'season', 'week'], how='left')
df_merged = pd.merge(df_merged, opponent_team_means, on=['opponent_team', 'season', 'week'], how='left')
df_merged = pd.merge(df_merged, position_means, on=['position', 'season', 'week'], how='left')

df_merged['turnover'] = (
    df_merged['interceptions'] +
    df_merged['sack_fumbles_lost'] +
    df_merged['rushing_fumbles_lost'] +
    df_merged['receiving_fumbles_lost']
)

df_merged['rushing_pts'] = (df_merged['rushing_tds'] * 6) + (df_merged['rushing_2pt_conversions'] * 2)
df_merged['receiving_pts'] = (df_merged['receiving_tds'] * 6) + (df_merged['receiving_2pt_conversions'] * 2)
df_merged['passing_pts'] = (df_merged['passing_tds'] * 6) + (df_merged['passing_2pt_conversions'] * 2)

df_merged['epa_total'] = df_merged['passing_epa'] + df_merged['rushing_epa'] + df_merged['receiving_epa']

df_merged = df_merged.drop(columns=['draft_year', 'interceptions', 'sack_fumbles_lost', 'rushing_fumbles_lost', 'receiving_fumbles_lost', 
                                    'rushing_tds', 'rushing_2pt_conversions', 'receiving_tds', 'receiving_2pt_conversions', 'passing_tds', 
                                    'passing_2pt_conversions', 'passing_epa', 'rushing_epa', 'receiving_epa'])

In [14]:
# Liste der Spalten, für die Rolling-Features erstellt werden sollen
columns_to_roll = ['completions', 'attempts', 'passing_yards', 'sacks', 'passer_rating', 'aggressiveness', 'catch_percentage', 'efficiency',
                   'sack_yards', 'passing_air_yards', 'pacr', 'carries', 'offense_snaps', 'recent_team_encoded', 'opponent_team_encoded', 
                   'rushing_yards', 'receptions', 'targets', 'receiving_yards', 'racr', 'wopr', 'fantasy_points', 'passing_bad_throws', 
                   'times_pressured', 'position_encoded', 'epa_total', 'receiving_rat', 'rushing_broken_tackles', 'turnover', 'rushing_pts', 
                   'receiving_pts', 'passing_pts']

# Funktion zum Erstellen von Rolling-Features
def create_rolling_features(df):

    # Sortiere nach player_id, season und week
    df = df.sort_values(by=['player_id', 'season', 'week']).reset_index(drop=True)

    df['cnt_games_over_20ffpts_l5w'] = (
        df.groupby('player_id')['fantasy_points']
        .apply(lambda x: x.shift(1).rolling(window=5, min_periods=5).apply(lambda y: (y > 20).sum()))
        .reset_index(level=0, drop=True)
    )

    # Rolling-Features erstellen
    for col in columns_to_roll:

        feature_name_1 = f"ewm_{col}_l5w"
        df[feature_name_1] = (
            df.groupby('player_id')[col]
            .apply(lambda x: x.shift(1).ewm(span=5, min_periods=5).mean())
            .reset_index(level=0, drop=True)
        )

        for metric in ['mean', 'median', 'std']:
            feature_name_2 = f"{metric}_{col}_l5w"
            rolling_result_5w = (
                df.groupby('player_id')[col]
                  .apply(lambda x: x.shift(1).rolling(window=5, min_periods=5).agg(metric))  # shift(1) schließt aktuelle Woche aus
                  .reset_index(level=0, drop=True)  # Index zurücksetzen
            )
            # Einfügen der Rolling-Metrik
            df[feature_name_2] = rolling_result_5w

        for metric in ['max', 'min']:
            feature_name_3 = f"{metric}_{col}_l3w"
            # Berechnung der Rolling-Metrik (ohne aktuelle Woche)
            rolling_result_3w = (
                df.groupby('player_id')[col]
                  .apply(lambda x: x.shift(1).rolling(window=3, min_periods=3).agg(metric))  # shift(1) schließt aktuelle Woche aus
                  .reset_index(level=0, drop=True)  # Index zurücksetzen
            )
            # Einfügen der Rolling-Metrik
            df[feature_name_3] = rolling_result_3w

    return df


# Anwendung auf df_merged
df_merged = create_rolling_features(df_merged)

df_merged.info()

  df[feature_name_3] = rolling_result_3w
  df[feature_name_1] = (
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_1] = (
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_1] = (
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_1] = (
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_2] = rolling_result_5w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_3] = rolling_result_3w
  df[feature_name_1] = (
  df[feature_n

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35195 entries, 0 to 35194
Columns: 236 entries, player_id to min_passing_pts_l3w
dtypes: float32(14), float64(205), int32(12), object(5)
memory usage: 59.9+ MB


  df[feature_name_3] = rolling_result_3w


In [15]:
df_merged = df_merged.dropna()

df_merged = df_merged.drop(columns=['completions', 'attempts', 'passing_yards', 'sacks', 'sack_yards', 'passing_air_yards',  
                                    'pacr', 'carries', 'rushing_yards', 'receptions', 'targets', 'receiving_yards', 'racr', 
                                    'wopr', 'passing_bad_throws', 'times_pressured', 'receiving_rat', 'rushing_broken_tackles', 'turnover', 
                                    'rushing_pts', 'receiving_pts', 'passing_pts', 'home_team', 'passer_rating', 'aggressiveness',
                                    'catch_percentage', 'efficiency', 'offense_snaps', 'recent_team_encoded', 'opponent_team_encoded', 
                                    'position_encoded', 'recent_team', 'opponent_team', 'position', 'epa_total'])

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29633 entries, 5 to 35194
Columns: 201 entries, player_id to min_passing_pts_l3w
dtypes: float32(1), float64(195), int32(4), object(1)
memory usage: 45.1+ MB


In [16]:
df_merged = df_merged.sort_values(['player_id', 'season', 'week'])
df_merged['did_play'] = 1

player_seasons = df_merged[['player_id', 'season']].drop_duplicates()

all_weeks = []

for _, row in player_seasons.iterrows():
    # Assuming weeks go from 1 to 18 for NFL season
    weeks = pd.DataFrame({
        'player_id': row['player_id'],
        'season': row['season'],
        'week': range(1, 19),
    })
    all_weeks.append(weeks)
    
complete_weeks = pd.concat(all_weeks, ignore_index=True)

df_merged = pd.merge(
    complete_weeks,
    df_merged,
    on=['player_id', 'season', 'week'],
    how='left'
)

df_merged = df_merged.fillna(0)

In [17]:
df_seq = df_merged.copy()

df_seq['time_index'] = df_seq['season'] * 100 + df_seq['week']

df_seq = df_seq.sort_values(['player_id', 'time_index'])

### Train Model

In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

def create_sequences(df, sequence_length, feature_columns, target_column='fantasy_points'):
    """
    Create sequences for LSTM model from scaled data.
    """
    sequences = []
    targets = []

    # Sort the data by 'player_id' and a time column (optional for predictability)
    df = df.sort_values(by=['player_id', 'time_index'])
    
    # Group by player_id
    for player_id, player_data in df.groupby('player_id'):
        # Extract feature and target arrays
        player_features = player_data[feature_columns].values
        player_targets = player_data[target_column].values

        # Create sequences for this player
        for i in range(len(player_features) - sequence_length):
            sequences.append(player_features[i:i + sequence_length])
            targets.append(player_targets[i + sequence_length])

    return np.array(sequences), np.array(targets)


def prepare_data(df, feature_columns, target_column, sequence_length, train_cutoff):
    """
    Prepare data for LSTM model including scaling and sequence creation.
    """
    # Split into train and test
    train_df = df[df['time_index'] <= train_cutoff]
    test_df = df[df['time_index'] > train_cutoff]
    
    # Scale features
    scaler = StandardScaler()
    scaler.fit(train_df[feature_columns])
    
    train_scaled = pd.DataFrame(scaler.transform(train_df[feature_columns]), 
                                columns=feature_columns)
    test_scaled = pd.DataFrame(scaler.transform(test_df[feature_columns]), 
                               columns=feature_columns)
    
    # Add back non-feature columns (e.g., 'player_id' and 'time_index')
    train_scaled['player_id'] = train_df['player_id'].values
    train_scaled['time_index'] = train_df['time_index'].values
    train_scaled[target_column] = train_df[target_column].values

    test_scaled['player_id'] = test_df['player_id'].values
    test_scaled['time_index'] = test_df['time_index'].values
    test_scaled[target_column] = test_df[target_column].values

    # Create sequences using scaled data
    X_train, y_train = create_sequences(
        train_scaled,
        sequence_length=sequence_length,
        feature_columns=feature_columns,
        target_column=target_column
    )
    
    X_test, y_test = create_sequences(
        test_scaled,
        sequence_length=sequence_length,
        feature_columns=feature_columns,
        target_column=target_column
    )
    
    return X_train, X_test, y_train, y_test, scaler


def create_model(sequence_length, n_features):
    """
    Create LSTM model architecture.
    """
    model = Sequential([
        LSTM(256, input_shape=(sequence_length, n_features), return_sequences=True),
        Dropout(0.4),
        LSTM(128),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae']
    )
    
    return model

# Get feature columns
def get_feature_columns(df):
    """
    Get list of feature columns excluding non-feature columns.
    """
    # TODO should 'time_index' or other time indicators be a feature or not?
    exclude_columns = [
        'player_id', 
        'season', 
        'week', 
        'fantasy_points'
    ]
    
    return [col for col in df.columns if col not in exclude_columns]

# Main execution
def train_lstm_model(df, target_column='fantasy_points'):
    """
    Main function to train LSTM model with optimized parameters.
    """
    # Set parameters
    sequence_length = 6
    train_cutoff = 202318
    epochs = 100
    batch_size = 64
    validation_split = 0.15
    
    # Get feature columns
    feature_columns = get_feature_columns(df)
    print(f"Number of features: {len(feature_columns)}")
    
    # Prepare data
    X_train, X_test, y_train, y_test, scaler = prepare_data(
        df, 
        feature_columns, 
        target_column,
        sequence_length, 
        train_cutoff
    )
    
    # Create model
    model = create_model(sequence_length, len(feature_columns))
    
    # Early stopping
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        min_delta=0.001
    )
    
    # Train model
    history = model.fit(
        X_train, y_train,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
        verbose=1
    )
    
    # Evaluate model
    train_metrics = model.evaluate(X_train, y_train, verbose=0)
    test_metrics = model.evaluate(X_test, y_test, verbose=0)
    
    print("\nTraining Loss:", train_metrics[0])
    print("Training MAE:", train_metrics[1])
    print("Test Loss:", test_metrics[0])
    print("Test MAE:", test_metrics[1])
    
    return model, history, scaler, feature_columns

# Prediction function
def predict_next_week(player_data, model, scaler, sequence_length, feature_columns):
    """
    Predict next week's fantasy points for a player.
    """
    # Get last sequence_length weeks of data
    recent_data = player_data.tail(sequence_length)[feature_columns].values
    
    # Scale the data
    scaled_data = scaler.transform(recent_data)
    
    # Reshape for prediction
    X = scaled_data.reshape(1, sequence_length, len(feature_columns))
    
    # Make prediction
    prediction = model.predict(X)
    return prediction[0][0]

In [44]:
model, history, scaler, feature_columns = train_lstm_model(df_seq)

Number of features: 199


  super().__init__(**kwargs)


Epoch 1/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - loss: 43.3915 - mae: 4.6436 - val_loss: 39.3294 - val_mae: 4.5248
Epoch 2/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 44.1260 - mae: 4.8236 - val_loss: 39.7617 - val_mae: 4.3253
Epoch 3/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - loss: 42.6614 - mae: 4.6977 - val_loss: 39.3308 - val_mae: 4.5228
Epoch 4/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 42.5438 - mae: 4.7190 - val_loss: 39.4504 - val_mae: 4.4337
Epoch 5/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - loss: 42.2233 - mae: 4.7179 - val_loss: 39.3289 - val_mae: 4.6336
Epoch 6/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - loss: 42.3414 - mae: 4.7349 - val_loss: 39.3769 - val_mae: 4.4779
Epoch 7/100
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [45]:
# Example player data
specific_player_id = '00-0023459'
player_data = df_seq[df_seq['player_id'] == specific_player_id]

prediction = predict_next_week(player_data, model, scaler, sequence_length=6, feature_columns=feature_columns)
print(f"Predicted Fantasy Points for Player {specific_player_id}: {prediction}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
Predicted Fantasy Points for Player 00-0023459: 4.027355670928955


In [None]:
predc