In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
def load_and_concatenate_data(folder_path):
    # List to hold individual dataframes
    data_frames = []
    
    # Loop through all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith("_pfr_fantasy.csv"):
            # Extract year from file name
            year_suffix = int(file_name[:2])
            
            # Determine the full year
            if year_suffix < 24:
                year = 2000 + year_suffix
            else:
                year = 1900 + year_suffix
            
            # Construct full file path
            file_path = os.path.join(folder_path, file_name)
            
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Add a 'year' column
            df['Year'] = year
            
            # Append the DataFrame to the list
            data_frames.append(df)
    
    # Concatenate all DataFrames
    combined_df = pd.concat(data_frames, ignore_index=True)
    return combined_df


In [3]:
def preprocess_data(df):
    # Define numerical and categorical columns
    numerical_cols = [
        'Age', 'G-Games', 'GS-Games', 'Cmp-Passing', 'Att-Passing', 'Yds-Passing', 
        'TD-Passing', 'Int-Passing', 'Att-Rushing', 'Yds-Rushing', 'Y/A-Rushing', 
        'TD-Rushing', 'Tgt-Receiving', 'Rec-Receiving', 'Yds-Receiving', 
        'Y/R-Receiving', 'TD-Receiving', 'Fmb-Fumbles', 'FL-Fumbles', 
        'TD-Scoring', '2PM-Scoring', '2PP-Scoring'
    ]
    categorical_cols = ['FantPos', 'Tm']
    
    # Preprocessing for numerical data: Fill missing values with zero and standardize
    numerical_transformer = Pipeline(steps=[
        ('fillna', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler())
    ])
    
    # Preprocessing for categorical data: Fill missing values with a placeholder and one-hot encode
    categorical_transformer = Pipeline(steps=[
        ('fillna', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    
    # Fit and transform the data
    df_preprocessed = preprocessor.fit_transform(df)
    
    # Convert to DataFrame
    df_preprocessed = pd.DataFrame(
        df_preprocessed, 
        columns=numerical_cols + list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols))
    )
    
    # Add back non-transformed columns
    df_preprocessed['Year'] = df['Year'].values
    df_preprocessed['Player'] = df['Player'].values
    df_preprocessed['PlayerCode'] = df['PlayerCode'].values
    df_preprocessed['Rk'] = df['Rk'].values
    
    return df_preprocessed, preprocessor


In [4]:
# def create_player_sequences(df, target_columns):
#     # Sort data by PlayerCode and Year to ensure chronological order
#     df_sorted = df.sort_values(by=['PlayerCode', 'Year'])
    
#     sequences = []
#     targets = []
#     player_codes = df_sorted['PlayerCode'].unique()
    
#     # Generate sequences for each player
#     for player_code in player_codes:
#         player_data = df_sorted[df_sorted['PlayerCode'] == player_code]
#         if len(player_data) < 2:
#             continue
        
#         # Use all available seasons for each player
#         X_seq = player_data.drop(columns=['Year', 'Player', 'PlayerCode', 'Rk'] + target_columns).values
#         y_seq = player_data[target_columns].values
        
#         sequences.append(X_seq)
#         targets.append(y_seq)
    
#     return np.array(sequences), np.array(targets)


In [5]:
# def create_loocv_datasets(sequences, targets):
#     datasets = []
#     num_players = len(sequences)
    
#     for i in range(num_players):
#         # Create test set using the i-th player's sequence
#         X_test = sequences[i]
#         y_test = targets[i]
        
#         # Create training set using all other players' sequences
#         X_train = np.concatenate([sequences[j] for j in range(num_players) if j != i], axis=0)
#         y_train = np.concatenate([targets[j] for j in range(num_players) if j != i], axis=0)
        
#         # Create TensorFlow datasets
#         train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
#         test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(1)
        
#         datasets.append((train_dataset, test_dataset))
    
#     return datasets


In [6]:
# # Load and concatenate data
# folder_path = '/Users/harrisonward/Desktop/CS/Git/final_fantasy/data'
# combined_df = load_and_concatenate_data(folder_path)

# # Define the target columns
# target_columns = [
#     'Yds-Passing', 'TD-Passing', 'Int-Passing', 'Yds-Rushing', 'TD-Rushing', 
#     'Tgt-Receiving', 'Rec-Receiving', 'Yds-Receiving', 'TD-Receiving', 'Fmb-Fumbles'
# ]

# # Preprocess the data
# df_preprocessed, preprocessor = preprocess_data(combined_df)

# # Add the Year, Player, PlayerCode, and Rk columns back to the preprocessed DataFrame
# df_preprocessed['Year'] = combined_df['Year']
# df_preprocessed['Player'] = combined_df['Player']
# df_preprocessed['PlayerCode'] = combined_df['PlayerCode']
# df_preprocessed['Rk'] = combined_df['Rk']

# # Create player sequences
# sequences, targets = create_player_sequences(df_preprocessed, target_columns)

# # Create LOOCV datasets
# datasets = create_loocv_datasets(sequences, targets)

# # Define the model
# model = tf.keras.Sequential([
#     tf.keras.layers.LSTM(64, activation='relu', input_shape=(None, sequences.shape[-1])),
#     tf.keras.layers.Dense(targets.shape[-1])
# ])

# model.compile(optimizer='adam', loss='mse')

# # Train and evaluate with LOOCV
# mean_squared_errors = []
# for train_dataset, test_dataset in datasets:
#     model.fit(train_dataset, epochs=10, verbose=0)
    
#     for X_test, y_test in test_dataset:
#         y_pred = model.predict(X_test)
#         mse = tf.keras.losses.mean_squared_error(y_test, y_pred)
#         mean_squared_errors.append(tf.reduce_mean(mse).numpy())

# # Print the average mean squared error
# print(f'Average Mean Squared Error: {np.mean(mean_squared_errors)}')

NEW SPEC

In [7]:
def create_sequences_and_targets(df, target_columns, sequence_length):
    # Sort data by PlayerCode and Year to ensure chronological order
    df_sorted = df.sort_values(by=['PlayerCode', 'Year'])
    
    sequences = []
    targets = []
    player_codes = df_sorted['PlayerCode'].unique()
    
    for player_code in player_codes:
        player_data = df_sorted[df_sorted['PlayerCode'] == player_code]
        
        # Ensure we have enough data to create at least one sequence
        if len(player_data) <= sequence_length:
            continue
        
        # Create sequences and corresponding targets
        for i in range(len(player_data) - sequence_length):
            X_seq = player_data.iloc[i:i+sequence_length].drop(columns=['Year', 'Player', 'PlayerCode', 'Rk'] + target_columns).values
            y_target = player_data.iloc[i+sequence_length][target_columns].values
            
            sequences.append(X_seq)
            targets.append(y_target)
    
    # Pad sequences to have the same length
    sequences_padded = pad_sequences(sequences, padding='post', dtype='float32')
    targets = np.array(targets, dtype='float32')
    
    return np.array(sequences_padded), targets

def create_loocv_datasets(sequences, targets):
    datasets = []
    num_samples = len(sequences)
    
    for i in range(num_samples):
        # Create test set using the i-th sample's sequence
        X_test = sequences[i]
        y_test = targets[i]
        
        # Expand dims to ensure the correct shape (1, sequence_length, num_features)
        X_test = np.expand_dims(X_test, axis=0)
        y_test = np.expand_dims(y_test, axis=0)
        
        # Create training set using all other samples' sequences
        X_train = np.delete(sequences, i, axis=0)
        y_train = np.delete(targets, i, axis=0)
        
        # Shuffle the training set
        indices = np.arange(X_train.shape[0])
        np.random.shuffle(indices)
        X_train = X_train[indices]
        y_train = y_train[indices]
        
        # Create TensorFlow datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=len(X_train)).batch(32)
        test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(1)
        
        datasets.append((train_dataset, test_dataset))
    
    return datasets


In [8]:
# Load and concatenate data
folder_path = '/Users/harrisonward/Desktop/CS/Git/final_fantasy/data'
combined_df = load_and_concatenate_data(folder_path)

# Define the target columns
target_columns = [
    'Yds-Passing', 'TD-Passing', 'Int-Passing', 'Yds-Rushing', 'TD-Rushing', 
    'Tgt-Receiving', 'Rec-Receiving', 'Yds-Receiving', 'TD-Receiving', 'Fmb-Fumbles'
]

# Preprocess the data
df_preprocessed, preprocessor = preprocess_data(combined_df)

# Add the Year, Player, PlayerCode, and Rk columns back to the preprocessed DataFrame
df_preprocessed['Year'] = combined_df['Year']
df_preprocessed['Player'] = combined_df['Player']
df_preprocessed['PlayerCode'] = combined_df['PlayerCode']
df_preprocessed['Rk'] = combined_df['Rk']

# Create sequences and targets with padding
sequence_length = 3  # Example sequence length
sequences, targets = create_sequences_and_targets(df_preprocessed, target_columns, sequence_length)

# Create LOOCV datasets with shuffling
datasets = create_loocv_datasets(sequences, targets)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=0.0, input_shape=(sequence_length, sequences.shape[-1])),
    # tf.keras.layers.Conv1D(filters=32, kernel_size=2, activation='relu'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(targets.shape[-1])
])

model.compile(optimizer='adam', loss='mse')

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train and evaluate with LOOCV
mean_squared_errors = []
for train_dataset, test_dataset in datasets:
    history = model.fit(train_dataset, epochs=20, verbose=1, validation_data=test_dataset, callbacks=[early_stopping])
    
    for X_test, y_test in test_dataset:
        y_pred = model.predict(X_test)
        mse = tf.keras.losses.mean_squared_error(y_test, y_pred)
        mean_squared_errors.append(tf.reduce_mean(mse).numpy())

# Print the average mean squared error
print(f'Average Mean Squared Error: {np.mean(mean_squared_errors)}')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 1/20
Epo

In [None]:
# plot the training data from our model 
def plot_training_history(history):
    loss = history.history['loss']
    try:
        val_loss = history.history['val_loss']
        vd = True
    except(KeyError):
        vd = False
    epochs = range(1,len(loss)+1)

    plt.figure()
    plt.plot(epochs, loss, linewidth=1.5, label='Training loss')
    if vd:
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()

plot_training_history(history)