## Neural Networks for Longitudinal Data Analysis

# Setup

## Load/install packages

In [None]:
# ## Install/load pacman for help with package loading
# install.packages('pacman')
# library(pacman)

# ## Load/install packages
# pacman::p_load(
#     ## Needed for data pre-processing and modeling 
#     dplyr, tidyverse, readxl, padr, gridExtra, caret, tictoc, 

#     ## Data pre-processing only
#     rsample, lubridate, fastDummies, Lahman , 

#     ## Modeling only
#     keras, tensorflow  
#     )

In [None]:
library(tidyverse)
library(keras)
library(tensorflow)
library(Lahman)
library(padr)
library(lubridate)
library(fastDummies)
library(rsample)
library(readxl)
library(gridExtra)
library(caret)
library(tictoc)

## Set parameters

In [None]:
## Global settings for the demo. Everything in this cell is a plain top-level
## variable; the scripts sourced later in the notebook presumably read these
## by name (confirm in the R/ scripts before renaming anything).

## Random seed (passed to set.seed() further down in this notebook)
random_seed <- 100

## Filtering criteria for the baseball player example.
## Minimum plate appearances (PA), intended to focus the data on hitters.
## NOTE(review): the original note cites a suggestion from Jimmy E. to require
## PA > 100, but the threshold set here is 85 -- confirm which is intended.
min_plate_appearances <- 85
## Earliest year to include so that stadium attendance is captured
min_year <- 1916

## Fractions of the data assigned to the training, testing, and validation
## data frames. Whatever is not used for training is split evenly between
## testing and validation, so the three fractions sum to 1.
pct_train <- 0.80
pct_test <- 0.50 * (1 - pct_train)
pct_valid <- pct_test

## Number of periods to look back (for attributes) and forward (for prediction)
look_back <- 3
look_forward <- 1
## Combined window length: look-back periods plus look-forward periods
min_length <- look_back + look_forward
#max_length = 20
#above_pct = 0.50


## Slash line outcomes
slash_line_outcomes <- c(
  'Batting_BattingAverage',
  'Batting_OnBasePct',
  'Batting_SlugPct'
)
## Same outcome names with the 'Batting_' prefix removed. A direct base-R call
## is used so this line does not depend on the magrittr pipe being attached.
slash_line_outcomes_fmt <- gsub('Batting_', '', slash_line_outcomes)

## Download files from Github

In [None]:
## Create a subfolder called 'R' to hold R scripts downloaded from demo Github repo
# dir.create('R')

In [None]:
# ## repo_path: direct path to Github repository
# ## filename: Name of file in the Github repo path
# get_file_from_github = function(repo_path, filename) {
#     url = paste0(repo_path, filename)
#     destfile = paste0('R/', filename)
#     download.file(url=url, destfile=destfile)
#     message(paste('Successfully downloaded', filename))
# }

In [None]:
# baseball_demo_repo_path = "https://raw.githubusercontent.com/sydeaka/neural_networks_longitudinal/master/R/"

# #get_file_from_github(repo_path = baseball_demo_repo_path, '00-config.R')
# get_file_from_github(repo_path = baseball_demo_repo_path, '01-data-processing_helper-functions.R')
# get_file_from_github(repo_path = baseball_demo_repo_path, '02-create_modeling_dataset.R')
# get_file_from_github(repo_path = baseball_demo_repo_path, '03-data-formatting.R')
# get_file_from_github(repo_path = baseball_demo_repo_path, '04-modeling-helper-functions.R')

## Source R scripts from Github

In [None]:
## Run the data-processing helper script so that its definitions are
## available to the rest of the notebook
source("R/01-data-processing_helper-functions.R")

In [None]:
## Run the script that builds the baseball dataset used in this demo
source("R/02-create_modeling_dataset.R")

In [None]:
## Run the script that reshapes the data into the formats required for modeling
source("R/03-data-formatting.R")

In [None]:
## Preview the first rows of dat_train_scaled (not defined in this notebook;
## presumably created by one of the scripts sourced above)
head(dat_train_scaled)

In [None]:
## Run the script containing the modeling functions.
## The example function assumes the models under consideration are built from:
##   (1) one or more Gated Recurrent Unit (GRU) layers,
##   (2) one or more fully connected dense layers, and
##   (3) a final output layer with one node per outcome to be predicted.
## Other architectures are worth trying too, e.g. replacing GRU with LSTM.
source("R/04-modeling-helper-functions.R")

## Set modeling parameters

In [None]:
## Candidate optimizers for the hyperparameter grid, keyed by name
optimizers <- list(
  adam = optimizer_adam,
  rmsprop = optimizer_rmsprop,
  sgd = optimizer_sgd
)

## If the loss function doesn't decrease by at least this amount,
## drop the learning rate and/or impose early stopping.
## Scales with the number of target columns.
min_delta <- 0.001 * length(target_col_names)

## Hyperparameter grid. Generally, each entry would hold more than one
## candidate value; `random_fit` will randomly select values from this grid
## and use them to fit a model.
params <- list(
  ## Number of fully connected dense layers
  num_dense_layers = 1,
  ## Number of GRU layers
  num_recurrent_layers = 1,
  ## Batch size
  batch_size = c(85, 90, 100),
  ## Number of units to include in the dense layer
  num_dense_units = 900,
  ## Names of the optimizers to consider
  optimizer_name = names(optimizers),
  ## Activation to use in the dense layers
  dense_activation = "relu",
  ## Learning rates
  learning_rate = 0.000001,
  ## Number of nodes in the recurrent layers
  lstm_nodes = 1500,
  ## Dropout for recurrent layers
  lstm_dropout = 0.10,
  ## Number of epochs
  num_epochs = 10,
  ## Alpha to use for leaky relu
  alpha = 0.5,
  ## Number of iterations in which the loss is allowed to not decrease
  ## before the learning rate is automatically reduced
  patience_lr = 4,
  ## Maximum number of times to reduce the learning rate
  max_lr_reductions = 10,
  ## New learning rate is this fraction of the previous learning rate
  lr_reduce_factor = 0.60,
  ## Improvement thresholds; see the note for min_delta above
  min_delta_lr = min_delta,
  min_delta_es = min_delta,
  ## Weights for outcome-specific losses used to create the overall loss
  loss_weights = list(1, 1, 1),
  ## Regularization term for L-1 regularization
  l1_term = 0.0001,
  ## Regularization term for L-2 regularization
  l2_term = 0
)

## Set up the leaderboard and output file name

In [None]:
## Leaderboard and output bookkeeping. These are plain top-level variables;
## presumably random_fit() reads and updates them -- confirm in
## R/04-modeling-helper-functions.R.
testing <- FALSE        # TRUE/FALSE spelled out: T and F are reassignable
leader <- NULL          # leaderboard starts out empty
model_output <- list()  # starts as an empty list

## Name of the results file. `testing` is a single logical, so a plain if/else
## is used instead of the vectorised ifelse(). The timestamp is formatted
## explicitly so the file name contains no spaces or colons (colons are not
## allowed in Windows file names).
output_csv <- if (isTRUE(testing)) {
  'output_test.csv'
} else {
  paste0('modeling_output_', format(Sys.time(), '%Y-%m-%d_%H-%M-%S'), '.csv')
}
cat('Results saved to', output_csv, '\n')

## Seed R's random number generator with the notebook-level seed
set.seed(random_seed)

## Fit the model(s)

In [None]:
# First run is a test run, to make sure the fitting code works end to end.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
random_fit(params = params, testing = TRUE)

In [None]:
# Fit a model using values drawn from the hyperparameter grid in `params`
random_fit(params = params)

In [None]:
# Display the leaderboard object. It is initialised to NULL earlier in this
# notebook; presumably random_fit() fills it in -- confirm in
# R/04-modeling-helper-functions.R.
leader