## Setup

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from tqdm import tqdm
import seaborn as sns
import wandb
from wandb.keras import WandbCallback
import keras
from keras.models import Sequential

from pfutils import (get_test_data, get_train_data, get_pseudo_test_data, get_exponential_decay_lr_callback,
                     build_model, get_cosine_annealing_lr_callback, get_fold_indices, DataGenerator)

# Log every fold as a Weights & Biases run (forced to False below when SUBMIT is True).
WANDB = True
# True: load the competition test.csv, predict on it and write submission.csv.
SUBMIT = False
# True: feed training through pfutils.DataGenerator instead of in-memory x/y frames.
DATA_GENERATOR = True
# Forwarded to get_train_data; presumably also builds training pairs that look
# backwards in time — TODO confirm against pfutils.
TRAIN_ON_BACKWARD_WEEKS = False

# Number of patients passed to get_train_data / get_pseudo_test_data to simulate
# a test set when SUBMIT is False (the original note said "TEST", but no such flag
# exists in this notebook). Should be 0 if SUBMIT = True.
PSEUDO_TEST_PATIENTS = 0

In [None]:
# A submission run must not hold out pseudo-test patients and must not log to W&B.
if SUBMIT:
    PSEUDO_TEST_PATIENTS, WANDB = 0, False

In [None]:
if WANDB:    
    # retrieve W&B key
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    wandb_key = user_secrets.get_secret("wandb_key")
    assert wandb_key, "Please create a key.txt or Kaggle Secret with your W&B API key"

    #wandb_key = "24020b558f39257d30a084a55cb438922c321495"

    !pip install -q --upgrade wandb
    !wandb login $wandb_key

## Settings and Network

In [None]:
# ---------------------------------------------------------------------------
# Cross-validation / batching
# ---------------------------------------------------------------------------
# Number of folds. A number between 1 and 176-PSEUDO_TEST_PATIENTS
FOLDS = 10
BATCH_SIZE = 128

# ---------------------------------------------------------------------------
# Network architecture
# ---------------------------------------------------------------------------
# Number of input features fed to the network
NUMBER_FEATURES = 9
# Width of each hidden layer, in order
HIDDEN_LAYERS = [64, 64]
# Activation function to use ('swish' or 'relu')
ACTIVATION_FUNCTION = 'swish'
# Whether the model predicts a slope or the value at single weeks.
# Predicting the slope assumes that the decrease is linear.
PREDICT_SLOPE = False

# ---------------------------------------------------------------------------
# Augmentation: Gaussian noise
# ---------------------------------------------------------------------------
# The reported std error for FVC measurement devices is 70.
# All values range approximately from 0 to 1 except FVC, which is between 0 and 6688.
# A 0.01 change on Weeks corresponds to 1 week; Week_diff is changed accordingly.
# NOISE_SDS : [Weeks, FVC, Percent, Age, Sex, CurrentlySmokes, Ex-smoker, Never Smoked]
NOISE_SDS = [0.05, 70, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
# True: the Gaussians added to X and y are perfectly correlated; False: independent
GAUSSIAN_NOISE_CORRELATED = True

# ---------------------------------------------------------------------------
# Loss and regularisation
# ---------------------------------------------------------------------------
# Experimenting with loss
MODIFIED_LOSS = True
# Dropout rate, and the layers it is applied to
DROP_OUT_RATE = 0
DROP_OUT_LAYERS = []  # e.g. [0,1,2] for dropout in the first 3 layers
# L2 regularisation switch and strength
L2_REGULARIZATION = False
REGULARIZATION_CONSTANT = 1e-4
# Input and/or output normalization
INPUT_NORMALIZATION = True
OUTPUT_NORMALIZATION = True

# ---------------------------------------------------------------------------
# Optimisation schedule
# ---------------------------------------------------------------------------
# Training length
EPOCHS = 250
# Learning-rate schedule: 'exp', 'cos' or None
LEARNING_RATE_SCHEDULER = 'exp'
MAX_LEARNING_RATE = 1e-3
COSINE_CYCLES = 5
# OoM : Order of Magnitude
EPOCHS_PER_OOM_DECAY = 100

MODEL_NAME = "TestExpDecayAdam"

# Single dictionary handed to the pfutils helpers (and to W&B as the run config).
config = {
    "NUMBER_FEATURES": NUMBER_FEATURES,
    "L2_REGULARIZATION": L2_REGULARIZATION,
    "INPUT_NORMALIZATION": INPUT_NORMALIZATION,
    "ACTIVATION_FUNCTION": ACTIVATION_FUNCTION,
    "DROP_OUT_RATE": DROP_OUT_RATE,
    "OUTPUT_NORMALIZATION": OUTPUT_NORMALIZATION,
    "EPOCHS": EPOCHS,
    "MAX_LEARNING_RATE": MAX_LEARNING_RATE,
    "MODIFIED_LOSS": MODIFIED_LOSS,
    "NOISE_SDS": NOISE_SDS,
    "COSINE_CYCLES": COSINE_CYCLES,
    "MODEL_NAME": MODEL_NAME,
    "LEARNING_RATE_SCHEDULER": LEARNING_RATE_SCHEDULER,
    "PREDICT_SLOPE": PREDICT_SLOPE,
    "HIDDEN_LAYERS": HIDDEN_LAYERS,
    "REGULARIZATION_CONSTANT": REGULARIZATION_CONSTANT,
    "EPOCHS_PER_OOM_DECAY": EPOCHS_PER_OOM_DECAY,
    "DROP_OUT_LAYERS": DROP_OUT_LAYERS,
    "BATCH_SIZE": BATCH_SIZE,
    "GAUSSIAN_NOISE_CORRELATED": GAUSSIAN_NOISE_CORRELATED,
}

In [None]:
# Locations of the competition files inside the Kaggle input directory.
train_csv_path = '../input/osic-pulmonary-fibrosis-progression/train.csv'
test_csv_path = "../input/osic-pulmonary-fibrosis-progression/test.csv"

# Real test set: only needed when producing a submission.
if SUBMIT:
    test_data, submission = get_test_data(test_csv_path, INPUT_NORMALIZATION)

# Training frame, model inputs and labels (PSEUDO_TEST_PATIENTS is forwarded to the loader).
train, data, labels = get_train_data(
    train_csv_path,
    PSEUDO_TEST_PATIENTS,
    INPUT_NORMALIZATION,
    TRAIN_ON_BACKWARD_WEEKS,
)

# Pseudo test set drawn from train.csv, together with the frame used to check predictions.
if PSEUDO_TEST_PATIENTS > 0:
    test_data, test_check = get_pseudo_test_data(
        train_csv_path,
        PSEUDO_TEST_PATIENTS,
        INPUT_NORMALIZATION,
    )

In [None]:
# Build one model up front only to print its architecture; the training loop
# further down constructs a fresh model for every fold.
model = build_model(config)
# Optional: uncomment to render the layer graph as an image.
#tf.keras.utils.plot_model(model)
model.summary()

## Folds and Training

In [None]:
# Row offsets into `train` that delimit the folds: the training loop validates
# fold k on rows fold_pos[k]:fold_pos[k+1] and trains on the rest.
fold_pos = get_fold_indices(FOLDS, train)
print(fold_pos)

In [None]:
# Dump features and labels to .npy files in the working directory when the
# generator-based training path is enabled.
if DATA_GENERATOR:
    generator_columns = [
        "Weeks", "FVC", "Percent", "Age", "Sex",
        "Currently smokes", "Ex-smoker", "Never smoked", "Weekdiff_target",
    ]
    train_data = train[generator_columns]
    train_labels = labels
    for npy_name, frame in (("train_data.npy", train_data),
                            ("train_labels.npy", train_labels)):
        np.save(npy_name, frame.to_numpy())

In [None]:
# Per-fold predictions on the (pseudo) test set; stays empty when neither
# SUBMIT nor PSEUDO_TEST_PATIENTS requests a prediction pass.
predictions = []

for fold in range(FOLDS):
    # Rows [val_start, val_stop) form the validation block of this fold.
    val_start, val_stop = fold_pos[fold], fold_pos[fold + 1]

    if DATA_GENERATOR:
        # The generators receive row indices rather than the data itself.
        train_ID = list(range(fold_pos[0], val_start)) + list(range(val_stop, len(train)))
        val_ID = list(range(val_start, val_stop))
        training_generator = DataGenerator(train_ID, config)
        validation_generator = DataGenerator(val_ID, config, validation = True)
    else:
        # pd.concat replaces DataFrame/Series.append, which was removed in pandas 2.0.
        x_train = pd.concat([data["input_features"][:val_start], data["input_features"][val_stop:]])
        y_train = pd.concat([labels[:val_start], labels[val_stop:]])
        x_val = data["input_features"][val_start:val_stop]
        y_val = labels[val_start:val_stop]

    # Fresh, untrained model for every fold.
    model = build_model(config)

    # Keep only the weights of the epoch with the lowest validation loss.
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')
    callbacks = [sv]
    if LEARNING_RATE_SCHEDULER == 'exp':
        callbacks.append(get_exponential_decay_lr_callback(config))
    elif LEARNING_RATE_SCHEDULER == 'cos':
        callbacks.append(get_cosine_annealing_lr_callback(config))

    print(fold+1, "of", FOLDS)
    if WANDB:
        # One W&B run per fold, named <MODEL_NAME>-F<fold>.
        name = MODEL_NAME + '-F{}'.format(fold+1)
        config.update({'fold': fold+1})
        wandb.init(project="pulfib", name=name, config=config)
        callbacks.append(WandbCallback())

    try:
        if DATA_GENERATOR:
            history = model.fit(training_generator, validation_data = validation_generator, epochs = EPOCHS,
                                verbose = 0, callbacks = callbacks)
        else:
            history = model.fit(x_train, y_train, validation_data = (x_val,y_val), epochs = EPOCHS,
                                verbose = 0, callbacks = callbacks)

        if SUBMIT or PSEUDO_TEST_PATIENTS > 0:
            # Predict with the best checkpoint, not with the last-epoch weights.
            model.load_weights('fold-%i.h5'%fold)
            predictions.append(model.predict(test_data, batch_size = 256))
    finally:
        if WANDB:
            # Close the run even if training failed, so the next fold (or a re-run)
            # starts a clean one. wandb.finish() supersedes the deprecated wandb.join().
            wandb.finish()

In [None]:
# Ensemble the per-fold predictions and write the submission file.
if SUBMIT:
    n_rows = len(test_data)
    if PREDICT_SLOPE:
        # Column 0 is used as a per-week slope and column 1 as a per-week
        # uncertainty; both are scaled by the week offset to the target.
        predictions = np.mean(predictions, axis = 0)
        for pos in range(n_rows):
            week_offset = test_data.loc[pos, "Weekdiff_target"]
            submission.loc[pos + 1, "FVC"] = test_data.loc[pos, "FVC"] + predictions[pos, 0] * week_offset
            submission.loc[pos + 1, "Confidence"] = abs(predictions[pos, 1] * week_offset)
    else:
        # FVC: plain mean over folds. Confidence: quadratic mean over folds
        # (square, average, square root).
        predictions = np.abs(predictions)
        predictions[:, :, 1] **= 2
        predictions = np.mean(predictions, axis = 0)
        predictions[:, 1] **= 0.5
        for pos in range(n_rows):
            fvc_value, confidence_value = predictions[pos, 0], predictions[pos, 1]
            submission.loc[pos + 1, "FVC"] = fvc_value
            submission.loc[pos + 1, "Confidence"] = confidence_value
    submission.to_csv("submission.csv", index = False)

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import gmean

# Sweep over ways of combining the per-fold confidence (sigma) predictions on
# the pseudo test set. For exponent p the folds are combined with the power
# mean (mean(sigma**p))**(1/p); p == 0 is its limit, the geometric mean.
# The FVC prediction is always the plain mean over folds.
if PSEUDO_TEST_PATIENTS > 0:
    exponents = range(-20, 20)
    sq2 = np.sqrt(2)

    # Everything that does not depend on the exponent is computed once.
    abs_predictions = np.abs(predictions)   # stacked per-fold outputs; last axis: 0 = FVC, 1 = sigma
    fold_sigmas = abs_predictions[:, :, 1]
    FVC_true = test_check["TargetFVC"].values
    FVC_pred = np.mean(abs_predictions[:, :, 0], axis = 0)
    # Absolute error is capped at 1000 (looks like the competition's Laplace
    # log-likelihood metric with its sign flipped — confirm).
    delta = np.minimum(np.abs(FVC_true - FVC_pred), 1000)

    result = []
    for p in exponents:
        if p == 0:
            sigma = gmean(fold_sigmas, axis = 0)
        else:
            sigma = np.power(np.mean(np.power(fold_sigmas, p), axis = 0), 1/p)

        # Sigma is floored at 70 before scoring.
        sigma_clip = np.maximum(sigma, 70)
        # np.log keeps the computation in NumPy; the original used tf.math.log,
        # which produced a TensorFlow tensor for no benefit.
        loss = (delta / sigma_clip)*sq2 + np.log(sigma_clip * sq2)
        result.append(np.mean(loss))

    fig, ax = plt.subplots()
    ax.plot(list(exponents), result)
    ax.set(xlabel="power-mean exponent p (0 = geometric mean)",
           ylabel="mean loss on pseudo test set",
           title="Effect of sigma aggregation across folds");
