## Setup

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from tqdm import tqdm
import seaborn as sns
import wandb
from wandb.keras import WandbCallback
import keras
from keras.models import Sequential

from pfutils import (get_test_data, get_train_data, get_pseudo_test_data,
                     build_model, get_cosine_annealing_lr_callback, get_fold_indices)

# Run-mode switches read by the cells below.
WANDB = False            # when True, each fold is logged as a Weights & Biases run
TEST = False             # when True, the real test CSV is loaded and submission.csv is written
DATA_GENERATOR = False   # when True, training batches come from the DataGenerator class

# Used while TEST is False to simulate tractable test cases; the value is passed to
# the pfutils train/pseudo-test loaders. It has to be 0 whenever TEST is True.
PSEUDO_TEST_PATIENTS = 0 if TEST else 26

In [None]:
# retrieve W&B key
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_key = user_secrets.get_secret("wandb")
# assert wandb_key, "Please create a key.txt or Kaggle Secret with your W&B API key"

wandb_key = "ea9b3c785541508ffdd795f2a706df065df389e3"

!pip install -q --upgrade wandb
!wandb login $wandb_key

## Preparing the Data

In [None]:
# Locations of the competition CSV files.
test_csv_path = "../input/osic-pulmonary-fibrosis-progression/test.csv"
train_csv_path = '../input/osic-pulmonary-fibrosis-progression/train.csv'

# The real test set and the submission frame are only loaded for a submission run.
if TEST:
    test_data, submission = get_test_data(test_csv_path)

# Training frame plus the model inputs/targets derived from it by pfutils.
train, data, labels = get_train_data(train_csv_path, PSEUDO_TEST_PATIENTS)

# With pseudo test patients configured, the test inputs (and the frame used to
# score them later) are built from the training CSV instead.
if PSEUDO_TEST_PATIENTS > 0:
    test_data, test_check = get_pseudo_test_data(train_csv_path, PSEUDO_TEST_PATIENTS)

In [None]:
## Reference: https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

# This is for generating data on the fly, e.g. to add Gaussian noise

## Settings and Network

In [None]:
# ---- Cross-validation ----
# Number of folds: a number between 1 and 176-PSEUDO_TEST_PATIENTS
FOLDS = 3

# ---- Network ----
# Number of features fed into the network
NUMBER_FEATURES = 9
# Activation function to use
ACTIVATION_FUNCTION = 'relu'

# ---- Target ----
# Whether the model predicts a slope or single weeks.
# Predicting the slope assumes that the decrease is linear.
PREDICT_SLOPE = False

# ---- Gaussian noise ----
USE_GAUSSIAN_ON_FVC = False
# Only relevant when USE_GAUSSIAN_ON_FVC is True
VALUE_GAUSSIAN_NOISE_ON_FVC = 70

# ---- Training length ----
EPOCHS = 1000
STEPS_PER_EPOCH = 10

# ---- Learning-rate schedule ----
MAX_LEARNING_RATE = 1e-4
COSINE_CYCLES = 10

# Run name used for logging.
# NOTE(review): the name mentions 10000 epochs while EPOCHS is 1000 - confirm which is intended.
MODEL_NAME = "10000 EPOCHS - 10 Cycles on FVC + datagenerator"

# Everything the model builder, the data generator and the logger read, in one dict.
config = {
    "NUMBER_FEATURES": NUMBER_FEATURES,
    "ACTIVATION_FUNCTION": ACTIVATION_FUNCTION,
    "EPOCHS": EPOCHS,
    "STEPS_PER_EPOCH": STEPS_PER_EPOCH,
    "MAX_LEARNING_RATE": MAX_LEARNING_RATE,
    "COSINE_CYCLES": COSINE_CYCLES,
    "MODEL_NAME": MODEL_NAME,
    "USE_GAUSSIAN_ON_FVC": USE_GAUSSIAN_ON_FVC,
    "VALUE_GAUSSIAN_NOISE_ON_FVC": VALUE_GAUSSIAN_NOISE_ON_FVC,
    "PREDICT_SLOPE": PREDICT_SLOPE,
}

In [None]:
# Build one model from `config` and print its summary.
# The training loop further down builds a fresh model per fold, so this instance
# is presumably only here for inspection - TODO confirm.
model = build_model(config)
# Optional architecture plot (disabled):
#tf.keras.utils.plot_model(model)
model.summary()

## Folds and Training

In [None]:
# Fold boundaries computed by pfutils from FOLDS and `train`; the training loop
# slices fold k as fold_pos[k]:fold_pos[k+1].
fold_pos = get_fold_indices(FOLDS, train)

In [None]:
# Learning-rate callback from pfutils, configured with the peak learning rate,
# the total number of epochs and the number of cosine cycles.
lr_cb = get_cosine_annealing_lr_callback(
    lr_max=config["MAX_LEARNING_RATE"],
    n_epochs=config["EPOCHS"],
    n_cycles=config["COSINE_CYCLES"],
)

In [None]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches from the saved ``.npy`` training arrays.

    Rows are read from ``./train_data.npy`` (features) and ``./train_labels.npy``
    (targets) and selected by the integer IDs in ``list_IDs``. When
    ``config["USE_GAUSSIAN_ON_FVC"]`` is truthy, one Gaussian sample per row is
    added to feature column 1 and to label column 2 (presumably the FVC values -
    confirm against the column order used when the arrays were saved).

    Parameters
    ----------
    list_IDs : sequence of int
        Row positions in the saved arrays that this generator may serve.
    config : dict
        Must contain ``"NUMBER_FEATURES"``, ``"USE_GAUSSIAN_ON_FVC"`` and
        ``"VALUE_GAUSSIAN_NOISE_ON_FVC"``.
    number_of_labels : int
        Number of target columns per row.
    batch_size : int
        Rows per batch; a trailing partial batch is dropped (see ``__len__``).
    shuffle : bool
        Whether the row order is reshuffled at the end of every epoch.
    """

    def __init__(self, list_IDs, config, number_of_labels = 3,
                 batch_size = 128, shuffle = True):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.number_features = int(config["NUMBER_FEATURES"])
        self.use_gaussian = config["USE_GAUSSIAN_ON_FVC"]
        self.gauss_std = config["VALUE_GAUSSIAN_NOISE_ON_FVC"]
        self.list_IDs = list_IDs
        self.batch_size = batch_size
        self.label_size = number_of_labels
        self.shuffle = shuffle
        # Arrays are loaded lazily on the first batch and then reused.
        self._data = None
        self._lab = None
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Generate one batch of data."""
        # Positions (into list_IDs) that belong to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """Reset the index order after each epoch, reshuffling it if requested."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _load_arrays(self):
        """Return the saved feature/label arrays, reading them from disk only once."""
        if getattr(self, "_data", None) is None or getattr(self, "_lab", None) is None:
            # NOTE: allow_pickle=True can execute arbitrary code from a crafted file; it is
            # acceptable here only because these files are written by this notebook itself.
            self._data = np.load("./train_data.npy", allow_pickle = True)
            self._lab = np.load("./train_labels.npy", allow_pickle = True)
        return self._data, self._lab

    def __data_generation(self, list_IDs_temp):
        """Assemble the feature matrix and target matrix for the given row IDs."""
        n_samples = len(list_IDs_temp)
        # Size the buffers by the IDs actually passed in, so no uninitialised
        # rows from np.empty can ever be returned.
        X = np.empty((n_samples, self.number_features))
        # Targets are regression values: keep them as floats. An integer buffer
        # would silently truncate any fractional part.
        y = np.empty((n_samples, self.label_size), dtype = "float32")

        data, lab = self._load_arrays()

        for i, ID in enumerate(list_IDs_temp):
            X[i,] = np.asarray(data[ID], dtype = "float32")
            y[i,] = np.asarray(lab[ID], dtype = "float32")

        if self.use_gaussian:
            # The same noise sample is applied to the input and to its target.
            gauss = np.random.normal(0, self.gauss_std, size = n_samples).astype("float32")
            X[:,1] += gauss
            y[:,2] += gauss

        return X, y

In [None]:
if DATA_GENERATOR:
    # Feature columns written to disk for DataGenerator. Their order matters:
    # the generator adds its Gaussian noise to column index 1 ("FVC" here).
    feature_columns = ["Weeks", "FVC", "Percent", "Age", "Sex",
                       "Currently smokes", "Ex-smoker", "Never smoked", "Weekdiff_target"]
    train_data = train[feature_columns]
    train_labels = labels

    # Persist both frames as .npy files, which DataGenerator reads back from disk.
    train_data_array = train_data.to_numpy()
    train_labels_array = train_labels.to_numpy()
    np.save("train_data.npy", train_data_array)
    np.save("train_labels.npy", train_labels_array)

In [None]:
# One array of test predictions per fold (filled only when there is a test set).
predictions = []

for fold in range(FOLDS):
    # Rows [val_start, val_stop) are this fold's validation split; the rest is training data.
    val_start, val_stop = fold_pos[fold], fold_pos[fold+1]

    if DATA_GENERATOR:
        train_ID = list(range(fold_pos[0], val_start)) + list(range(val_stop, len(train)))
        val_ID = list(range(val_start, val_stop))
        # Generators
        training_generator = DataGenerator(train_ID, config)
        validation_generator = DataGenerator(val_ID, config)
    else:
        # pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
        # Assumes data["input_features"] and labels are pandas objects - the original code
        # relied on their .append method.
        features = data["input_features"]
        x_train = pd.concat([features[:val_start], features[val_stop:]])
        y_train = pd.concat([labels[:val_start], labels[val_stop:]])
        x_val = features[val_start:val_stop]
        y_val = labels[val_start:val_stop]

    # Fresh model for every fold
    model = build_model(config)

    # Keep only the weights of the epoch with the lowest validation loss
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')
    callbacks = [lr_cb,sv]

    print(fold+1, "of", FOLDS)
    if WANDB:
        # One W&B run per fold
        name = MODEL_NAME + '-F{}'.format(fold+1)
        config.update({'fold': fold+1})
        wandb.init(project="osic-fibrosis", name=name, config=config)
        wandb_cb = WandbCallback()
        callbacks.append(wandb_cb)

    if DATA_GENERATOR:
        history = model.fit(training_generator, validation_data = validation_generator, epochs = EPOCHS,
                            steps_per_epoch = STEPS_PER_EPOCH, verbose = 0, callbacks = callbacks)
    else:
        history = model.fit(x_train, y_train, validation_data = (x_val,y_val), epochs = EPOCHS,
                            steps_per_epoch = STEPS_PER_EPOCH, verbose = 0, callbacks = callbacks)

    if TEST or PSEUDO_TEST_PATIENTS > 0:
        # Predict with the best checkpoint of this fold, not with the last epoch's weights
        model.load_weights('fold-%i.h5'%fold)
        predictions.append(model.predict(test_data, batch_size = 256))

    if WANDB:
        # finalize run (wandb.finish supersedes the deprecated wandb.join)
        wandb.finish()

In [None]:
if TEST:
    # Work on a fresh stacked copy, indexed (fold, row, output). `predictions` itself
    # is left untouched so that this cell can be re-run without re-training.
    fold_preds = np.asarray(predictions)

    if PREDICT_SLOPE:
        # Output 0 is used as the slope and output 1 as the confidence slope; both
        # are scaled by the week difference to the target week.
        mean_pred = np.mean(fold_preds, axis = 0)
        for i in range(1,len(test_data)+1):
            # NOTE(review): submission rows are addressed from label 1 while test_data
            # rows start at label 0 - confirm this matches the frames built by pfutils.
            submission.loc[i,"FVC"] = test_data.loc[i-1,"FVC"] + mean_pred[i-1,0]*test_data.loc[i-1,"Weekdiff_target"]
            submission.loc[i, "Confidence"] = abs(mean_pred[i-1,1]*test_data.loc[i-1,"Weekdiff_target"])
    else:
        # Output 0 (FVC) is averaged over the folds; output 1 (confidence) is combined
        # as a root mean square: square, average, square root.
        fold_preds = np.abs(fold_preds)
        fold_preds[:,:,1] = np.power(fold_preds[:,:,1],2)
        mean_pred = np.mean(fold_preds, axis = 0)
        mean_pred[:,1] = np.power(mean_pred[:,1],0.5)
        for i in range(1,len(test_data)+1):
            submission.loc[i,"FVC"] = mean_pred[i-1,0]
            submission.loc[i, "Confidence"] = mean_pred[i-1,1]
    submission.to_csv("submission.csv", index = False)

In [None]:
if PSEUDO_TEST_PATIENTS > 0:
    # Ensemble the folds on fresh arrays, indexed (fold, row, output). `predictions`
    # itself is left untouched so that this cell can be re-run without re-training.
    fold_preds = np.abs(predictions)
    # Output 1 (sigma) is combined as a root mean square: square, average, square root.
    fold_preds[:,:,1] = np.power(fold_preds[:,:,1],2)
    mean_pred = np.mean(fold_preds, axis = 0)
    mean_pred[:,1] = np.power(mean_pred[:,1],0.5)

    FVC_true = test_check["TargetFVC"].values
    FVC_pred = mean_pred[:,0]
    sigma = mean_pred[:,1]

    # Sigma is floored at 70 and the absolute error is capped at 1000.
    sigma_clip = np.maximum(np.abs(sigma), 70)
    delta = np.abs(FVC_true - FVC_pred)
    delta = np.minimum(delta, 1000)

    sq2 = np.sqrt(2)
    # Per-row loss: sqrt(2)*delta/sigma + ln(sqrt(2)*sigma). np.log keeps the whole
    # computation in NumPy instead of mixing a TensorFlow tensor into array arithmetic.
    loss = (delta / sigma_clip)*sq2 + np.log(sigma_clip * sq2)

    print(np.mean(loss))
    