## Setup

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from tqdm import tqdm
import seaborn as sns
import wandb
from wandb.keras import WandbCallback
import keras
from keras.models import Sequential

from pfutils import (get_test_data, get_train_data, get_pseudo_test_data,
                     build_model, get_cosine_annealing_lr_callback, get_fold_indices)

# Run-mode switches that are read further down in the notebook.
WANDB = True            # when True, every fold is logged as its own wandb run
TEST = False            # when True, test.csv is loaded and predicted on
DATA_GENERATOR = True   # when True, model.fit is fed by DataGenerator batches

# Only meaningful when TEST is False: use it to simulate tractable test cases.
# It must be 0 when TEST is True, which the conditional below enforces.
PSEUDO_TEST_PATIENTS = 0
PSEUDO_TEST_PATIENTS = 0 if TEST else PSEUDO_TEST_PATIENTS

In [None]:
wandb_key = "ea9b3c785541508ffdd795f2a706df065df389e3"

!pip install -q --upgrade wandb
!wandb login $wandb_key

## Settings and network

In [None]:
# ---- Cross-validation and batching ----
# Number of folds. A number between 1 and 176-PSEUDO_TEST_PATIENTS
FOLDS = 5
# Batch size
BATCH_SIZE = 128

# ---- Network shape ----
# Amount of features fed into the NN
NUMBER_FEATURES = 9
# Width of each hidden layer, in order
HIDDEN_LAYERS = [64,32,16,8,4]
# Activation function to use ('swish' or 'relu')
ACTIVATION_FUNCTION = 'swish'

# ---- Prediction target ----
# State whether the model should predict the slope or single weeks.
# Predicting the slope assumes that the decrease is linear.
PREDICT_SLOPE = False

# ---- Gaussian noise on FVC ----
USE_GAUSSIAN_ON_FVC = False
VALUE_GAUSSIAN_NOISE_ON_FVC = 70  # only needed when USE_GAUSSIAN_ON_FVC is True

# ---- Regularisation ----
# Dropout rate
DROP_OUT_RATE = 0
DROP_OUT_LAYERS = []  # e.g. [0,1,2] for dropout in the first 3 layers
# L2 regularisation
L2_REGULARIZATION = False
REGULARIZATION_CONSTANT = 0.005

# ---- Training length ----
EPOCHS = 250
STEPS_PER_EPOCH = 100

# ---- Input and/or output normalisation ----
INPUT_NORMALIZATION = True
OUTPUT_NORMALIZATION = True

# ---- Learning rate ----
MAX_LEARNING_RATE = 5e-4
COSINE_CYCLES = 10

MODEL_NAME = "Baseline64"

# Everything above bundled into one mapping, in a fixed key order.
config = {
    "NUMBER_FEATURES": NUMBER_FEATURES,
    "L2_REGULARIZATION": L2_REGULARIZATION,
    "INPUT_NORMALIZATION": INPUT_NORMALIZATION,
    "ACTIVATION_FUNCTION": ACTIVATION_FUNCTION,
    "DROP_OUT_RATE": DROP_OUT_RATE,
    "OUTPUT_NORMALIZATION": OUTPUT_NORMALIZATION,
    "EPOCHS": EPOCHS,
    "STEPS_PER_EPOCH": STEPS_PER_EPOCH,
    "MAX_LEARNING_RATE": MAX_LEARNING_RATE,
    "COSINE_CYCLES": COSINE_CYCLES,
    "MODEL_NAME": MODEL_NAME,
    "USE_GAUSSIAN_ON_FVC": USE_GAUSSIAN_ON_FVC,
    "VALUE_GAUSSIAN_NOISE_ON_FVC": VALUE_GAUSSIAN_NOISE_ON_FVC,
    "PREDICT_SLOPE": PREDICT_SLOPE,
    "HIDDEN_LAYERS": HIDDEN_LAYERS,
    "REGULARIZATION_CONSTANT": REGULARIZATION_CONSTANT,
    "DROP_OUT_LAYERS": DROP_OUT_LAYERS,
    "BATCH_SIZE": BATCH_SIZE,
}

In [None]:
TRAIN_CSV = '../input/osic-pulmonary-fibrosis-progression/train.csv'

# The real test set (and its submission frame) is only read for a TEST run.
if TEST:
    test_data, submission = get_test_data("../input/osic-pulmonary-fibrosis-progression/test.csv")

# Training data is always needed; the pseudo-test count and the input
# normalisation switch are forwarded to the loader.
train, data, labels = get_train_data(TRAIN_CSV, PSEUDO_TEST_PATIENTS, INPUT_NORMALIZATION)

# When pseudo-test patients are requested, a stand-in test set is built from
# the training file.
if PSEUDO_TEST_PATIENTS > 0:
    test_data, test_check = get_pseudo_test_data(TRAIN_CSV, PSEUDO_TEST_PATIENTS)

In [None]:
import math

def roundup(x):
    """Return ``x`` rounded up to the nearest multiple of 10, as an ``int``."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)

In [None]:
FOLDS = 3

node_size = [256,128,64,32,16,8,4]

# Random search: every pass of this loop samples one model configuration.
for _ in range(200):
    # Architecture: 1-9 hidden layers, each width picked at random from node_size
    n_hidden = np.random.randint(1,10)
    HIDDEN_LAYERS = [node_size[np.random.randint(7)] for _layer in range(n_hidden)]

    # Gaussian noise: switched on for roughly half of the configurations
    USE_GAUSSIAN_ON_FVC = np.random.rand() < 0.5
    # Multiplying by the flag zeroes the value whenever the noise is switched off
    VALUE_GAUSSIAN_NOISE_ON_FVC = np.random.randint(200) * USE_GAUSSIAN_ON_FVC

    # Activation function to use ('swish' or 'relu')
    ACTIVATION_FUNCTION = 'swish' if np.random.rand() < 0.5 else 'relu'

    rates = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]

    # Dropout rate: a non-zero candidate is drawn first (indices 1-7), then it
    # is kept with probability 0.3 and zeroed otherwise
    sampled_rate = rates[np.random.randint(1,8)]
    DROP_OUT_RATE = sampled_rate * (np.random.rand() < 0.3)

    # Dropout is applied to the first n_dropout layers (possibly none)
    n_dropout = np.random.randint(len(HIDDEN_LAYERS))
    DROP_OUT_LAYERS = list(range(n_dropout))

    MODEL_NAME = "".join([
        "Random HL: ", str(HIDDEN_LAYERS),
        ", GAUSS: ", str(VALUE_GAUSSIAN_NOISE_ON_FVC),
        ", ACTI: ", ACTIVATION_FUNCTION,
        ", DROPOUTRATE: ", str(DROP_OUT_RATE),
        ", DROP_OUT_LAYERS: ", str(DROP_OUT_LAYERS),
    ])

    # Same keys as the baseline config, plus a few derived summary values
    # (layer count, rounded noise level, rounded first layer width, dropout depth).
    config = {
        "NUMBER_FEATURES": NUMBER_FEATURES,
        "L2_REGULARIZATION": L2_REGULARIZATION,
        "INPUT_NORMALIZATION": INPUT_NORMALIZATION,
        "ACTIVATION_FUNCTION": ACTIVATION_FUNCTION,
        "DROP_OUT_RATE": DROP_OUT_RATE,
        "OUTPUT_NORMALIZATION": OUTPUT_NORMALIZATION,
        "EPOCHS": EPOCHS,
        "STEPS_PER_EPOCH": STEPS_PER_EPOCH,
        "MAX_LEARNING_RATE": MAX_LEARNING_RATE,
        "COSINE_CYCLES": COSINE_CYCLES,
        "MODEL_NAME": MODEL_NAME,
        "USE_GAUSSIAN_ON_FVC": USE_GAUSSIAN_ON_FVC,
        "VALUE_GAUSSIAN_NOISE_ON_FVC": VALUE_GAUSSIAN_NOISE_ON_FVC,
        "PREDICT_SLOPE": PREDICT_SLOPE,
        "HIDDEN_LAYERS": HIDDEN_LAYERS,
        "LENGTH_HIDDEN_LAYERS": len(HIDDEN_LAYERS),
        "REGULARIZATION_CONSTANT": REGULARIZATION_CONSTANT,
        "DROP_OUT_LAYERS": DROP_OUT_LAYERS,
        "BATCH_SIZE": BATCH_SIZE,
        "VALUE_GAUSS_IN_BATCH": roundup(VALUE_GAUSSIAN_NOISE_ON_FVC),
        "FIRST_HIDDEN_LAYERS": roundup(HIDDEN_LAYERS[0]),
        "NUMBER_DROPOUT_LAYERS": len(DROP_OUT_LAYERS),
    }

    lr_cb = get_cosine_annealing_lr_callback(
        lr_max=config["MAX_LEARNING_RATE"],
        n_epochs=config["EPOCHS"],
        n_cycles=config["COSINE_CYCLES"],
    )

    fold_pos = get_fold_indices(FOLDS, train)
    
    class DataGenerator(keras.utils.Sequence):
        """Keras ``Sequence`` yielding fixed-size ``(X, y)`` batches for ``model.fit``.

        Rows are looked up by integer position in ``./train_data.npy`` and
        ``./train_labels.npy``. When ``config["USE_GAUSSIAN_ON_FVC"]`` is set and
        the generator is not a validation generator, one Gaussian noise sample
        per row is added to feature column 1 and to label column 2.

        Args:
            list_IDs: Row positions (into the saved arrays) served by this generator.
            config: Mapping providing ``NUMBER_FEATURES``, ``USE_GAUSSIAN_ON_FVC``,
                ``VALUE_GAUSSIAN_NOISE_ON_FVC`` and ``INPUT_NORMALIZATION``.
                ``BATCH_SIZE`` takes precedence over ``batch_size`` when present.
            validation: If True, no noise is added.
            number_of_labels: Number of label columns per row (must be at least 3
                when noise is enabled, because label column 2 is perturbed).
            batch_size: Rows per batch; only used when ``config`` has no ``BATCH_SIZE``.
            shuffle: If True, the row order is shuffled at construction and again
                in ``on_epoch_end``.
        """

        def __init__(self, list_IDs, config, validation = False, number_of_labels = 3,
                     batch_size = 128, shuffle = True):
            super().__init__()
            self.number_features = int(config["NUMBER_FEATURES"])
            self.use_gaussian = config["USE_GAUSSIAN_ON_FVC"]
            # Fix: the previous `value and not validation` evaluated to the boolean
            # True/False, so the noise std was always 1 (or 0), never the
            # configured value.
            self.gauss_std = 0 if validation else config["VALUE_GAUSSIAN_NOISE_ON_FVC"]
            self.list_IDs = list_IDs
            # Fix: the batch_size argument used to be ignored; it is now the
            # fallback when the config does not carry BATCH_SIZE.
            self.batch_size = config["BATCH_SIZE"] if "BATCH_SIZE" in config else batch_size
            # Reference to the notebook-level `labels`; kept as an attribute for
            # compatibility, it is not read anywhere in this class.
            self.labels = labels
            self.shuffle = shuffle
            self.label_size = number_of_labels
            self.normalized = config["INPUT_NORMALIZATION"]
            # Arrays are loaded lazily on the first batch and then cached.
            self._data = None
            self._lab = None
            self.on_epoch_end()

        def __len__(self):
            'Number of full batches per epoch (a trailing partial batch is dropped)'
            return int(np.floor(len(self.list_IDs)/self.batch_size))

        def __getitem__(self, index):
            'Generate one batch of data'
            # Positions (into list_IDs) that make up batch number `index`
            indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
            list_IDs_temp = [self.list_IDs[k] for k in indexes]
            X, y = self.__data_generation(list_IDs_temp)
            return X, y

        def on_epoch_end(self):
            'Updates indexes after each epoch'
            self.indexes = np.arange(len(self.list_IDs))
            if self.shuffle == True:
                np.random.shuffle(self.indexes)

        def _load_arrays(self):
            'Load the saved feature/label arrays once and cache them on the instance'
            if self._data is None:
                # Fix: both files used to be re-read from disk for every batch.
                # allow_pickle executes pickled code on load; acceptable here
                # because the files are written by this notebook — never point
                # this at untrusted .npy files.
                self._data = np.load("./train_data.npy", allow_pickle = True)
                self._lab = np.load("./train_labels.npy", allow_pickle = True)
            return self._data, self._lab

        def __data_generation(self, list_IDs_temp):
            'Build the (X, y) arrays for the given row positions'
            data, lab = self._load_arrays()
            n_rows = len(list_IDs_temp)

            X = np.empty((n_rows, self.number_features))
            # Fix: y used to be allocated with dtype=int, which truncated any
            # fractional label values before the later cast to float32.
            y = np.empty((n_rows, self.label_size), dtype="float32")

            for i, ID in enumerate(list_IDs_temp):
                X[i,] = np.asarray(data[ID], dtype = "float32")
                y[i,] = np.asarray(lab[ID], dtype = "float32")

            if self.use_gaussian and self.gauss_std > 0:
                gauss = np.random.normal(0, self.gauss_std, size = n_rows).astype("float32")
                if self.normalized:
                    # 5800 is presumably the FVC normalisation constant used when
                    # the inputs were normalised — TODO confirm against pfutils.
                    X[:,1] += gauss/5800
                else:
                    X[:,1] += gauss
                # NOTE(review): the label noise is added unscaled even when the
                # inputs are normalised — confirm label column 2 is in raw units.
                y[:,2] += gauss

            return X, y
        
    if DATA_GENERATOR:
        # Dump features and labels to disk; DataGenerator reads rows back by position.
        train_data = train[["Weeks", "FVC", "Percent", "Age", "Sex", 
                                     "Currently smokes", "Ex-smoker", "Never smoked", "Weekdiff_target"]]
        train_labels = labels
        np.save("train_data.npy", train_data.to_numpy())
        np.save("train_labels.npy", train_labels.to_numpy())
        
    predictions = []

    for fold in range(FOLDS):
        # Rows [fold_pos[fold], fold_pos[fold+1]) are the validation fold; all
        # other rows are used for training.
        if DATA_GENERATOR:
            train_ID = list(range(fold_pos[0],fold_pos[fold])) + list(range(fold_pos[fold+1],len(train)))
            val_ID = list(range(fold_pos[fold], fold_pos[fold+1]))
            # Generators
            training_generator = DataGenerator(train_ID, config)
            validation_generator = DataGenerator(val_ID, config, validation = True)
        else:
            # Fix: DataFrame/Series.append was removed in pandas 2.0; pd.concat
            # is the equivalent. Assumes data["input_features"] and labels are
            # pandas objects — TODO confirm against pfutils.
            x_train = pd.concat([data["input_features"][:fold_pos[fold]], data["input_features"][fold_pos[fold+1]:]])
            y_train = pd.concat([labels[:fold_pos[fold]], labels[fold_pos[fold+1]:]])
            x_val = data["input_features"][fold_pos[fold]:fold_pos[fold+1]]
            y_val = labels[fold_pos[fold]:fold_pos[fold+1]]
        
        # Hundreds of models are built across the search; drop the global Keras
        # state of the previous ones so memory does not grow without bound.
        K.clear_session()
        model = build_model(config)
        
        # Keep only the weights of the epoch with the lowest validation loss.
        sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')
        # NOTE(review): `lr_cb` (the cosine-annealing callback created earlier in
        # this loop) is never added to this list, so that schedule is not applied
        # during fit — confirm whether this is intentional.
        callbacks = [sv]

        print(fold+1, "of", FOLDS)
        if WANDB:
            # One wandb run per (configuration, fold) pair.
            name = MODEL_NAME + '-F{}'.format(fold+1)
            config.update({'fold': fold+1})
            wandb.init(project="pulfib", name=name, config=config)
            wandb_cb = WandbCallback()
            callbacks.append(wandb_cb)
            
        if DATA_GENERATOR:
            history = model.fit(training_generator, validation_data = validation_generator, epochs = EPOCHS,
                                verbose = 0, callbacks = callbacks)
        else:
            history = model.fit(x_train, y_train, validation_data = (x_val,y_val), epochs = EPOCHS,
                                steps_per_epoch = STEPS_PER_EPOCH, verbose = 2, callbacks = callbacks)

        if TEST or PSEUDO_TEST_PATIENTS > 0:
            # Predict with the best checkpoint of this fold, not the last epoch.
            model.load_weights('fold-%i.h5'%fold)
            predictions.append(model.predict(test_data, batch_size = 256))

        if WANDB:
            # finalize run (wandb.finish replaces the deprecated wandb.join)
            wandb.finish()