## Setup

In [101]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from tqdm import tqdm
import seaborn as sns
import wandb
from wandb.keras import WandbCallback
import keras
from keras.models import Sequential

from pfutils import (get_test_data, get_train_data, get_pseudo_test_data,
                     build_model, get_cosine_annealing_lr_callback, get_fold_indices)

# Run-mode switches for the whole notebook.
WANDB = True  # log every fold's training run to Weights & Biases
SUBMIT = True  # predict on the competition test set and write submission.csv
DATA_GENERATOR = True  # train through the DataGenerator Sequence instead of in-memory frames
TRAIN_ON_BACKWARD_WEEKS = False  # forwarded to get_train_data; its meaning is defined in pfutils

# When SUBMIT is False, set this above 0 to evaluate on a pseudo test set built from
# train.csv (presumably that many held-out patients - confirm in pfutils).
# Must be 0 when SUBMIT is True.
PSEUDO_TEST_PATIENTS = 0

In [102]:
# A real submission never uses pseudo test patients, so force the count back to 0
# regardless of what was configured above.
if SUBMIT:
    PSEUDO_TEST_PATIENTS = 0

In [103]:
# Retrieve the W&B API key from Kaggle Secrets; the key itself must never be
# written into the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("wandb_key")
assert wandb_key, "Please create a key.txt or Kaggle Secret with your W&B API key"

# SECURITY: a literal W&B API key was previously left here in a comment. It has been
# removed; treat that key as compromised and revoke/rotate it in the W&B settings.

!pip install -q --upgrade wandb
!wandb login $wandb_key

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


## Settings and Network

In [105]:
# ---------------------------------------------------------------------------
# Cross-validation
# ---------------------------------------------------------------------------
# How many folds to train; valid values are 1 .. 176 - PSEUDO_TEST_PATIENTS.
FOLDS = 10

# ---------------------------------------------------------------------------
# Network input and architecture
# ---------------------------------------------------------------------------
# Samples per training batch.
BATCH_SIZE = 128

# Width of the feature vector fed to the network.
NUMBER_FEATURES = 9

# Units of each hidden layer, in order.
HIDDEN_LAYERS = [64,64]

# True: the model predicts a slope (this assumes the FVC decline is linear).
# False: the model predicts the value for individual weeks.
PREDICT_SLOPE = False

# Activation used by the network: 'swish' or 'relu'.
ACTIVATION_FUNCTION = 'swish'

# ---------------------------------------------------------------------------
# Augmentation and regularisation
# ---------------------------------------------------------------------------
# Std of the Gaussian noise added to FVC (the reported std error of the
# measurement devices is 70).
VALUE_GAUSSIAN_NOISE_ON_FVC = 70
# True: the noise added to X and to y is the same draw (perfectly correlated).
# False: X and y receive independent draws.
GAUSSIAN_NOISE_CORRELATED = False

# Dropout rate and the layers it applies to, e.g. [0,1,2] for dropout in the
# first 3 layers.
DROP_OUT_RATE = 0
DROP_OUT_LAYERS = []

# L2 weight regularisation switch and its constant.
L2_REGULARIZATION = False
REGULARIZATION_CONSTANT = 0.005

# Normalise the inputs and/or the outputs.
INPUT_NORMALIZATION = True
OUTPUT_NORMALIZATION = True

# ---------------------------------------------------------------------------
# Optimisation schedule
# ---------------------------------------------------------------------------
# Number of training epochs.
EPOCHS = 100

# Peak learning rate and number of cosine cycles.
MAX_LEARNING_RATE = 5e-4
COSINE_CYCLES = 10

# Run name prefix (used for the W&B run names).
MODEL_NAME = "Baseline"

# Everything above bundled into one dict; it is handed to build_model, to the
# DataGenerator and to wandb.init. Key order is kept as in the original cell.
config = {
    "NUMBER_FEATURES": NUMBER_FEATURES,
    "L2_REGULARIZATION": L2_REGULARIZATION,
    "INPUT_NORMALIZATION": INPUT_NORMALIZATION,
    "ACTIVATION_FUNCTION": ACTIVATION_FUNCTION,
    "DROP_OUT_RATE": DROP_OUT_RATE,
    "OUTPUT_NORMALIZATION": OUTPUT_NORMALIZATION,
    "EPOCHS": EPOCHS,
    "MAX_LEARNING_RATE": MAX_LEARNING_RATE,
    "COSINE_CYCLES": COSINE_CYCLES,
    "MODEL_NAME": MODEL_NAME,
    "VALUE_GAUSSIAN_NOISE_ON_FVC": VALUE_GAUSSIAN_NOISE_ON_FVC,
    "PREDICT_SLOPE": PREDICT_SLOPE,
    "HIDDEN_LAYERS": HIDDEN_LAYERS,
    "REGULARIZATION_CONSTANT": REGULARIZATION_CONSTANT,
    "DROP_OUT_LAYERS": DROP_OUT_LAYERS,
    "BATCH_SIZE": BATCH_SIZE,
    "GAUSSIAN_NOISE_CORRELATED": GAUSSIAN_NOISE_CORRELATED,
}

In [107]:
# Load the frames for this run through the pfutils helpers.
if SUBMIT:
    # Competition test features plus the submission frame that is filled in after training.
    test_data, submission = get_test_data("../input/osic-pulmonary-fibrosis-progression/test.csv", INPUT_NORMALIZATION)
    
# `train` is the frame that folds are cut from; `data` / `labels` are the in-memory
# model inputs and targets used when DATA_GENERATOR is False.
train, data, labels = get_train_data('../input/osic-pulmonary-fibrosis-progression/train.csv', PSEUDO_TEST_PATIENTS, INPUT_NORMALIZATION, TRAIN_ON_BACKWARD_WEEKS)

if PSEUDO_TEST_PATIENTS > 0:
    # Pseudo test set derived from train.csv; `test_check` supplies the "TargetFVC"
    # ground truth used by the evaluation cell at the end of the notebook.
    test_data, test_check = get_pseudo_test_data('../input/osic-pulmonary-fibrosis-progression/train.csv', PSEUDO_TEST_PATIENTS, INPUT_NORMALIZATION)

In [108]:
# Build one model from the settings dict and print its layer summary as a sanity check.
# The training loop builds a fresh model for every fold, so this instance is only inspected here.
model = build_model(config)
#tf.keras.utils.plot_model(model)
model.summary()

Model: "functional_37"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_features (InputLayer)     [(None, 9)]          0                                            
__________________________________________________________________________________________________
dense_36 (Dense)                (None, 64)           640         input_features[0][0]             
__________________________________________________________________________________________________
dense_37 (Dense)                (None, 64)           4160        dense_36[0][0]                   
__________________________________________________________________________________________________
FVC_output (Dense)              (None, 1)            65          dense_37[0][0]                   
______________________________________________________________________________________

## Folds and Training

In [109]:
# Boundary row positions of the folds within `train`: the training loop uses rows
# fold_pos[k] .. fold_pos[k+1]-1 as the validation rows of fold k.
fold_pos = get_fold_indices(FOLDS, train)
print(fold_pos)

[0, 2009, 4067, 6072]


In [110]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` serving noisy training batches from the saved arrays.

    Rows are read from ``./train_data.npy`` (features) and ``./train_labels.npy``
    (labels) in the working directory, so both files must exist before the first
    batch is requested. For training generators Gaussian noise is added to
    feature columns 1 and 2 and to label columns 0 and 2; validation generators
    return the rows unchanged.

    Args:
        list_IDs: Row positions (into the saved arrays) this generator may serve.
        config: Settings dict. Reads ``NUMBER_FEATURES``,
            ``VALUE_GAUSSIAN_NOISE_ON_FVC``, ``INPUT_NORMALIZATION``,
            ``GAUSSIAN_NOISE_CORRELATED`` and, if present, ``BATCH_SIZE``.
        validation: When True no noise is added.
        number_of_labels: Number of label columns per row (at least 3 is needed
            when noise is applied, because label column 2 is modified).
        batch_size: Fallback batch size, used only when ``config`` has no
            ``BATCH_SIZE`` entry (the config value takes precedence).
        shuffle: Reshuffle the row order at the end of every epoch.
    """

    def __init__(self, list_IDs, config, validation = False, number_of_labels = 3,
                 batch_size = 128, shuffle = True):
        # Harmless on older Keras, required by newer Keras dataset base classes.
        super().__init__()
        self.number_features = int(config["NUMBER_FEATURES"])
        self.validation = validation
        self.gauss_std = config["VALUE_GAUSSIAN_NOISE_ON_FVC"]
        self.list_IDs = list_IDs
        # The config value wins (as before); the argument is now an actual fallback
        # instead of being silently ignored.
        self.batch_size = config.get("BATCH_SIZE", batch_size)
        # NOTE(review): reads the notebook-global `labels`; the attribute is not used
        # anywhere in this class and is kept only for backward compatibility.
        self.labels = labels
        self.shuffle = shuffle
        self.label_size = number_of_labels
        self.normalized = config["INPUT_NORMALIZATION"]
        self.correlated = config["GAUSSIAN_NOISE_CORRELATED"]
        # Arrays are loaded lazily on the first batch and then reused.
        self._data = None
        self._lab = None
        # Build the initial index order only once every attribute exists.
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def _load_arrays(self):
        """Return the (features, labels) arrays, reading the .npy files only once."""
        if self._data is None or self._lab is None:
            # allow_pickle is kept from the original; only load files written by
            # this notebook, since unpickling untrusted files can execute code.
            self._data = np.load("./train_data.npy", allow_pickle = True)
            self._lab = np.load("./train_labels.npy", allow_pickle = True)
        return self._data, self._lab

    def __data_generation(self, list_IDs_temp):
        """Assemble one batch ``(X, y)`` for the given row positions.

        Returns:
            X: float array of shape ``(len(list_IDs_temp), number_features)``.
            y: float32 array of shape ``(len(list_IDs_temp), label_size)``.
        """
        data, lab = self._load_arrays()
        n_rows = len(list_IDs_temp)

        X = np.empty((n_rows, self.number_features))
        # Allocate the labels as float32 from the start: an int buffer would
        # truncate fractional (e.g. normalised) label values on assignment.
        y = np.empty((n_rows, self.label_size), dtype = "float32")

        for i, ID in enumerate(list_IDs_temp):
            X[i,] = np.asarray(data[ID], dtype = "float32")
            y[i,] = np.asarray(lab[ID], dtype = "float32")

        if not self.validation:
            gauss_X = np.random.normal(0, self.gauss_std, size = n_rows)

            if self.correlated:
                gauss_y = gauss_X
            else:
                gauss_y = np.random.normal(0, self.gauss_std, size = n_rows)
            if self.normalized:
                # Rebinds gauss_X only; a correlated gauss_y keeps the unscaled draw.
                gauss_X = gauss_X/5000

            gauss_X = gauss_X.astype("float32")
            gauss_y = gauss_y.astype("float32")

            # Column 2 is scaled by the relative change of column 1 and must be
            # updated first, while column 1 still holds its noise-free value.
            # (Columns 1 and 2 are "FVC" and "Percent" in the column order used
            # when the arrays are saved in this notebook.)
            X[:,2] += gauss_X*X[:,2]/X[:,1]
            X[:,1] += gauss_X
            # NOTE(review): label column 2 gets the (possibly /5000-scaled) input
            # noise while label column 0 gets the unscaled gauss_y - confirm this
            # matches how the labels are normalised.
            y[:,2] += gauss_X
            y[:,0] += gauss_y

        return X, y

In [111]:
# Persist the feature matrix and labels as .npy files in the working directory;
# DataGenerator reads "./train_data.npy" and "./train_labels.npy" from there.
if DATA_GENERATOR:
    # Column order matters: DataGenerator addresses columns 1 and 2 ("FVC", "Percent")
    # by position when it adds noise.
    train_data = train[["Weeks", "FVC", "Percent", "Age", "Sex", 
                                 "Currently smokes", "Ex-smoker", "Never smoked", "Weekdiff_target"]]
    train_labels = labels
    np.save("train_data.npy", train_data.to_numpy())
    np.save("train_labels.npy", train_labels.to_numpy())

In [112]:
# Train one model per cross-validation fold; after each fold the best checkpoint
# (lowest val_loss) is reloaded and its test predictions are collected.
predictions = []

for fold in range(FOLDS):
    if DATA_GENERATOR:
        # Row positions: the slice [fold_pos[fold], fold_pos[fold+1]) validates,
        # every other row of `train` is used for training.
        train_ID = list(range(fold_pos[0],fold_pos[fold])) + list(range(fold_pos[fold+1],len(train)))
        val_ID = list(range(fold_pos[fold], fold_pos[fold+1]))
        # Generators
        training_generator = DataGenerator(train_ID, config)
        validation_generator = DataGenerator(val_ID, config, validation = True)
    else:
        # pd.concat replaces DataFrame/Series.append, which was removed in pandas 2.0.
        x_train = pd.concat([data["input_features"][:fold_pos[fold]],
                             data["input_features"][fold_pos[fold+1]:]])
        y_train = pd.concat([labels[:fold_pos[fold]], labels[fold_pos[fold+1]:]])
        x_val = data["input_features"][fold_pos[fold]:fold_pos[fold+1]]
        y_val = labels[fold_pos[fold]:fold_pos[fold+1]]

    # Fresh, untrained model for every fold.
    model = build_model(config)

    # Keep only the weights of the epoch with the lowest validation loss.
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')
    callbacks = [sv]

    print(fold+1, "of", FOLDS)
    if WANDB:
        # One W&B run per fold; the fold number is recorded in the shared config dict.
        name = MODEL_NAME + '-F{}'.format(fold+1)
        config.update({'fold': fold+1})
        wandb.init(project="pulfib", name=name, config=config)
        wandb_cb = WandbCallback()
        callbacks.append(wandb_cb)

    if DATA_GENERATOR:
        history = model.fit(training_generator, validation_data = validation_generator, epochs = EPOCHS,
                            verbose = 0, callbacks = callbacks)
    else:
        # Pass BATCH_SIZE explicitly: without it Keras falls back to its default of 32,
        # which disagrees with the generator path and with the config logged to W&B.
        history = model.fit(x_train, y_train, validation_data = (x_val,y_val), epochs = EPOCHS,
                            batch_size = BATCH_SIZE, verbose = 0, callbacks = callbacks)

    if SUBMIT or PSEUDO_TEST_PATIENTS > 0:
        # Predict with the best checkpoint of this fold, not the last epoch.
        model.load_weights('fold-%i.h5'%fold)
        predictions.append(model.predict(test_data, batch_size = 256))

    if WANDB:
        # finalize run
        wandb.join()

1 of 3


[34m[1mwandb[0m: Wandb version 0.9.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


2 of 3


[34m[1mwandb[0m: Wandb version 0.9.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


3 of 3


[34m[1mwandb[0m: Wandb version 0.9.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [113]:
# Combine the per-fold predictions and write them into the submission frame.
# NOTE(review): this cell rebinds `predictions` from the per-fold list to the combined
# array, so it cannot simply be re-run without re-running the training cell first.
if SUBMIT:
    if PREDICT_SLOPE:
        # Plain mean over folds; column 0 is used as an FVC slope and column 1 as a
        # confidence slope, both multiplied by the row's "Weekdiff_target".
        predictions = np.mean(predictions,axis = 0)
        # submission is addressed by labels 1..len(test_data); test_data and
        # predictions are addressed 0-based, hence the i-1.
        for i in range(1,len(test_data)+1):
            submission.loc[i,"FVC"] = test_data.loc[i-1,"FVC"] + predictions[i-1,0]*test_data.loc[i-1,"Weekdiff_target"]
            submission.loc[i, "Confidence"] = abs(predictions[i-1,1]*test_data.loc[i-1,"Weekdiff_target"])
    else:
        # Absolute values of all fold outputs, stacked as (fold, row, column).
        predictions = np.abs(predictions)
        # Column 1 (confidence) is combined across folds by root-mean-square:
        # square, mean over the fold axis, square root. Column 0 (FVC) is a plain mean.
        predictions[:,:,1] = np.power(predictions[:,:,1],2)
        predictions = np.mean(predictions, axis = 0)
        predictions[:,1] = np.power(predictions[:,1],0.5)
        # Same 1-based submission labels / 0-based prediction rows as above.
        for i in range(1,len(test_data)+1):
            submission.loc[i,"FVC"] = predictions[i-1,0]
            submission.loc[i, "Confidence"] = predictions[i-1,1]
    submission.to_csv("submission.csv", index = False)

In [114]:
# Keep a reference to the current predictions; the pseudo-test sweep restarts from
# `backup` on every iteration because it rebinds `predictions` itself.
backup = predictions

In [115]:
import matplotlib.pyplot as plt
from scipy.stats import gmean
# Pseudo-test evaluation: sweep the exponent of the power mean used to combine the
# per-fold confidence values and plot the resulting mean loss for each exponent.
if PSEUDO_TEST_PATIENTS > 0:
    result = []
    sq2 = np.sqrt(2)
    for i in range(-20,20):
        # Start every exponent from the untouched per-fold predictions.
        predictions = np.abs(backup)
        if i == 0:
            # Exponent 0 is the limit case of the power mean: the geometric mean.
            predictions[:,:,1] = gmean(predictions[:,:,1], axis = 0)
            predictions = np.mean(predictions, axis = 0)
        else:
            # Power mean over folds: raise to i, average, take the i-th root.
            predictions[:,:,1] = np.power(predictions[:,:,1],i)
            predictions = np.mean(predictions, axis = 0)
            predictions[:,1] = np.power(predictions[:,1],1/i)
        FVC_true = test_check["TargetFVC"].values
        FVC_pred = predictions[:,0]
        sigma = predictions[:,1]

        # Confidence is floored at 70 and the absolute error is capped at 1000.
        sigma_clip = np.maximum(np.abs(sigma), 70)
        delta = np.abs(FVC_true - FVC_pred)
        delta = np.minimum(delta, 1000)

        # Stay in NumPy for the logarithm: mixing in tf.math.log turned the loss into
        # an EagerTensor and made the sum depend on TensorFlow's dtype rules.
        loss = (delta / sigma_clip)*sq2 + np.log(sigma_clip * sq2)
        result.append(np.mean(loss))
    plt.plot(np.arange(-20,20),result)
    plt.xlabel("power-mean exponent for the fold confidences")
    plt.ylabel("mean loss on pseudo test set")
    plt.title("Confidence ensembling: loss vs. power-mean exponent")
