In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Dataset
class Dataset:
    label_col_name = 'FVC'

    def __init__(self, data_list, patient_list, label_list=None, epoch=1, batch_size=10, root_dir=None):
        self.root_dir = root_dir
        
        # init variables
        self.data_list = data_list
        self.label_list = label_list if label_list is not None else np.ones(len(data_list))
        self.patient_list = patient_list

        # init dataset
        self.dataset = tf.data.Dataset.from_tensor_slices((self.data_list, self.label_list, np.arange(len(self.patient_list))))
        self.dataset = self.dataset.map(lambda data, label, index: tf.py_function(self.read_img, [data, label, index], [tf.float64, tf.float64, tf.int64]))
#         self.dataset = self.dataset.repeat(epoch)
        # self.dataset = self.dataset.shuffle(buffer_size=(int(len(data_list) * 0.4) + 3 * batch_size))
        self.dataset = self.dataset.batch(batch_size, drop_remainder=False)

    def __iter__(self):
        return self.dataset.__iter__()

    def read_img(self, data, label, index: tf.Tensor):
        print(self.root_dir)
        print(data.shape)
        img_path = os.path.join(self.root_dir, f'{self.patient_list[index]}.npy')
        img = np.load(img_path)
        img.resize((1, 38, 334, 334))

        return img, data, label


In [3]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, Conv3D


class PFPModel(Model):
    def __init__(self):
        super().__init__()
        self.build_model()

    def build_model(self):
        self.conv1 = tf.keras.Sequential([
            Conv3D(filters=200, kernel_size=3, padding='same', activation='relu'),
            Conv3D(filters=100, kernel_size=3, padding='same', activation='relu'),
            Conv3D(filters=100, kernel_size=3, padding='same', activation='relu'),
            Conv3D(filters=50, kernel_size=3, padding='same', activation='relu'),
            Flatten(),
        ])

        self.fc = tf.keras.Sequential([
            Dense(500, activation='relu'),
            Dense(100, activation='relu'),
            Dense(1)
        ])

    def fit(self, dataset, epoch_num=100, print_epoch=10):
        # compile
        self.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.MSE, metrics=['mse'])
        
        for epoch in range(epoch_num):
            for step, (img, x, y) in enumerate(dataset.dataset):
                y = tf.cast(y, tf.float32)
                with tf.GradientTape() as tape:
                    output = self.call((img, x))
                    loss = self.loss(output, y)
                    gradients = tape.gradient(loss, self.trainable_variables)
                    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
            
            if epoch % print_epoch == 0:
                print('Epoch:', epoch, 'Loss: ', loss.numpy().mean())

    def call(self, inputs, *args, **kwargs):
        """
        inputs: {imgs: [], info: []}
        """

        imgs = inputs[0]
        info = inputs[1]

        imgs = tf.cast(imgs.gpu(), tf.float32)
        info = tf.cast(info.gpu(), tf.float32)

        conv_out = self.conv1(imgs)

        info = tf.concat((conv_out, info), axis=1)
        out = self.fc(info)
        return out

In [5]:
# Init data path
root_dir = '~/Data/OSIC'
# osic-pulmonary-fibrosis-progression

# root_dir = '../input'
csv_dir = os.path.join(root_dir, 'osic-pulmonary-fibrosis-progression')
train_csv_path = os.path.join(csv_dir, 'train.csv')
test_csv_path = os.path.join(csv_dir, 'test.csv')
submission_csv_path = os.path.join(csv_dir, 'sample_submission.csv')


In [111]:
# Read CSV
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)
submission = pd.read_csv(submission_csv_path)
train_data.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [112]:
# Preprocessing
# max-fvc
train_data['MaxFvc'] = train_data['FVC'] / train_data['Percent'] * 100
train_data.tail()

# min-max (all column)


Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,MaxFvc
1544,ID00426637202313170790466,13,2712,66.594637,73,Male,Never smoked,4072.4
1545,ID00426637202313170790466,19,2978,73.126412,73,Male,Never smoked,4072.4
1546,ID00426637202313170790466,31,2908,71.407524,73,Male,Never smoked,4072.4
1547,ID00426637202313170790466,43,2975,73.052745,73,Male,Never smoked,4072.4
1548,ID00426637202313170790466,59,2774,68.117081,73,Male,Never smoked,4072.4


In [113]:
label_list = train_data['FVC'].to_numpy()
patient_list = train_data['Patient'].to_numpy()
train_data.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,MaxFvc
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,3974.0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,3974.0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,3974.0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,3974.0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,3974.0


In [114]:
# create test data
# test_data_input = pd.DataFrame()
def create_test_input(x):
    patient, weeks = x.split('_')
    weeks = int(weeks)
    
#     percent, age, sex, smokingStatus = train_data[(train_data['Patient'] == patient) & (train_data['Weeks'] == week)][['Perc
    patient_info_list = train_data[train_data['Patient'] == patient]
    patient_info = patient_info_list[patient_info_list['Weeks'] == weeks]
    if len(patient_info) == 0:
        patient_info = patient_info_list.iloc[0].copy()
        patient_info.update({'Weeks': weeks, 'FVC': 'None'})
    else:
        patient_info = patient_info.iloc[0]
        
    return patient_info
    
    
    

test_data_input = submission['Patient_Week'].apply(lambda x: create_test_input(x))
test_data_input.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,MaxFvc
0,ID00419637202311204720264,-12,,70.186855,73,Male,Ex-smoker,4302.8
1,ID00421637202311550012437,-12,,82.045291,68,Male,Ex-smoker,3338.4
2,ID00422637202311677017371,-12,,76.672493,73,Male,Ex-smoker,2517.2
3,ID00423637202312137826377,-12,,79.258903,72,Male,Ex-smoker,4156.0
4,ID00426637202313170790466,-12,,71.824968,73,Male,Never smoked,4072.4


In [115]:
data_list = train_data.drop(['FVC', 'Patient', 'Percent'], axis=1)
data_list = pd.get_dummies(data_list)

In [28]:
# create dataset
img_dir = os.path.join(root_dir, 'osic-processed-image-saved-to-npy')
dataset = Dataset(data_list, patient_list, label_list=label_list, batch_size=20, root_dir=img_dir)

# load model
model = PFPModel()
model.fit(dataset, epoch_num=1, print_epoch=10)

Epoch: 0 Loss:  233239.06


In [30]:
model.save_weights('./pfp_model')

In [116]:
# Set Test Dataset
test_label_list = test_data_input['FVC']
test_patient_list = test_data_input['Patient']
test_data_list = test_data_input.drop(['FVC', 'Patient', 'Percent'], axis=1)
test_data_list = pd.get_dummies(test_data_list)

test_dataset = Dataset(test_data_list, test_patient_list)

730


In [120]:
# Inference
# for y in test_dataset:
#     print(y)
for test_data in zip(test_data_list.to_numpy(), test_patient_list):
    info = test_data[0]
    patient_id = test_data[1]
    
    img_path = os.path.join(self.root_dir, f'{patient_id}.npy')
    img = np.load(img_path)
    img.resize((1, 38, 334, 334))
    img

(array([-1.2000e+01,  7.3000e+01,  4.3028e+03,  1.0000e+00,  1.0000e+00,
        0.0000e+00]), 'ID00419637202311204720264')
(array([-1.2000e+01,  6.8000e+01,  3.3384e+03,  1.0000e+00,  1.0000e+00,
        0.0000e+00]), 'ID00421637202311550012437')
(array([-1.2000e+01,  7.3000e+01,  2.5172e+03,  1.0000e+00,  1.0000e+00,
        0.0000e+00]), 'ID00422637202311677017371')
(array([-1.200e+01,  7.200e+01,  4.156e+03,  1.000e+00,  1.000e+00,
        0.000e+00]), 'ID00423637202312137826377')
(array([-1.2000e+01,  7.3000e+01,  4.0724e+03,  1.0000e+00,  0.0000e+00,
        1.0000e+00]), 'ID00426637202313170790466')
(array([-1.1000e+01,  7.3000e+01,  4.3028e+03,  1.0000e+00,  1.0000e+00,
        0.0000e+00]), 'ID00419637202311204720264')
(array([-1.1000e+01,  6.8000e+01,  3.3384e+03,  1.0000e+00,  1.0000e+00,
        0.0000e+00]), 'ID00421637202311550012437')
(array([-1.1000e+01,  7.3000e+01,  2.5172e+03,  1.0000e+00,  1.0000e+00,
        0.0000e+00]), 'ID00422637202311677017371')
(array([-1.100

(array([9.300e+01, 7.200e+01, 4.156e+03, 1.000e+00, 1.000e+00, 0.000e+00]), 'ID00423637202312137826377')
(array([9.3000e+01, 7.3000e+01, 4.0724e+03, 1.0000e+00, 0.0000e+00,
       1.0000e+00]), 'ID00426637202313170790466')
(array([9.4000e+01, 7.3000e+01, 4.3028e+03, 1.0000e+00, 1.0000e+00,
       0.0000e+00]), 'ID00419637202311204720264')
(array([9.4000e+01, 6.8000e+01, 3.3384e+03, 1.0000e+00, 1.0000e+00,
       0.0000e+00]), 'ID00421637202311550012437')
(array([9.4000e+01, 7.3000e+01, 2.5172e+03, 1.0000e+00, 1.0000e+00,
       0.0000e+00]), 'ID00422637202311677017371')
(array([9.400e+01, 7.200e+01, 4.156e+03, 1.000e+00, 1.000e+00, 0.000e+00]), 'ID00423637202312137826377')
(array([9.4000e+01, 7.3000e+01, 4.0724e+03, 1.0000e+00, 0.0000e+00,
       1.0000e+00]), 'ID00426637202313170790466')
(array([9.5000e+01, 7.3000e+01, 4.3028e+03, 1.0000e+00, 1.0000e+00,
       0.0000e+00]), 'ID00419637202311204720264')
(array([9.5000e+01, 6.8000e+01, 3.3384e+03, 1.0000e+00, 1.0000e+00,
       0.0000e

In [None]:
# Apply real data
submission = pd.DataFrame([])
for patient in test_data['Patient']:
    patient_info_list = train_data[train_data['Patient'] == patient][['Patient', 'Weeks','FVC']]
#     print(patient_info_list)
#     for 
#     result_list.append(patient_result)
    
# result_list