In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ventilator-pressure-prediction/sample_submission.csv
/kaggle/input/ventilator-pressure-prediction/train.csv
/kaggle/input/ventilator-pressure-prediction/test.csv
/kaggle/input/rnn-64-128/ventilator_extraFeatures_biDirectional1_trial.h5


In [2]:
##Analyze the ventillator pressure dataset
import pandas as pd
import numpy as np
##Create a NN model
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, Bidirectional
from tensorflow.keras.callbacks import CSVLogger, TensorBoard, ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.preprocessing import timeseries_dataset_from_array
import gc


trainDataPath = "/kaggle/input/ventilator-pressure-prediction/train.csv"
df_data = pd.read_csv(trainDataPath, header=0)
df_data_orig = df_data.copy()
df_data.head()
#df_data.describe()

# testDataPath = "/kaggle/input/ventilator-pressure-prediction/test.csv"
# df_test = pd.read_csv(testDataPath, header=0)

#df_test.head()

DEBUG= False
if DEBUG:
    df_data = df_data.iloc[:80*100, :]
    #df_test = df_test.iloc[:80*100, :]
print(df_data.shape)
#print(df_test.shape)

(6036000, 8)


In [3]:
##Preprocess the data for the model
def addFeatures(df_data):
    df_data["u_in_prev1"] = df_data.groupby("breath_id")["u_in"].shift(1, fill_value=0)
    df_data["u_in_prev2"] = df_data.groupby("breath_id")["u_in"].shift(2, fill_value=0)
    df_data["u_in_prev3"] = df_data.groupby("breath_id")["u_in"].shift(3, fill_value=0)
    df_data["u_in_prev4"] = df_data.groupby("breath_id")["u_in"].shift(4, fill_value=0)
    df_data["u_in_prev5"] = df_data.groupby("breath_id")["u_in"].shift(5, fill_value=0)
    df_data["u_in_prev6"] = df_data.groupby("breath_id")["u_in"].shift(6, fill_value=0)
    df_data["u_in_next1"] = df_data.groupby("breath_id")["u_in"].shift(-1, fill_value=0)
    df_data["u_in_next2"] = df_data.groupby("breath_id")["u_in"].shift(-2, fill_value=0)
    df_data["u_in_next3"] = df_data.groupby("breath_id")["u_in"].shift(-3, fill_value=0)
    df_data["u_in_next4"] = df_data.groupby("breath_id")["u_in"].shift(-4, fill_value=0)
    df_data["u_in_next5"] = df_data.groupby("breath_id")["u_in"].shift(-5, fill_value=0)
    df_data["u_in_next6"] = df_data.groupby("breath_id")["u_in"].shift(-6, fill_value=0)
    df_data["u_in_cumm"] = df_data.groupby("breath_id")["u_in"].cumsum()
    df_data["area"] = df_data["time_step"]*df_data["u_in"]
    df_data["area"] = df_data.groupby("breath_id")["area"].cumsum()
    df_data["u_in_diff1"] = df_data["u_in"] - df_data["u_in_prev1"]
    df_data["u_in_diff2"] = df_data["u_in"] - df_data["u_in_prev2"]
    df_data["u_in_diff3"] = df_data["u_in"] - df_data["u_in_prev3"]
    df_data["u_in_diff4"] = df_data["u_in"] - df_data["u_in_prev4"]
    df_data["u_in_diff5"] = df_data["u_in"] - df_data["u_in_prev5"]
    df_data["u_in_diff6"] = df_data["u_in"] - df_data["u_in_prev6"]

    df_data["u_out_prev1"] = df_data.groupby("breath_id")["u_out"].shift(1, fill_value=0)
    df_data["u_out_prev2"] = df_data.groupby("breath_id")["u_out"].shift(2, fill_value=0)
    df_data["u_out_prev3"] = df_data.groupby("breath_id")["u_out"].shift(3, fill_value=0)
    df_data["u_out_prev4"] = df_data.groupby("breath_id")["u_out"].shift(4, fill_value=0)
    df_data["u_out_prev5"] = df_data.groupby("breath_id")["u_out"].shift(5, fill_value=0)
    df_data["u_out_prev6"] = df_data.groupby("breath_id")["u_out"].shift(6, fill_value=0)
    df_data["u_out_next1"] = df_data.groupby("breath_id")["u_out"].shift(-1, fill_value=0)
    df_data["u_out_next2"] = df_data.groupby("breath_id")["u_out"].shift(-2, fill_value=0)
    df_data["u_out_next3"] = df_data.groupby("breath_id")["u_out"].shift(-3, fill_value=0)
    df_data["u_out_next4"] = df_data.groupby("breath_id")["u_out"].shift(-4, fill_value=0)
    df_data["u_out_next5"] = df_data.groupby("breath_id")["u_out"].shift(-5, fill_value=0)
    df_data["u_out_next6"] = df_data.groupby("breath_id")["u_out"].shift(-6, fill_value=0)
    df_data["u_out_diff1"] = df_data["u_out"] - df_data["u_out_prev1"]
    df_data["u_out_diff2"] = df_data["u_out"] - df_data["u_out_prev2"]
    df_data["u_out_diff3"] = df_data["u_out"] - df_data["u_out_prev3"]
    df_data["u_out_diff4"] = df_data["u_out"] - df_data["u_out_prev4"]
    df_data["u_out_diff5"] = df_data["u_out"] - df_data["u_out_prev5"]
    df_data["u_out_diff6"] = df_data["u_out"] - df_data["u_out_prev6"]

    df_data["u_in_diff_max"] = df_data.groupby("breath_id")["u_in"].transform("max") - df_data["u_in"]
    df_data["u_in_diff_mean"] = df_data.groupby("breath_id")["u_in"].transform("mean") - df_data["u_in"]

    df_data["u_in_u_out"] = df_data["u_in"] * df_data["u_out"]
    df_data["u_out_timestep"] = df_data["u_out"] * df_data["time_step"]
    
    df_data['R_div_C'] = df_data["R"].div(df_data["C"])
    df_data['R'] = df_data['R'].astype(str)
    df_data['C'] = df_data['C'].astype(str)
    df_data['R__C'] = df_data["R"].astype(str) + '__' + df_data["C"].astype(str)
    df_data = pd.get_dummies(df_data)
    
    df_data['time_step_cumsum'] = df_data.groupby(['breath_id'])['time_step'].cumsum()
    df_data["ewm_u_in_mean"] = df_data.groupby('breath_id')['u_in'].ewm(halflife=9)\
    .mean().reset_index(level=0,drop=True)
    df_data["ewm_u_in_std"] = df_data.groupby('breath_id')['u_in'].ewm(halflife=10)\
    .std().reset_index(level=0,drop=True)
    df_data["ewm_u_in_corr"] = df_data.groupby('breath_id')['u_in'].ewm(halflife=15)\
    .corr().reset_index(level=0,drop=True)
    df_data[["15_in_sum","15_in_min","15_in_max","15_in_mean","15_out_std"]]=df_data.groupby('breath_id')['u_in']\
    .rolling(window=15,min_periods=1)\
    .agg({"15_in_sum":"sum","15_in_min":"min","15_in_max":"max","15_in_mean":"mean","15_in_std":"std"})\
    .reset_index(level=0,drop=True)
    df_data.fillna(0, inplace=True)
    return df_data


In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
def searchNearest(prediction):
    idx = np.searchsorted(df_data_y_sort, prediction)
    #print("prediction= %s, idx= %s" %(prediction,  idx))
    if (idx >= df_data_y_sort.shape[0]-1):
        ## If the number is greater than the largest value in sort array, return the largest element
        return df_data_y_sort[-1]
    
    lowerVal = df_data_y_sort[idx]
    upperVal = df_data_y_sort[idx+1]
    return lowerVal if np.abs(lowerVal - prediction) < np.abs(upperVal - prediction) else upperVal


In [6]:
##Added features for data and test
df_data = addFeatures(df_data)
#df_test = addFeatures(df_test)
print(df_data.shape)
#print(df_test.shape)

##Reduce memory data occupied by changing the datatypes
df_data = reduce_mem_usage(df_data)
#print(df_data.dtypes)
#df_test = reduce_mem_usage(df_test)
#df_test.dtypes

df_y_data = df_data["pressure"].copy()
df_data_y_sort = np.sort(df_y_data.unique())

targets = df_y_data.to_numpy().reshape(-1, 80)
df_data = df_data.drop(["id", "breath_id", "pressure"], axis=1)

#df_test = df_test.drop(["id", "breath_id"], axis=1)
#print("data shape= %s, Test shape= %s" %(df_data.shape, df_test.shape))

##Normalize the data
from sklearn.preprocessing import RobustScaler, normalize
RS = RobustScaler()
train = RS.fit_transform(df_data)
#test = RS.transform(df_test)
print(train[0,:])
#print(test[0,:])

##Change shape for RNN runs
train = train.reshape(-1, 80, train.shape[-1])
#test = test.reshape(-1, 80, train.shape[-1])
train.shape

(6036000, 73)
Mem. usage decreased from 2757.31 Mb to 1093.71 Mb (60.3% reduction)
[-9.8905218e-01 -9.3738443e-01 -1.0000000e+00 -8.6412060e-01
 -8.4252667e-01 -8.1925792e-01 -7.9415160e-01 -7.6915002e-01
 -7.3844558e-01  2.8902071e+00  3.6640253e+00  3.7439151e+00
  4.2778239e+00  4.6832628e+00  4.6862230e+00 -7.2522783e-01
 -5.1658052e-01  4.7140878e-01  2.1799320e-01  1.2102799e-01
  6.8314672e-02  2.3523314e-02  3.1109562e-03 -1.0000000e+00
 -1.0000000e+00 -1.0000000e+00 -1.0000000e+00 -1.0000000e+00
 -1.0000000e+00 -1.0000000e+00 -1.0000000e+00 -1.0000000e+00
 -1.0000000e+00 -1.0000000e+00 -1.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  3.2694092e-01  1.2738636e+00  0.0000000e+00
 -6.6554129e-01 -2.8571430e-01  1.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00
  0.0000000e+00  0.0000000e+00  1.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  

(75450, 80, 70)

In [7]:
##Load model
##Clear models
tf.keras.backend.clear_session()
model = load_model("/kaggle/input/rnn-64-128/ventilator_extraFeatures_biDirectional1_trial.h5")
model.summary()

2021-10-26 04:26:26.904009: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-26 04:26:26.995344: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-26 04:26:26.996086: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-26 04:26:26.997639: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 80, 2048)          8970240   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 80, 1024)          10489856  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 80, 512)           2623488   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 80, 256)           656384    
_________________________________________________________________
dense (Dense)                (None, 80, 128)           32896     
_________________________________________________________________
dense_1 (Dense)              (None, 80, 1)             129       
Total params: 22,772,993
Trainable params: 22,772,993
Non-trainable params: 0
____________________________________________

In [8]:
# print(train.shape[2])
# print(type(train))
# new_train = train.reshape(-1, int(train.shape[2]))
# print(train[0][0])
# print(new_train[0])

# pred = model.predict(train, batch_size=32)
# pred = pred.reshape(-1, int(pred.shape[2]))
# type(pred)

In [9]:
##Error analysis files
##Predict the train pressures
df_y_data = df_y_data.to_frame()
trainPred = model.predict(train, batch_size=32)
df_y_data["pressPredict"] = trainPred.reshape(-1, int(trainPred.shape[2]))
df_y_data["nearestPredict"] = df_y_data["pressPredict"].apply(searchNearest)

##Load original data
trainDataPath = "/kaggle/input/ventilator-pressure-prediction/train.csv"
df_data = pd.read_csv(trainDataPath, header=0)

df_data.drop("pressure", axis=1, inplace=True) ##Drop pressure, since df_y_data has pressure

##SaveName
saveName = "ventilator_extraFeatures_biDirectional1_trial"

##Concat data
df_data = pd.concat([df_data, df_y_data], axis=1, ignore_index=True)
df_data.to_csv("./" + saveName + "_trainErrorAnalysis.csv", index=False)

##Concat even the additional features
df_data = pd.concat([df_data, pd.DataFrame(data=train)], axis=1, ignore_index=True)
df_data.to_csv("./" + saveName + "_trainErrorAnalysis_allFeatures.csv", index=False)

2021-10-26 04:26:38.261796: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1690080000 exceeds 10% of free system memory.
2021-10-26 04:26:39.951246: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1690080000 exceeds 10% of free system memory.
2021-10-26 04:26:41.173082: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-26 04:26:43.872564: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


ValueError: Must pass 2-d input. shape=(75450, 80, 70)

In [None]:
# ##Compare actual pressure and predicted pressures
# ##Visualize some data
# import matplotlib.pyplot as plt
# rowsToViz = 80*5
# vizData = df_data.iloc[:rowsToViz,:]
# pressData = df_y_data.iloc[:rowsToViz, :]
# vizData.head()

# plt.figure(figsize=(30, 6))
# plt.plot(vizData["R_div_C"], label='R_div_C')
# #plt.plot(vizData["C"], label="C")
# plt.plot(vizData["u_in"], label="u_in")
# plt.plot(vizData["u_out"], label="u_out")
# plt.plot(pressData["pressure"], label="pressure")
# plt.plot(pressData["pressPredict"], label="pressPredict")
# plt.legend(loc='upper right')
# plt.show()