In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets 
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Bidirectional,TimeDistributed 
from keras.layers import Dropout
import matplotlib.pyplot as plt
from keras.optimizers import SGD

## functions used

In [2]:
def show_history(history):
    print(history.history.keys())
    fig = plt.figure(figsize=(20,5))
    plt.subplot(121)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.subplot(122)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='lower left')
    plt.show()

### calculating sofa, qsofa and sirs

In [3]:
def get_scores(d1):
    SF = list()
    qSF = list()
    Sirs = list()
    for row in range(1,d1.shape[0]+1):
        SOFA_score = 0

        count = 0
        #calculate sofa
        if d1['PaCO2'][row-1]!=np.NaN and d1['FiO2'][row-1]!=np.NaN :
            ratio = 713-((5/4)*(d1['PaCO2'][row-1]/d1['FiO2'][row-1]))
        else:
            ratio = np.NaN
        Platelets = d1['Platelets'][row-1]
        Bilirubin = d1['Bilirubin_total'][row-1]
        Creatinine = d1['Creatinine'][row-1]

        MAP = d1['MAP'][row-1]

        if MAP<70:
            SOFA_score+=1

        if ratio<100:
            SOFA_score += 1
        if ratio<200:
            SOFA_score += 1
        if ratio<300:
            SOFA_score += 1
        if ratio<400:
            SOFA_score += 1

        if Bilirubin>1.2:
            SOFA_score+=1
        if Bilirubin>2:
            SOFA_score+=1
        if Bilirubin>6:
            SOFA_score+=1
        if Bilirubin>12:
            SOFA_score+=1


        if Platelets<20:
            SOFA_score+=1
        if Platelets<50:
            SOFA_score+=1
        if Platelets<100:
            SOFA_score+=1
        if Platelets<150:
            SOFA_score+=1

        if Creatinine>1.2:
            SOFA_score+=1
        if Creatinine>2:
            SOFA_score+=1
        if Creatinine>3.5:
            SOFA_score+=1
        if Creatinine>5:
            SOFA_score+=1

        # calculate qSOFA
        if d1['Resp'][row-1]>22 or d1['SBP'][row-1]<100:
            qSOFA = 1
        else:  qSOFA = 0

        # calculate SIRS
        if d1['Temp'][row-1]>38 or d1['Temp'][row-1] <36:
            count+=1
        if d1['Resp'][row-1]>20:
            count+=1
        if d1['HR'][row-1]>90:
            count+=1
        if d1['WBC'][row-1]>12 or d1['WBC'][row-1] <4:
            count+=1

        if count>1:
            sirs = 1
        else: sirs = 0

        SF.append(SOFA_score)
        qSF.append(qSOFA)
        Sirs.append(sirs)
    return (SF,qSF,Sirs)

In [4]:
def fill_missing_values(d1):  # to be changed by MICE (imputation method)
    for col in d1.columns:
        d1[col].fillna( method ='ffill', inplace = True)
        
    for col in d1.columns:
        d1[col].fillna(d1[col].mean(), inplace = True)
    
    return d1

def to_str(i):
    st = str(i)
    return (5-len(st))*'0' + st

def process_patient_data(df):
    df = fill_missing_values(df)
    SF, qSF, Sirs = get_scores(df)
    df.insert(41, 'Sirs', Sirs)
    df.insert(41, 'qSOFA',qSF)
    df.insert(41, 'SOFA_score',SF)
    
    return df


Unnamed: 0.1,Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SOFA_score,qSOFA,Sirs,SepsisLabel
0,1,102.150943,91.330189,36.702083,128.0,88.496981,,24.679245,,20.784314,...,83.14,0,,,-0.03,1,2,1,1,0
1,2,97.0,95.0,36.702083,98.0,75.33,,19.0,,20.784314,...,83.14,0,,,-0.03,2,2,1,0,0
2,3,89.0,99.0,36.702083,122.0,86.0,,22.0,,20.784314,...,83.14,0,,,-0.03,3,2,0,0,0
3,4,90.0,95.0,36.702083,122.0,86.0,,30.0,,24.0,...,83.14,0,,,-0.03,4,2,1,0,0
4,5,103.0,88.5,36.702083,122.0,91.33,,24.5,,24.0,...,83.14,0,,,-0.03,5,2,1,1,0
5,6,110.0,91.0,36.702083,122.0,91.33,,22.0,,24.0,...,83.14,0,,,-0.03,6,2,0,1,0
6,7,108.0,92.0,36.11,123.0,77.0,,29.0,,24.0,...,83.14,0,,,-0.03,7,2,1,1,0
7,8,106.0,90.5,36.11,93.0,76.33,,29.0,,24.0,...,83.14,0,,,-0.03,8,2,1,1,0
8,9,104.0,95.0,36.11,133.0,88.33,,26.0,,24.0,...,83.14,0,,,-0.03,9,2,1,1,0
9,10,102.0,91.0,36.11,134.0,87.33,,30.0,,24.0,...,83.14,0,,,-0.03,10,2,1,1,0


# Getting data on board

In [None]:

d1 = pd.read_csv('/kaggle/input/sepsis data/p000001.csv')
Data = process_patient_data(d1)
Data


In [5]:

for i in range(2,11001):
    filename = "/kaggle/input/sepsis data/p0"+to_str(i)+".csv"
    df = pd.read_csv(filename)
    Data = pd.concat([Data, df])

Data.to_csv("/kaggle/Prep_data.csv")


In [6]:
dataset = Data.copy()

In [7]:
# imputer = KNNImputer(n_neighbors=3)
# dataset = pd.DataFrame(imputer.fit_transform(dataset))

for col in dataset.columns[:36]:
        dataset[col].fillna(dataset[col].mean(), inplace = True)
for col in dataset.columns:
        dataset[col].fillna(0, inplace = True)

In [8]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler(feature_range=(0, 10))
# dataset = scaler.fit_transform(dataset)
dataset.drop('Unnamed: 0', axis=1, inplace = True)
dataset

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SOFA_score,qSOFA,Sirs,SepsisLabel
0,102.150943,91.330189,36.702083,128.0,88.496981,59.935607,24.679245,0.0,20.784314,46.184211,...,83.14,0,0.0,0.0,-0.03,1,2.0,1.0,1.0,0
1,97.000000,95.000000,36.702083,98.0,75.330000,59.935607,19.000000,0.0,20.784314,46.184211,...,83.14,0,0.0,0.0,-0.03,2,2.0,1.0,0.0,0
2,89.000000,99.000000,36.702083,122.0,86.000000,59.935607,22.000000,0.0,20.784314,46.184211,...,83.14,0,0.0,0.0,-0.03,3,2.0,0.0,0.0,0
3,90.000000,95.000000,36.702083,122.0,86.000000,59.935607,30.000000,0.0,24.000000,46.184211,...,83.14,0,0.0,0.0,-0.03,4,2.0,1.0,0.0,0
4,103.000000,88.500000,36.702083,122.0,91.330000,59.935607,24.500000,0.0,24.000000,46.184211,...,83.14,0,0.0,0.0,-0.03,5,2.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,59.000000,98.000000,37.030858,103.0,71.000000,59.935607,18.000000,0.0,-0.663857,24.151601,...,44.70,0,0.0,1.0,-0.03,20,0.0,0.0,0.0,0
20,73.000000,100.000000,37.030858,97.0,67.000000,59.935607,18.000000,0.0,-0.663857,24.151601,...,44.70,0,0.0,1.0,-0.03,21,0.0,0.0,0.0,0
21,77.000000,99.000000,37.030858,102.0,76.000000,59.935607,17.000000,0.0,-0.663857,24.151601,...,44.70,0,0.0,1.0,-0.03,22,0.0,0.0,0.0,0
22,74.000000,100.000000,37.030858,105.0,78.000000,59.935607,16.000000,0.0,-0.663857,24.151601,...,44.70,0,0.0,1.0,-0.03,23,0.0,0.0,0.0,0


In [9]:
dataset = np.array(dataset)
dataset   #final data after preprocessing

array([[102.1509434 ,  91.33018868,  36.70208333, ...,   1.        ,
          1.        ,   0.        ],
       [ 97.        ,  95.        ,  36.70208333, ...,   1.        ,
          0.        ,   0.        ],
       [ 89.        ,  99.        ,  36.70208333, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [ 77.        ,  99.        ,  37.03085825, ...,   0.        ,
          0.        ,   0.        ],
       [ 74.        , 100.        ,  37.03085825, ...,   0.        ,
          0.        ,   0.        ],
       [ 68.        ,  99.        ,  36.06      , ...,   0.        ,
          0.        ,   0.        ]])

 ## spliting in train and test data

In [10]:
train_size = int(len(dataset) * 0.75)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))

319758 106587


In [11]:

trainX, trainY = train[:,:43] , train[:,43:]
testX, testY = test[:,:43] , test[:,43:]

# reshape input to be [samples, time steps, features]

trainX = trainX.reshape(train_size, trainX.shape[1], 1)
testX = testX.reshape(test_size, testX.shape[1], 1)

In [12]:
print(trainX.shape)
trainX

(319758, 43, 1)


array([[[102.1509434 ],
        [ 91.33018868],
        [ 36.70208333],
        ...,
        [  2.        ],
        [  1.        ],
        [  1.        ]],

       [[ 97.        ],
        [ 95.        ],
        [ 36.70208333],
        ...,
        [  2.        ],
        [  1.        ],
        [  0.        ]],

       [[ 89.        ],
        [ 99.        ],
        [ 36.70208333],
        ...,
        [  2.        ],
        [  0.        ],
        [  0.        ]],

       ...,

       [[ 85.07433839],
        [ 97.26810958],
        [ 37.03085825],
        ...,
        [  0.        ],
        [  0.        ],
        [  0.        ]],

       [[ 85.07433839],
        [ 97.26810958],
        [ 37.03085825],
        ...,
        [  0.        ],
        [  0.        ],
        [  0.        ]],

       [[ 85.07433839],
        [ 97.26810958],
        [ 37.03085825],
        ...,
        [  0.        ],
        [  0.        ],
        [  0.        ]]])

In [13]:
print(trainY.shape)
trainY

(319758, 1)


array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [1]:
#training using bidirectional lstm 

In [1]:
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=( 44, 1), merge_mode='concat'))
# model.add(Bidirectional(LSTM(64)))
opt = SGD(lr=0.01, momentum=0.3)
model.add(Dropout(0.2))
model.add(Dense(32))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics='accuracy')
history = model.fit(trainX, trainY, epochs=30, batch_size=32, verbose=0, validation_data=(testX, testY))

NameError: name 'Sequential' is not defined

In [None]:
history
show_history(history)
testPredict = model.predict(testX)

In [None]:
testPredict[43]