# Models for Sepsis prediction before 12 hours

In [35]:
import numpy as np
import pandas as pd

In [36]:
data=pd.read_csv("total.csv")
df=pd.read_csv("test.csv")

In [27]:
data.drop(columns=data.columns[0], axis=1, inplace=True)

**Sepsis shift and imputation using forward fill**

In [37]:
data["sepsis_shift"]= data.groupby('Patient_ID')['sepsis_label'].shift(-6)
df["sepsis_shift"]= df.groupby('Patient_ID')['sepsis_label'].shift(-6)

In [38]:
data["sepsis_shift"] = data["sepsis_shift"].fillna(method='ffill')
df["sepsis_shift"] = df["sepsis_shift"].fillna(method='ffill')

In [39]:
X = data.drop(["sepsis_label","Patient_ID","sepsis_shift"],axis=1)
y = data["sepsis_shift"]

**Sampling**

In [40]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_train_over,y_train_over = oversample.fit_resample(X, y)

In [36]:
from imblearn.under_sampling import NearMiss
nr = NearMiss()
X_train_over, y_train_over = nr.fit_resample(X, y.ravel())

In [41]:
X_val = df.drop(["sepsis_label","Patient_ID","sepsis_shift"],axis=1)
y_val=df["sepsis_shift"]

**Standard Scaler**

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Instantiate encoder/scaler
scaler = StandardScaler()


# train data
# Scale and Encode Separate Columns
train_scaled_columns  = scaler.fit_transform(X_train_over) 
train_x=scaler.fit_transform(X_train_over)
#train_Y=scaler.fit_transform(y_train_over)
#test_Y=scaler.fit_transform(y_val)
test_x=scaler.fit_transform(X_val)

**Logistic Regression**

In [27]:
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False).fit(train_x,y_train_over)

In [28]:
confusion_matrix(y_val, model.predict(test_x))

array([[51961, 44667],
       [  768,  2604]], dtype=int64)

In [29]:
print(classification_report(y_val, model.predict(test_x)))

              precision    recall  f1-score   support

         0.0       0.99      0.54      0.70     96628
         1.0       0.06      0.77      0.10      3372

    accuracy                           0.55    100000
   macro avg       0.52      0.65      0.40    100000
weighted avg       0.95      0.55      0.68    100000



Accuracy of 54% with recall of 77% for sepsis patients. Oversampling does better

**Random Forest**

In [51]:
from sklearn.ensemble import RandomForestClassifier #undersample does better
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(train_x, y_train_over)
y_pred = clf.predict(test_x)

In [52]:
confusion_matrix(y_val, y_pred)

array([[36522, 60106],
       [ 1095,  2277]], dtype=int64)

In [53]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.38      0.54     96628
         1.0       0.04      0.68      0.07      3372

    accuracy                           0.39    100000
   macro avg       0.50      0.53      0.31    100000
weighted avg       0.94      0.39      0.53    100000



Accuracy of 39% with recall of 68% for sepsis patients. Undersampling does better

**LightGBM**

In [38]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [39]:
model = LGBMClassifier() #undersample does better
model.fit(train_x, y_train_over)
pred = model.predict(test_x)
accuracy = model.score(test_x, y_val)

In [40]:
confusion_matrix(y_val, pred)

array([[32835, 63793],
       [  424,  2948]], dtype=int64)

In [41]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

         0.0       0.99      0.34      0.51     96628
         1.0       0.04      0.87      0.08      3372

    accuracy                           0.36    100000
   macro avg       0.52      0.61      0.29    100000
weighted avg       0.96      0.36      0.49    100000



Accuracy of 36% with recall of 86% for sepsis patients. Undersampling does better

**Extra Trees Classifier**

In [42]:
from sklearn.ensemble import ExtraTreesClassifier #undersample does better
extra_tree_forest = ExtraTreesClassifier(
                                        criterion ='entropy')
extra_tree_forest.fit(train_x, y_train_over)
pred = extra_tree_forest.predict(test_x)
accuracy = extra_tree_forest.score(test_x, y_val)

In [43]:
confusion_matrix(y_val, pred)

array([[38380, 58248],
       [  528,  2844]], dtype=int64)

In [44]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

         0.0       0.99      0.40      0.57     96628
         1.0       0.05      0.84      0.09      3372

    accuracy                           0.41    100000
   macro avg       0.52      0.62      0.33    100000
weighted avg       0.95      0.41      0.55    100000



Accuracy of 41% with recall of 84% for sepsis patients. Undersampling does better

# Gaussian Naive bayes with 13 features

In [19]:
#the best
from sklearn.naive_bayes import GaussianNB #oversample does better
gnb = GaussianNB()
gnb.fit(train_x, y_train_over)
y_pred = gnb.predict(test_x)

In [20]:
confusion_matrix(y_val, y_pred)

array([[74494, 22134],
       [ 1450,  1922]], dtype=int64)

In [21]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.77      0.86     96628
         1.0       0.08      0.57      0.14      3372

    accuracy                           0.76    100000
   macro avg       0.53      0.67      0.50    100000
weighted avg       0.95      0.76      0.84    100000



Accuracy of 76% with recall of 57% for sepsis patients. Undersampling does better

**K Means**

In [44]:
from sklearn.cluster import KMeans
model1=KMeans(n_clusters=2, random_state=0).fit(train_x)
kmeans_pred=model1.fit_predict(test_x)

In [45]:
confusion_matrix(y_val, kmeans_pred)

array([[46313, 50315],
       [ 1862,  1510]], dtype=int64)

In [46]:
print(classification_report(y_val, kmeans_pred))

              precision    recall  f1-score   support

         0.0       0.96      0.48      0.64     96628
         1.0       0.03      0.45      0.05      3372

    accuracy                           0.48    100000
   macro avg       0.50      0.46      0.35    100000
weighted avg       0.93      0.48      0.62    100000



Accuracy of 48% with recall of 45% for sepsis patients. oversampling does better

# Support Vector Machines in pre-processed data by LDA with 13 features

In [7]:
X = data.drop(["sepsis_label","Patient_ID","HR","O2Sat","Temp","SBP","MAP","DBP","Resp","BaseExcess","PaCO2","Calcium","Chloride",
               "Lactate","Glucose","Phosphate","Hct","Hgb","PTT","Age","Platelets","Temp_diff","HR_diff","HCO3","SaO2","pH"
              ,"SBP_diff","MAP_diff","DBP_diff","SPO2_diff","Creatinine","Bilirubin_total","sepsis_shift"],axis=1)
y = data["sepsis_shift"]

In [8]:
from imblearn.under_sampling import NearMiss
nr = NearMiss()
X_train_over, y_train_over = nr.fit_resample(X, y.ravel())

In [9]:
X_val = df.drop(["sepsis_label","Patient_ID","HR","O2Sat","Temp","SBP","MAP","DBP","Resp","BaseExcess","PaCO2","Calcium","Chloride",
               "Lactate","Glucose","Phosphate","Hct","Hgb","PTT","Age","Platelets","Temp_diff","HR_diff","HCO3","SaO2","pH"
              ,"SBP_diff","MAP_diff","DBP_diff","SPO2_diff","Creatinine","Bilirubin_total","sepsis_shift"],axis=1)
y_val=df["sepsis_shift"]

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Instantiate encoder/scaler
scaler = StandardScaler()


# train data
# Scale and Encode Separate Columns
train_scaled_columns  = scaler.fit_transform(X_train_over) 
train_x=scaler.fit_transform(X_train_over)
#train_Y=scaler.fit_transform(y_train_over)
#test_Y=scaler.fit_transform(y_val)
test_x=scaler.fit_transform(X_val)

In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA()
model=lda.fit(train_x, y_train_over)
y_pred=model.predict(test_x)
X_train_lda = lda.transform(train_x)
X_test_lda = lda.transform(test_x)

In [12]:
from sklearn.svm import SVC  
clf = SVC(C=1000,kernel='poly') 

In [13]:
model=clf.fit(X_train_lda, y_train_over)

In [14]:
pred=model.predict(X_test_lda)

In [15]:
confusion_matrix(y_val, pred)

array([[83724, 12904],
       [ 1918,  1454]], dtype=int64)

In [16]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.87      0.92     96628
         1.0       0.10      0.43      0.16      3372

    accuracy                           0.85    100000
   macro avg       0.54      0.65      0.54    100000
weighted avg       0.95      0.85      0.89    100000



**Accuracy of 85% with recall of 43% for sepsis patients**