In [55]:
import numpy as np
import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
import tqdm 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import pickle

In [56]:
def load_challenge_data(file):
    """Load one pipe-delimited record file as a 2-D float array.

    The first line of the file is a ``|``-separated header and is skipped;
    every remaining line is parsed as a row of numbers.

    Args:
        file: Path of the file to read.

    Returns:
        numpy.ndarray of shape (n_rows, n_columns). Every column is kept,
        including a trailing ``SepsisLabel`` column if the file has one;
        load_data() splits the last column off as the label.
    """
    with open(file, 'r') as f:
        f.readline()  # consume the header so loadtxt only sees numeric rows
        # ndmin=2 keeps a file with a single data row two-dimensional; without
        # it loadtxt returns a 1-D array and column slicing such as
        # data[:, -1] raises IndexError.
        data = np.loadtxt(f, delimiter='|', ndmin=2)

    return data

In [57]:
def save_challenge_predictions(file, scores, labels):
    """Write paired scores and labels to `file` as a pipe-delimited table.

    The file starts with the header ``PredictedProbability|PredictedLabel``,
    followed by one ``score|label`` line per pair. Scores are formatted with
    ``%g`` and labels with ``%d``. Pairing uses zip(), so output stops at the
    shorter of the two sequences.

    Args:
        file: Path of the file to create or overwrite.
        scores: Iterable of numeric scores.
        labels: Iterable of numeric labels.
    """
    with open(file, 'w') as out:
        out.write('PredictedProbability|PredictedLabel\n')
        out.writelines('%g|%d\n' % pair for pair in zip(scores, labels))

In [58]:
def get_files(inpt_directory):
    """List the record file names found directly inside `inpt_directory`.

    A name is kept when it is a regular file, does not start with a dot, and
    ends with ``psv`` (compared case-insensitively). Names are returned
    without the directory prefix, in the order os.listdir() yields them.

    Args:
        inpt_directory: Directory to scan (not recursive).

    Returns:
        list of str: the matching file names.
    """
    def is_record(name):
        lowered = name.lower()
        if lowered.startswith('.') or not lowered.endswith('psv'):
            return False
        return os.path.isfile(os.path.join(inpt_directory, name))

    return [name for name in os.listdir(inpt_directory) if is_record(name)]

In [59]:
def load_data(inpt_directory):
    """Load every record file in a directory, split into features and labels.

    Files are discovered with get_files(), so entry i of the returned arrays
    corresponds to get_files(inpt_directory)[i] as long as the directory
    contents do not change between the two calls.

    Args:
        inpt_directory: Directory containing the record files.

    Returns:
        (x, y): two 1-D object arrays with one entry per file. x[i] is the 2-D
        array of all columns but the last of record i; y[i] is the 1-D array
        holding its last column.
    """
    files = get_files(inpt_directory)

    print('Reading files...')

    # Records have different numbers of rows, so they are stored in explicit
    # object arrays. Calling np.array() on such a ragged list raises
    # ValueError on NumPy >= 1.24.
    x = np.empty(len(files), dtype=object)
    y = np.empty(len(files), dtype=object)

    for i, f in enumerate(files):
        input_file = os.path.join(inpt_directory, f)
        # A one-row file may come back 1-D from the loader; force 2-D so the
        # column slicing below is always valid.
        data = np.atleast_2d(load_challenge_data(input_file))
        y[i] = data[:, -1]
        x[i] = data[:, :-1]

    return x, y

In [60]:
# Load the training records: xTrain/yTrain hold one entry per file found in
# the training directory.
input_directory = "./sepsis_data/train"

xTrain, yTrain = load_data(input_directory)
print(xTrain.shape)
print(yTrain.shape)

Reading files...
(28234,)
(28234,)




In [61]:
def missing_value_imputation(data):
    """Fill missing values column-wise: forward fill, then backward fill.

    Each NaN is replaced by the most recent earlier value in the same column;
    NaNs that remain at the top of a column are then filled from the first
    later value. A column that is entirely NaN stays NaN.

    NOTE: the backward fill copies values from later rows into earlier ones.

    Args:
        data: 2-D array-like accepted by pandas.DataFrame.

    Returns:
        pandas.DataFrame with the same shape as `data`.
    """
    # DataFrame.fillna(method=...) is deprecated in pandas 2.1+; ffill()/bfill()
    # are the supported equivalents and give the same result.
    return pd.DataFrame(data).ffill().bfill()

In [62]:
# Build the flat training matrix: impute each record on its own (so fills
# never cross record boundaries), then stack all rows into one 2-D array.
# Converting each imputed frame with to_numpy() replaces the original
# row-by-row .loc loop and yields the same values.
imputed_records = [
    missing_value_imputation(record).to_numpy()
    for record in tqdm.tqdm(xTrain)
]

# Columns that were entirely missing within a record are still NaN after the
# per-record fill; nan_to_num turns those into 0.
new_XTrain = np.nan_to_num(np.vstack(imputed_records))
# One label per row, concatenated in the same record order as the features.
new_YTrain = np.concatenate(list(yTrain))

100%|██████████| 28234/28234 [01:38<00:00, 286.85it/s]


In [9]:
# imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# imp = imp.fit(new_XTrain)
# new_XTrain = imp.transform(new_XTrain)

In [63]:
# Per-column mean of the flattened training matrix (axis=0 averages over rows).
np.mean(new_XTrain,axis=0)

array([ 8.42581490e+01,  9.71503540e+01,  3.66763621e+01,  1.22874056e+02,
        8.25452344e+01,  5.33322001e+01,  1.86384461e+01,  3.38655353e+00,
       -4.23147217e-02,  1.26203603e+01,  2.43276230e-01,  3.87112525e+00,
        2.07199861e+01,  3.46390919e+01,  5.50923259e+01,  2.22092884e+01,
        3.79324025e+01,  7.27455526e+00,  5.80359381e+01,  1.40047108e+00,
        1.06302751e-01,  1.28353074e+02,  7.16157176e-01,  1.85006575e+00,
        2.67155108e+00,  3.96554153e+00,  6.29336732e-01,  1.02547947e+00,
        3.06139571e+01,  1.02114001e+01,  2.03698281e+01,  1.06732081e+01,
        4.40202709e+01,  1.98588457e+02,  6.19712180e+01,  5.58426613e-01,
        3.01380499e-01,  3.03288356e-01, -5.54988678e+01,  2.70239639e+01])

In [11]:
# model = RandomForestClassifier(n_estimators=100,class_weight='balanced')
# model.fit(new_XTrain, new_YTrain)

RandomForestClassifier(class_weight='balanced')

In [64]:
# Fit an XGBoost classifier, constructed with no arguments (library defaults),
# on the flattened training rows.
model = xgb.XGBClassifier()
model.fit(new_XTrain, new_YTrain)

# NOTE: these predictions are made on the same rows the model was fitted on,
# so any metric computed from them is in-sample.
y_pred = model.predict(new_XTrain)
# Round each predicted label to the nearest integer.
predictions = [round(value) for value in y_pred]

In [65]:
# Evaluate the predictions: fraction of training rows whose predicted label
# matches the true label, printed as a percentage.
accuracy = accuracy_score(new_YTrain, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 98.79%


In [66]:
# Sum of the true training labels (a count of positive rows, assuming 0/1 labels).
np.sum(new_YTrain)

19543.0

In [67]:
# Sum of the predicted training labels, for comparison with the true-label sum.
np.sum(predictions)

7190

In [68]:
# Confusion matrix on the training rows; in scikit-learn's layout rows are
# true labels and columns are predicted labels.
confusion_matrix(new_YTrain,predictions)

array([[1064543,     377],
       [  12730,    6813]])

In [69]:
# Persist the fitted model to disk with pickle.
filename = './my_model_xgb.pkl'
# The context manager guarantees the file is flushed and closed; the bare
# open() used before left the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [70]:
# Load the test records: xTest/yTest hold one entry per file found in the
# test directory.
input_directory_test = "./sepsis_data/test"

xTest, yTest = load_data(input_directory_test)
print(xTest.shape)
print(yTest.shape)

Reading files...
(8069,)
(8069,)




In [71]:
# File names of the test records. files[idx] is later used as the output name
# for xTest[idx], so this listing must be in the same order load_data() used.
files = get_files("./sepsis_data/test")

In [72]:
# Directory that receives one prediction file per test record.
output_directory = "./model_save_xgb_test2/"

In [73]:
# Create the output directory if it is missing. makedirs(exist_ok=True) also
# creates missing parent directories and avoids the check-then-create race of
# a separate isdir() test followed by mkdir().
os.makedirs(output_directory, exist_ok=True)

In [74]:
# Number of listed test files; expected to equal xTest.shape[0].
len(files)

8069

In [75]:
# Score every test record and write one prediction file per record.
n = xTest.shape[0]

# files[idx] supplies the output name for xTest[idx]; refuse to run if the two
# listings are out of sync rather than writing mislabelled files.
assert len(files) == n, "file list and loaded records differ in length"

for idx in tqdm.tqdm(range(n)):
    # Same preprocessing as for training: per-record forward/backward fill,
    # then zero for columns that were entirely missing in this record.
    record_features = np.nan_to_num(
        missing_value_imputation(xTest[idx]).to_numpy()
    )

    labels = model.predict(record_features)
    # Column 1 of predict_proba is the probability of the second class,
    # which is what gets written as PredictedProbability.
    scores_res = model.predict_proba(record_features)[:, 1]

    output_file = os.path.join(output_directory, files[idx])
    save_challenge_predictions(output_file, scores_res, labels)

100%|██████████| 8069/8069 [01:09<00:00, 116.11it/s]


In [76]:
# Build the flat test matrix: impute each record on its own (so fills never
# cross record boundaries), then stack all rows into one 2-D array.
# Converting each imputed frame with to_numpy() replaces the original
# row-by-row .loc loop and yields the same values.
imputed_records = [
    missing_value_imputation(record).to_numpy()
    for record in tqdm.tqdm(xTest)
]

# Columns that were entirely missing within a record are still NaN after the
# per-record fill; nan_to_num turns those into 0.
new_XTest = np.nan_to_num(np.vstack(imputed_records))
# One label per row, concatenated in the same record order as the features.
new_YTest = np.concatenate(list(yTest))

100%|██████████| 8069/8069 [00:29<00:00, 276.78it/s]


In [77]:
# Predict labels for every flattened test row with the fitted model.
# NOTE: this rebinds y_pred and predictions, which previously held the
# training-set predictions.
y_pred = model.predict(new_XTest)
# Round each predicted label to the nearest integer.
predictions = [round(value) for value in y_pred]

In [78]:
# Confusion matrix on the test rows; in scikit-learn's layout rows are true
# labels and columns are predicted labels.
confusion_matrix(new_YTest,predictions)

array([[306664,   1751],
       [  5167,    414]])

In [79]:
# Directory that receives one prediction file per training record.
# NOTE: this rebinds output_directory, which previously pointed at the
# test-prediction directory.
output_directory = "./model_xgb_save_train_res/"

In [80]:
# Create the output directory if it is missing. makedirs(exist_ok=True) also
# creates missing parent directories and avoids the check-then-create race of
# a separate isdir() test followed by mkdir().
os.makedirs(output_directory, exist_ok=True)

In [81]:
# File names of the training records. files[idx] is later used as the output
# name for xTrain[idx], so this listing must be in the same order load_data()
# used. NOTE: this rebinds files, which previously held the test file names.
files = get_files("./sepsis_data/train")

In [82]:
# Score every training record and write one prediction file per record.
n = xTrain.shape[0]

# files[idx] supplies the output name for xTrain[idx]; refuse to run if the
# two listings are out of sync rather than writing mislabelled files.
assert len(files) == n, "file list and loaded records differ in length"

for idx in tqdm.tqdm(range(n)):
    # Loop-local name on purpose: the earlier version reused new_XTrain and
    # new_YTrain here, overwriting the flattened training matrix with the
    # rows of whichever record was processed last.
    record_features = np.nan_to_num(
        missing_value_imputation(xTrain[idx]).to_numpy()
    )

    labels = model.predict(record_features)
    # Column 1 of predict_proba is the probability of the second class,
    # which is what gets written as PredictedProbability.
    scores_res = model.predict_proba(record_features)[:, 1]

    output_file = os.path.join(output_directory, files[idx])
    save_challenge_predictions(output_file, scores_res, labels)
# new_XTest = np.array(new_XTest)
# new_YTest = np.array(new_YTest)

100%|██████████| 28234/28234 [04:01<00:00, 116.82it/s]
