In [27]:
#%%
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier, XGBRegressor
from pathlib import Path

VITALS = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
# TESTS = TESTS[0:1]


def get_score(df_true, df_submission):
    df_submission = df_submission.sort_values('pid')
    df_true = df_true.sort_values('pid')
    task1 = np.mean([metrics.roc_auc_score(df_true[entry], df_submission[entry]) for entry in TESTS])
    task2 = metrics.roc_auc_score(df_true['LABEL_Sepsis'], df_submission['LABEL_Sepsis'])
    task3 = np.mean([0.5 + 0.5 * np.maximum(0, metrics.r2_score(df_true[entry], df_submission[entry])) for entry in VITALS])
    score = np.mean([task1, task2, task3])
    print(task1, task2, task3)
    return score

#%%
#import csv as Dataframe
data_train = pd.read_csv('train_features.csv')
label_train = pd.read_csv('train_labels.csv')
data_test = pd.read_csv('test_features.csv')


In [28]:
data_train

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1,3,34.0,,,12.0,,36.0,8.7,24.0,...,,100.0,,114.0,24.6,94.0,,,142.0,7.33
1,1,4,34.0,,,,,36.0,,,...,,100.0,,,,99.0,,,125.0,7.33
2,1,5,34.0,,,,,36.0,,,...,,100.0,,,,92.0,,,110.0,7.37
3,1,6,34.0,,,,,37.0,,,...,,100.0,,,,88.0,,,104.0,7.37
4,1,7,34.0,,,,,,,,...,,100.0,,,22.4,81.0,,,100.0,7.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227935,9999,8,85.0,,,,,,,,...,,,,,,80.0,,,110.0,
227936,9999,9,85.0,,,,,,,,...,,,,,,83.0,,,123.0,
227937,9999,10,85.0,,,,,36.0,,,...,,98.0,,,,80.0,,,138.0,
227938,9999,11,85.0,,,,,,10.2,,...,,98.0,,,31.0,75.0,,,125.0,


In [29]:
label_train[TESTS[0]].sum()/label_train.shape[0]

data_train = data_train.groupby('pid').apply(lambda group: group.interpolate(method='index'))
data_test = data_test.groupby('pid').apply(lambda group: group.interpolate(method='index'))
data_train

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1,3,34.0,,,12.0,,36.0,8.7000,24.00,...,,100.000000,,114.000,24.6000,94.0,,,142.0,7.33
1,1,4,34.0,,,12.0,,36.0,8.6750,24.25,...,,100.000000,,113.625,24.0500,99.0,,,125.0,7.33
2,1,5,34.0,,,12.0,,36.0,8.6500,24.50,...,,100.000000,,113.250,23.5000,92.0,,,110.0,7.37
3,1,6,34.0,,,12.0,,37.0,8.6250,24.75,...,,100.000000,,112.875,22.9500,88.0,,,104.0,7.37
4,1,7,34.0,,,12.0,,37.0,8.6000,25.00,...,,100.000000,,112.500,22.4000,81.0,,,100.0,7.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227935,9999,8,85.0,,36.4,30.0,,36.0,9.6375,25.00,...,,96.666667,,107.000,29.8375,80.0,,,110.0,
227936,9999,9,85.0,,36.4,30.0,,36.0,9.8250,25.00,...,,97.333333,,107.000,30.2250,83.0,,,123.0,
227937,9999,10,85.0,,36.4,30.0,,36.0,10.0125,25.00,...,,98.000000,,107.000,30.6125,80.0,,,138.0,
227938,9999,11,85.0,,36.4,30.0,,36.0,10.2000,25.00,...,,98.000000,,107.000,31.0000,75.0,,,125.0,


In [30]:
def get_representative(df, method):
    grouped = df.groupby('pid')
    if method == 'mean':
        result = grouped.mean()
        result.fillna(df.mean(), inplace=True)
    elif method == 'median':
        result = grouped.median()
        result.fillna(df.median(), inplace=True)

    else:
        raise ValueError('method undefined')
    result.columns = [col+'_'+method for col in result.columns]
    return result.reset_index(drop=False)



In [31]:
def get_std(df):
    df = df[[col for col in df.columns if col!='Time']]
    new_feature = df.groupby('pid').std().fillna(0) # Wei thinks it reasonable to fill 0 here
    new_feature.columns = [col+'_std' if col!='pid' else col for col in new_feature.columns]
    return new_feature.reset_index(drop=False)

In [32]:
def get_max(df, mode='max', fill_method='median'):
    df = df[[col for col in df.columns if col!='Time']]
    grouped = df.groupby('pid')
    if fill_method == 'median':
        default = df.median()
    if mode == 'max':
        new_feature = grouped.max().fillna(default)
        new_feature.columns = [col+'_max' if col!='pid' else col for col in new_feature.columns]
    elif mode == 'min':
        new_feature = grouped.min().fillna(default)
        new_feature.columns = [col+'_min' if col!='pid' else col for col in new_feature.columns]
    else:
        raise ValueError("choose mode between max and min")
    return new_feature.reset_index(drop=False)

In [33]:
def count_nan(df):
    df = df[[col for col in df.columns if col!='Time']]
    grouped = df.groupby('pid')
    new_feature = grouped.apply(lambda x: x.isna().sum().sum())
    return pd.DataFrame(new_feature.rename('na_count')).reset_index(drop=False)

In [34]:
def count_oscillations(df, benchmark:pd.Series, mode):
    """
    Benchmark should be ginven as a Series, indexed by columns of the df
    For example, it could be df.median()

    mode should be either 'free' or 'fixed'.
    In case of "free", any columns with all zero oscillations among all groups would be dropped.
    Otherwise, the result keeps ALL columns.
    Don't forget to use the training data columns to select the columns in the test data set. 
    
    !!! USE FREE in the training process and FIXED while testing!!!
    """
    if not mode in ['free', 'fixed']:
        raise ValueError('choose a mode between free or fixed. choose free if this is a training process')
    def identify_and_count_osci(group):
        debenched_group = group-benchmark
        debenched_group.fillna(method='ffill')
        osci_id = debenched_group * debenched_group.shift(1) # oscilation is detected whenever negative values appears
        return (osci_id<0).sum()
    df = df[[col for col in df.columns if col!='Time']]
    grouped = df.groupby('pid')
    new_feature = grouped.apply(identify_and_count_osci)
    new_feature = new_feature[[col for col in new_feature.columns if col!='Time']]
    if mode == 'free':
        new_feature = new_feature[[col for col in new_feature.columns if new_feature[col].sum()>0]]
    else:
        new_feature[[col for col in new_feature.columns if col!='pid']]
        print('DO REMEMBER TO SELECT TEST DATA COLUMNS WITH TRAIN DATA COLUMNS')
    new_feature.columns = [col+'_socis' for col in new_feature.columns]
    new_feature.reset_index(drop=False, inplace=True)
    return new_feature
    

In [35]:
def my_merge_on_pid(*dfs):
    # import pdb; pdb.set_trace()
    result = dfs[0][['pid']]
    for df in dfs:
        result = result.merge(df, on='pid')
    return result

In [36]:
def get_full_char(df, train_test, repre_method='median', result_cols=None):
    if train_test not in ['train', 'test']:
        raise ValueError('Choose train_test between train and test. This is used to decide the mode in count_oscillations')
    if train_test=='test' and (result_cols is None):
        raise ValueError('please select test data columns with the training set')
    df_repre = get_representative(df, method=repre_method)
    df_std = get_std(df)
    df_max = get_max(df, mode='max')
    df_min = get_max(df, mode='min')
    df_count_nan = count_nan(df)
    df_osci = count_oscillations(df, benchmark=df.median(), mode=("free" if train_test=='train' else 'fixed'))
    result = my_merge_on_pid(df_repre, df_std, df_max, df_min, df_count_nan, df_osci)
    if train_test=='test':
        result = result[result_cols]
        if set(result.columns) != set(result_cols):
            raise ValueError('result_cols does not match')
        else:
            print("And I did!")
    return result

In [37]:
df_train = get_full_char(data_train, 'train', repre_method='median')

In [38]:
df_test = get_full_char(data_test, 'test', repre_method='median', result_cols=df_train.columns)

DO REMEMBER TO SELECT TEST DATA COLUMNS WITH TRAIN DATA COLUMNS
And I did!


In [39]:
#normalise the data between 0 and 1
scaler = MinMaxScaler()
scaler.fit(df_train.drop(columns=['pid']))

# to make sure that we are using the training lable in the correct order as df_train
full_train = pd.merge(label_train, df_train, on='pid', how='inner').sort_values('pid')
if len(set(label_train.columns).intersection(set(df_train.columns)))!=1:
    raise ValueError("the lable_train and df_train DataFrames shares abnormal column names")
label_train, df_train = full_train[label_train.columns], full_train[df_train.columns]


mod_data = scaler.transform(df_train.drop(columns=['pid']))
mod_test = scaler.transform(df_test.drop(columns=['pid']))

ORDERED_TEST_PID = df_test['pid']


In [40]:
# model= SVC(probability=True)
# y_test = [None for _ in TESTS]
# for i in range(len(TESTS)):
#     label_tests = label_train[TESTS[i]].to_numpy()
#     model.fit(mod_data, label_tests)
#     y_test[i] = model.predict_proba(mod_test)
    
# print(y_test)

In [41]:
def assemble_results(y_test: list, attributes:list, mode='classification'):
    if mode=='classification':
        result = pd.DataFrame([pd.DataFrame(y_test[i])[1].rename(attributes[i]) for i in range(len(y_test))]).T
    elif mode=='regression':
        result = pd.DataFrame([pd.DataFrame(y_test[i])[0].rename(attributes[i]) for i in range(len(y_test))]).T
    else:
        raise ValueError("choose mode from classification or regrassion")
    result['pid'] = ORDERED_TEST_PID
    return result

In [42]:
#install xgboost
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
y_test_1 = [None for _ in TESTS]
for i in range(len(TESTS)):
    label_tests = label_train[TESTS[i]].to_numpy()
    model.fit(mod_data, label_tests)
    y_test_1[i] = model.predict_proba(mod_test)
    
print(pd.DataFrame([pd.DataFrame(y_test_1[i])[0].describe().rename(TESTS[i]) for i in range(len(y_test_1))]))

    


                          count      mean       std       min       25%  \
LABEL_BaseExcess        12664.0  0.664474  0.330466  0.000102  0.411159   
LABEL_Fibrinogen        12664.0  0.842996  0.188775  0.000353  0.792286   
LABEL_AST               12664.0  0.763119  0.217351  0.000378  0.696857   
LABEL_Alkalinephos      12664.0  0.765684  0.220925  0.000271  0.700951   
LABEL_Bilirubin_total   12664.0  0.776054  0.208803  0.000077  0.728811   
LABEL_Lactate           12664.0  0.746212  0.229705  0.000106  0.675518   
LABEL_TroponinI         12664.0  0.772121  0.246697  0.001321  0.660952   
LABEL_SaO2              12664.0  0.770861  0.273853  0.000790  0.698108   
LABEL_Bilirubin_direct  12664.0  0.985260  0.068705  0.014025  0.992126   
LABEL_EtCO2             12664.0  0.954715  0.174952  0.000248  0.993218   

                             50%       75%       max  
LABEL_BaseExcess        0.821706  0.927643  0.999720  
LABEL_Fibrinogen        0.912995  0.970169  0.999565  
LABEL_AST

In [43]:
#y_test_stacked = np.stack([arr[:, 1] for arr in y_test_xgboost], axis=1)
#print(y_test_stacked)

result_1 = assemble_results(y_test_1, TESTS)


In [44]:
label_sepsis = label_train['LABEL_Sepsis'].to_numpy()
model.fit(mod_data, label_sepsis)

y_sepsis = model.predict_proba(mod_test)

print(y_sepsis)

#save the results in .csv
y_test_stacked = np.stack([arr[:, 1] for arr in [y_sepsis]], axis=1)
pd.DataFrame(y_sepsis).to_csv("resultInterpo.csv", header=None, index=None)
pd.DataFrame(y_sepsis)[0].describe().rename("LABEL_Sepsis")


[[0.9887821  0.0112179 ]
 [0.99355745 0.00644255]
 [0.9854014  0.0145986 ]
 ...
 [0.93270385 0.06729613]
 [0.9969052  0.00309477]
 [0.96529084 0.03470918]]


count    12664.000000
mean         0.962317
std          0.070795
min          0.039328
25%          0.962606
50%          0.984329
75%          0.992943
max          0.999837
Name: LABEL_Sepsis, dtype: float64

In [45]:
result_2 = assemble_results([y_sepsis], ["LABEL_Sepsis"])
result_2


Unnamed: 0,LABEL_Sepsis,pid
0,0.011218,0
1,0.006443,3
2,0.014599,5
3,0.001601,7
4,0.004709,9
...,...,...
12659,0.004406,31647
12660,0.010459,31649
12661,0.067296,31651
12662,0.003095,31652


In [46]:
# Subtask 3
model = XGBRegressor()
y_test_3 = [None for _ in VITALS]

for i in range(len(VITALS)):
    label_vitals = label_train[VITALS[i]].to_numpy()
    model.fit(mod_data, label_vitals)
    y_test_3[i] = model.predict(mod_test)
    
print(pd.DataFrame([pd.DataFrame(y_test_3[i])[0].describe().rename(VITALS[i]) for i in range(len(y_test_3))]))

                   count       mean        std        min        25%  \
LABEL_RRate      12664.0  18.926367   2.508752   6.116487  17.291480   
LABEL_ABPm       12664.0  82.190292  10.003599  37.117935  75.022661   
LABEL_SpO2       12664.0  97.028702   1.386926  66.114067  96.266953   
LABEL_Heartrate  12664.0  84.128494  12.088386  44.417557  75.702778   

                       50%        75%         max  
LABEL_RRate      18.581676  20.247764   32.383053  
LABEL_ABPm       81.063896  88.102169  125.130310  
LABEL_SpO2       97.148663  97.957872  100.511528  
LABEL_Heartrate  83.630100  91.996450  132.710922  


In [47]:
result_3 = assemble_results(y_test_3, VITALS, mode='regression')

In [48]:
result = pd.concat([result_1.set_index('pid'), result_2.set_index('pid'), result_3.set_index('pid')] ,axis=1).reset_index(drop=False)
result

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.997287,0.060198,0.938479,0.848257,0.969444,0.225589,0.069110,0.367288,0.021227,0.000836,0.011218,13.890991,86.016632,99.527824,89.605682
1,3,0.290732,0.354087,0.204996,0.136930,0.102386,0.161700,0.496047,0.038160,0.001190,0.014401,0.006443,17.068874,81.230774,97.245277,92.576202
2,5,0.089802,0.509423,0.170158,0.101472,0.194781,0.134960,0.165679,0.094388,0.001263,0.004134,0.014599,18.886738,73.599373,95.531258,69.964409
3,7,0.997033,0.926735,0.991182,0.996383,0.981564,0.320754,0.014971,0.700349,0.274932,0.004167,0.001601,16.547239,92.403847,97.979523,88.265747
4,9,0.184695,0.050244,0.328015,0.306435,0.123900,0.059854,0.026679,0.030713,0.000260,0.000227,0.004709,20.226013,91.337921,96.665764,94.331017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,31647,0.061043,0.112846,0.178918,0.068191,0.152859,0.066085,0.013367,0.013896,0.008203,0.000183,0.004406,16.402905,68.809837,97.040710,71.492851
12660,31649,0.885303,0.028581,0.614593,0.505235,0.521678,0.766665,0.255125,0.196860,0.007361,0.002078,0.010459,16.486376,83.973259,96.923500,96.411850
12661,31651,0.864851,0.049909,0.153308,0.527436,0.418658,0.072934,0.146100,0.154280,0.003212,0.001054,0.067296,19.226627,74.215561,99.116592,83.077446
12662,31652,0.009845,0.019937,0.041975,0.043883,0.245567,0.113410,0.017505,0.004032,0.001886,0.005651,0.003095,18.730450,92.535973,97.593262,115.717842


In [49]:
cols = pd.read_csv('sample.csv',nrows=1).columns
if set(result.columns)!=set(cols):
    raise ValueError('result columns does not coincide with target')
result = result[cols] # reorder columns to make sure our submission is in the desired form


In [51]:
def archive_existing_result():
    cwd = Path.cwd()
    archive_dir = Path(cwd, "archived_predictions")
    if not archive_dir.exists():
        archive_dir.mkdir()
    if Path(cwd,'prediction.zip').exists():
        i = len(list(archive_dir.glob("prediction*.zip")))
        Path(cwd,'prediction.zip').rename(Path(archive_dir, f'prediction{i+1}.zip'))
    return None

# %%
def generate_submission(result):
    archive_existing_result()
    result.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')
    return None

In [52]:
generate_submission(result)

In [24]:
# filename = 'sample.csv'
# df_submission = pd.read_csv(filename)
# print(df_submission)

# # generate a baseline based on sample.zip
# df_true = pd.read_csv(filename)
# for label in TESTS + ['LABEL_Sepsis']:
#     # round classification labels
#     df_true[label] = np.around(df_true[label].values)

# print('Score of sample.zip with itself as groundtruth', get_score(df_true, df_submission))


In [25]:
# suppose df is a pandas dataframe containing the result
#df.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')