#Preprocessing

In [76]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer


In [77]:
#equation for cleaning
def avg_val(x):
  if len(x) != 0:
    try:
      float_list = [float(i) for i in x]
      avg = sum(float_list) / len(x)
      return avg
    except:
      return x[len(x)-1]
  #for empty list
  return 0

def change_age(x):
  if x == '> 89':
    return '90'
  return x

def reduce_list(x):
  if len(x) == 0:
    return 0
  return x[0]

def fill_na_age(x):
  if len(x) == 0:
    return '30'
  return x

def fill_na_GCS(x):
  if len(x) == 0:
    return 15.0
  return x

def fill_na_gender(x):
  if len(x) == 0:
    return ['Male']  
  return x

def fill_na_ethnicity(x):
  if len(x) == 0:
    return ['Caucasian']
  return x

def fill_na_Capillary(x):
  if len(x) == 0:
    return ['normal']
  return x

def fix_GCS(x):
  if x == 'Unable to score due to medication':
    return 15.0
  return x

def reduce_list2(x):
  if len(x) == 0:
    return np.nan
  return x[0]

#medical data 
def BMI_cal(mass, height):
    #change heigh from cm to m 
    height = height / 100
    BMI = mass / (height * height)
    return BMI
##############################################################

In [78]:
def load_data(x_path):
    # Your code here
    df = pd.read_csv(x_path, index_col=0)
    x = df
    return x

def preprocess_x(df):
  ##for some columns specifically, remove row if data is empty
  #nursingchartvalue
  for index, val in df['nursingchartvalue'].iteritems():
    if pd.isna(val) and not(pd.isna(df.loc[index]['nursingchartcelltypevalname'])):
      #print(df.loc[index]['patientunitstayid'])
      df.drop(index, inplace = True)
  #labresult
  for index, val in df['labresult'].iteritems():
    if not(pd.isna(df.loc[index]['labname'])) and pd.isna(val):
      #print(df.loc[index]['patientunitstayid'])
      df.drop(index, inplace = True)

  # row is id number, columns are measuremnet, each cell holds a list
  df = df.groupby('patientunitstayid').agg(lambda x: list(x))

  #clean out nan value
  for index, row in df.iterrows():
      for column in df.columns:
          row[column] = [x for x in row[column] if str(x) != 'nan']
  
  #make columns for unique measurement ( labname, nurshing chartcelltypevalname, cellable)
  #labname
  lab_lst = []
  for val in df['labname']:
    for x in val:
      lab_lst.append(x)
  labname_vals = np.unique(lab_lst)
  #nursingchartcelltypevalname
  nurs_lst = []
  for val in df['nursingchartcelltypevalname']:
    for x in val:
      nurs_lst.append(x)
  nursing_vals = np.unique(nurs_lst)
  #celllabel
  cell_lst = []
  for val in df['celllabel']:
    for x in val:
      cell_lst.append(x)
  celllabel_vals = np.unique(cell_lst)

  #make columns for unique measurement
  #labname
  for i in labname_vals:
    df[i] = ''
    df[i] = df[i].apply(list)
  #nursingchartcelltypevalname
  for i in nursing_vals:
    df[i] = ''
    df[i] = df[i].apply(list)
  #celllabel
  for i in celllabel_vals:
    df[i] = ''
    df[i] = df[i].apply(list)

  #clean data
  df['gender'] = df['gender'].apply(fill_na_gender)
  df['age'] = df['age'].apply(fill_na_age)
  df['ethnicity'] = df['ethnicity'].apply(fill_na_ethnicity)

  # do flatten lists on admissionweight, admissionheight, age, gender, ethnicity, unitvisitnumber,patientunitstayid_dupe
  df["admissionheight"] = df["admissionheight"].apply(reduce_list)
  df["admissionweight"] = df["admissionweight"].apply(reduce_list)

  df["age"] = df["age"].apply(reduce_list)
  df["gender"] = df["gender"].apply(reduce_list)
  df["ethnicity"] = df["ethnicity"].apply(reduce_list)
  df["unitvisitnumber"] = df["unitvisitnumber"].apply(reduce_list)

  ########Group Data to new Columns #########
  for index, row in df.iterrows():
      #labname
      if(len(row['labname']) != 0 ):
        #if(len(row['labname']) != len(row['labresult'])):
          #for i in range(len(row['labname'])):
          # row[row['labname'][i]].append(row['labresult'][0])
        #add data
        #else:
        for i in range(len(row['labname'])):
          row[row['labname'][i]].append(row['labresult'][i])
      #nursingchartcelltypevalname
      if(len(row['nursingchartcelltypevalname']) != 0 ):
        #add data
        for i in range(len(row['nursingchartcelltypevalname'])):
          row[row['nursingchartcelltypevalname'][i]].append(row['nursingchartvalue'][i])
      #celllabel
      if(len(row['celllabel']) != 0 ):
        #add data
        for i in range(len(row['celllabel'])):
          row[row['celllabel'][i]].append(row['cellattributevalue'][i])

  
  #drop columns: cellattributevalue, celllabel, labmeasurenamesystem, labname, nursingchartcelltypevalname, nusrsingchartvalue, offset
  df.drop(columns = ['cellattributevalue', 'celllabel', 'labmeasurenamesystem', 'labname', 'nursingchartcelltypevalname', 'nursingchartvalue', 'offset', 'labresult'], inplace = True)  

  #clean capillary refill
  df['Capillary Refill'] = df['Capillary Refill'].apply(fill_na_Capillary)

  #clean the data ( get average measurements and clean age)
  df['glucose'] = df['glucose'].apply(avg_val)
  df['pH'] = df['pH'].apply(avg_val)
  df['GCS Total'] = df['GCS Total'].apply(avg_val)
  df['Heart Rate'] = df['Heart Rate'].apply(avg_val)
  df['Invasive BP Diastolic'] = df['Invasive BP Diastolic'].apply(avg_val)
  df['Invasive BP Mean'] = df['Invasive BP Mean'].apply(avg_val)
  df['Invasive BP Systolic'] = df['Invasive BP Systolic'].apply(avg_val)
  df['Non-Invasive BP Diastolic'] = df['Non-Invasive BP Diastolic'].apply(avg_val)
  df['Non-Invasive BP Mean'] = df['Non-Invasive BP Mean'].apply(avg_val)
  df['Non-Invasive BP Systolic'] = df['Non-Invasive BP Systolic'].apply(avg_val)
  df['O2 Saturation'] = df['O2 Saturation'].apply(avg_val)
  df['Respiratory Rate'] = df['Respiratory Rate'].apply(avg_val)
  df['Capillary Refill'] = df['Capillary Refill'].apply(avg_val)

  df['age'] = df['age'].apply(change_age)

  #change non num measurement using one hot encode
  gender_cols = pd.get_dummies(df['gender'], prefix='gender')
  ethicity_cols = pd.get_dummies(df['ethnicity'], prefix='ethnicity')
  capillary_cols = pd.get_dummies(df['Capillary Refill'], prefix='Capillary Refill')
  # add back to orignal df
  df = pd.concat([df, gender_cols, ethicity_cols, capillary_cols], axis=1)
  #drop old columns: ethnicity, gender, Capillary Refill
  df.drop(columns = ['ethnicity', 'gender','Capillary Refill'], inplace = True)

  #check if some cols exist then proceed
  if 'ethnicity_Native American' not in df.columns:
    df['ethnicity_Native American'] = 0.0
  if 'ethnicity_Other/Unknown' not in df.columns:
    df['ethnicity_Other/Unknown'] = 0.0
  if 'Capillary Refill_< 2 seconds' not in df.columns:
    df['Capillary Refill_< 2 seconds'] = 0.0
  if 'Capillary Refill_< 2 seconds' not in df.columns:
    df['Capillary Refill_> 2 seconds'] = 0.0
  if 'Capillary Refill_feet' not in df.columns:
    df['Capillary Refill_feet'] = 0.0
  if 'Capillary Refill_hands' not in df.columns:
    df['Capillary Refill_hands'] = 0.0
  if 'Capillary Refill_normal' not in df.columns:
    df['Capillary Refill_normal'] = 0.0
  

  #convert columns with str_ to str for model to read
  df.columns = df.columns.astype(str) 

  df = df.fillna(0.0)

  #Clean GCS
  df['GCS Total'] = df['GCS Total'].apply(fix_GCS)

  #change to float
  # convert the data in 'col1' from uint8 to float
  df['gender_Female'] = df['gender_Female'].astype('float')
  df['gender_Male'] = df['gender_Male'].astype('float')
  df['ethnicity_African American'] = df['ethnicity_African American'].astype('float')
  df['ethnicity_Asian'] = df['ethnicity_Asian'].astype('float')
  df['ethnicity_Caucasian'] = df['ethnicity_Caucasian'].astype('float')
  df['ethnicity_Hispanic'] = df['ethnicity_Hispanic'].astype('float')
  df['ethnicity_Native American'] = df['ethnicity_Native American'].astype('float')
  df['ethnicity_Other/Unknown'] = df['ethnicity_Other/Unknown'].astype('float')
  df['Capillary Refill_< 2 seconds'] = df['Capillary Refill_< 2 seconds'].astype('float')
  df['Capillary Refill_> 2 seconds'] = df['Capillary Refill_> 2 seconds'].astype('float')
  df['Capillary Refill_feet'] = df['Capillary Refill_feet'].astype('float')
  df['Capillary Refill_hands'] = df['Capillary Refill_hands'].astype('float')
  df['Capillary Refill_normal'] = df['Capillary Refill_normal'].astype('float')
  df['age'] = df['age'].astype('float')
  df['GCS Total'] = df['GCS Total'].astype('float')

  #don't be racist
  df.drop(columns = ['ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown'], inplace = True) 

  df.drop(columns = ['gender_Female','gender_Male'], inplace=True)

  data = df

  return data

# MODELING


In [79]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score, KFold
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [80]:
#test the model
x_train_path = 'train_x.csv'
df_train = load_data(x_train_path)
X = preprocess_x(df_train)

y = pd.read_csv('train_y.csv', index_col=1)
y = y.iloc[: , 1:]

#combine x and y based on id
df_model = pd.merge(X, y, left_index=True, right_index=True)

X = df_model.iloc[: , : df_model.shape[1]-1]
y = df_model.iloc[:,df_model.shape[1]-1]


  if (await self.run_code(code, result,  async_=asy)):


In [81]:
#################
#submission data
#################
x_test_path = 'test_x.csv'
df_test = load_data(x_test_path)
X_test_sub = preprocess_x(df_test)

#get the columns to be same order
cols = list(X.columns)
X_test_sub = X_test_sub.reindex(columns = cols)
X_sub_copy = X_test_sub

  if (await self.run_code(code, result,  async_=asy)):


In [82]:
#standardize
scaler = StandardScaler()
# Fit the scaler to the data
scaler.fit(X)
# Transform the training and testing data
X = scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the scaler to the data
scaler.fit(X_test_sub)
# Transform the training and testing data
X_test_sub = scaler.transform(X_test_sub)


In [83]:
ada = AdaBoostClassifier()

#Hypertunining
param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500, 1000],
              'learning_rate': [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'algorithm': ['SAMME', 'SAMME.R']}

#train with grid search
grid = GridSearchCV(ada, param_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                           0.8, 0.9, 1],
                         'n_estimators': [50, 100, 200, 300, 400, 500, 1000]})

In [84]:
grid.best_params_

{'algorithm': 'SAMME', 'learning_rate': 0.6, 'n_estimators': 500}

In [85]:
y_pred = grid.predict_proba(X_test)[:,1]
rocauc = roc_auc_score(y_test, y_pred)
print(rocauc)

0.884065934065934


In [86]:
y_pred_test = grid.predict_proba(X_test_sub)[:,1]
X_sub_copy['hospitaldischargestatus'] = y_pred_test
X_sub_copy = X_sub_copy.reset_index().rename(columns={'index': 'indexpatientunitstayid_col'})
output = pd.concat([X_sub_copy['patientunitstayid'], X_sub_copy['hospitaldischargestatus']], axis=1)
output["patientunitstayid"] = output["patientunitstayid"].astype(int)
print(output)

     patientunitstayid  hospitaldischargestatus
0               151179                 0.496186
1               151900                 0.455332
2               152954                 0.454302
3               158056                 0.421472
4               159411                 0.426532
..                 ...                      ...
499            3343295                 0.430637
500            3344944                 0.453348
501            3347502                 0.476903
502            3348293                 0.419819
503            3352230                 0.428027

[504 rows x 2 columns]


In [87]:
output.to_csv('output.csv', index = False)