In [1]:
# load cleaned data file (adm_ds2.csv) which is obtained after preprocessing

In [2]:
import pandas as pd
import numpy as np

import os, sys, math, csv, datetime, pickle, json, time
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load the libraries used for Machine Learning Analysis and training
import gc
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# load csv into a dataframe
# NOTE(review): the column listing recorded below includes 'Unnamed: 0', which
# looks like a saved index column — presumably the CSV was written with its
# index; confirm before relying on that column.
df = pd.read_csv('adm_ds2.csv')

In [8]:
# inspect the column names of the loaded dataframe
df.columns

Index(['Unnamed: 0', 'ROW_ID_x', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
       'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR',
       'TEXT', 'ROW_ID_y', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'NEXT_ADMITTIME', 'DAYS_IN_HOSPITAL',
       'DAYS_WITHIN_NEXT_ADMIT', 'TIME_IN_EMERGENCY_WARD',
       'DIFF_TIME_DEATH_DISCH', 'TARGET', 'in_failure', 'FOLLOW_UP',
       'service1', 'cleaned_text', 'sentiment', 'subjectivty',
       'cleaned_text1'],
      dtype='object')

In [9]:
# split the frame column-wise: every column except TARGET is kept as a
# candidate feature, and TARGET alone becomes the label series
feature_df = df.loc[:, df.columns != 'TARGET']
target_df = df['TARGET']

In [10]:
# hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs
train_features, test_features, train_target, test_target = train_test_split(
    feature_df,
    target_df,
    test_size=0.2,
    random_state=42,
)

In [11]:
# sanity-check the split: shapes of train features, test features,
# train labels and test labels, in that order
tuple(
    split.shape
    for split in (train_features, test_features, train_target, test_target)
)

((42180, 41), (10546, 41), (42180,), (10546,))

In [12]:
# the un-split frames are not referenced again; only the train/test splits
# are used from here on, so drop these names
del feature_df, target_df, df

In [13]:
# run a garbage-collection pass right after the deletions
gc.collect()

131

In [14]:
# TF-IDF vectorizer for the summary text, capped at 5000 vocabulary terms
# (max_features=5000)
tfidf_summary = TfidfVectorizer(max_features=5000)

In [15]:
# TF-IDF vectorizer for the service text; no max_features cap is set, so
# every term seen during fitting is kept
tfidf_service = TfidfVectorizer()

In [16]:
# fit the summary vectorizer on the training text only (missing texts are
# replaced with empty strings) and convert the sparse result to a dense array
summary_features = tfidf_summary.fit_transform(train_features['cleaned_text'].fillna('')).toarray()

In [17]:
# fit the service vectorizer on the training rows only; missing values are
# replaced with empty strings first (same handling as cleaned_text), because
# TfidfVectorizer raises on NaN documents
service_features = tfidf_service.fit_transform(train_features['service1'].fillna('')).toarray()

In [18]:
# count training rows whose cleaned_text is exactly the empty string
# (NaN entries compare unequal to '' and are therefore counted under False)
train_features['cleaned_text'].eq('').value_counts()

False    42180
Name: cleaned_text, dtype: int64

In [19]:
import pickle

In [1]:
# load train vectors for summary features (train_vectors_cleaned_text_cbow.pkl)

In [22]:
# load train vectors from pickled file
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
# The loaded object is concatenated column-wise with the other training
# features later on, so it needs one row per training sample.
with open('train_vectors_cleaned_text_cbow.pkl', 'rb') as fin:
  train_feature_vectors = pickle.load(fin)

In [23]:
# list the feature columns available in the training split (TARGET was
# separated out before the split)
train_features.columns

Index(['Unnamed: 0', 'ROW_ID_x', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
       'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR',
       'TEXT', 'ROW_ID_y', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'NEXT_ADMITTIME', 'DAYS_IN_HOSPITAL',
       'DAYS_WITHIN_NEXT_ADMIT', 'TIME_IN_EMERGENCY_WARD',
       'DIFF_TIME_DEATH_DISCH', 'in_failure', 'FOLLOW_UP', 'service1',
       'cleaned_text', 'sentiment', 'subjectivty', 'cleaned_text1'],
      dtype='object')

In [24]:
# categorical columns that receive a one-hot representation
ohe_cols = [
    'ADMISSION_TYPE',
    'ADMISSION_LOCATION',
    'DISCHARGE_LOCATION',
    'INSURANCE',
    'LANGUAGE',
    'RELIGION',
    'MARITAL_STATUS',
    'ETHNICITY',
    'DESCRIPTION',
]
ohe_objects = {}   # column name -> encoder fitted on the training rows
ohe_features = {}  # column name -> dense one-hot matrix for the training rows

# fit one encoder per column listed above; handle_unknown='ignore' makes
# categories that were not seen during fitting encode as all zeros later
for each_col in ohe_cols:
  encoder = OneHotEncoder(handle_unknown='ignore')
  ohe_features[each_col] = encoder.fit_transform(
      train_features[each_col].values.reshape((-1, 1))
  ).toarray()
  ohe_objects[each_col] = encoder
  print(ohe_features[each_col].shape)

(42180, 4)
(42180, 9)
(42180, 16)
(42180, 5)
(42180, 69)
(42180, 19)
(42180, 7)
(42180, 41)
(42180, 2)


In [25]:
# stack the per-column one-hot matrices side by side, in ohe_cols order,
# into a single array that is merged with the other features later
ohe_features_array = np.concatenate(
    [ohe_features[col] for col in ohe_cols],
    axis=1,
)

In [26]:
# the per-column matrices now live in ohe_features_array; drop the dict
del ohe_features

In [27]:
# check the combined one-hot matrix: (training rows, total one-hot columns)
ohe_features_array.shape

(42180, 172)

In [28]:
# numeric columns that are passed to the model as-is
float_cols = [
    'TIME_IN_EMERGENCY_WARD',
    'in_failure',
    'FOLLOW_UP',
    'sentiment',
    'subjectivty',  # spelled this way in the dataframe's column index
    'HOSPITAL_EXPIRE_FLAG',
    'DAYS_IN_HOSPITAL',
]

# pull those columns out of the training split as a plain NumPy array
float_features = train_features[float_cols].to_numpy()

In [29]:
# every training input has been extracted into arrays; the frame itself is
# not referenced again
del train_features

In [30]:
# combine all the feature groups built so far, column-wise, into the
# complete training feature matrix
all_train_features = np.concatenate(
    (
        summary_features,       # TF-IDF of cleaned_text
        service_features,       # TF-IDF of service1
        ohe_features_array,     # one-hot encoded categorical columns
        float_features,         # numeric columns
        train_feature_vectors,  # vectors loaded from the pickle file
    ),
    axis=1,
)

In [31]:
# check the final training matrix: (training rows, total feature columns)
all_train_features.shape

(42180, 5566)

In [32]:
# summary_features.shape, service_features.shape, diagnosis_features.shape, ohe_features_array.shape, float_features.shape

In [33]:
# create the xgboost classifier for training; no arguments are passed, so the
# library's default hyperparameters are used
xgb = XGBClassifier()

In [None]:
# train the XGBoost classifier on the combined training feature matrix and
# the TARGET labels of the train split
xgb.fit(all_train_features, train_target)

In [None]:
# the training matrix and labels are not referenced again after fitting
del all_train_features, train_target

In [None]:
# run a garbage-collection pass to help keep the notebook's memory use down
# after dropping the training arrays
gc.collect()

In [None]:
# transform test data to correct features - test data feature engineering

In [None]:
# get test data summary features: transform only (no refit), so the test text
# is encoded with the vocabulary learned from the training split; missing
# texts are replaced with empty strings first
test_summary_features = tfidf_summary.transform(test_features['cleaned_text'].fillna('')).toarray()

In [None]:
# the summary vectorizer is not used again
del tfidf_summary

In [None]:
# get test data service features: transform only (no refit), using the
# vectorizer fitted on the training split; missing values are replaced with
# empty strings first (same handling as cleaned_text), because
# TfidfVectorizer raises on NaN documents
test_service_features = tfidf_service.transform(test_features['service1'].fillna('')).toarray()

In [None]:
# the service vectorizer is not used again
del tfidf_service

In [None]:
# one-hot encode the test rows for the same ohe_cols, reusing the encoders
# that were fitted on the training data (transform only, no refit)
test_ohe_features = {}  # column name -> dense one-hot matrix for the test rows

for each_col in ohe_cols:
  fitted_encoder = ohe_objects[each_col]
  test_ohe_features[each_col] = fitted_encoder.transform(
      test_features[each_col].values.reshape((-1, 1))
  ).toarray()
  print(test_ohe_features[each_col].shape)

(10546, 4)
(10546, 9)
(10546, 16)
(10546, 5)
(10546, 69)
(10546, 19)
(10546, 7)
(10546, 41)
(10546, 2)


In [None]:
# stack the per-column test one-hot matrices side by side, in ohe_cols order
test_ohe_features_array = np.concatenate(
    [test_ohe_features[col] for col in ohe_cols],
    axis=1,
)

In [None]:
# the per-column test matrices and the fitted encoders are not used again
del test_ohe_features, ohe_objects

In [None]:
# run a garbage-collection pass after dropping the one-hot intermediates
gc.collect()

158

In [None]:
# numeric test features: the same float_cols used for training, taken from
# the test split as a plain NumPy array
test_float_features = test_features[float_cols].to_numpy()

In [None]:
# load the test data feature vectors (test_vectors_cleaned_text_cbow.pkl)

In [None]:
# load the test data feature vectors from its pickled file
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
# The loaded object is concatenated column-wise with the other test features
# later on, so it needs one row per test sample.
with open('test_vectors_cleaned_text_cbow.pkl', 'rb') as fin:
  test_feature_vectors = pickle.load(fin)

In [None]:
# every test input has been extracted into arrays; the frame itself is not
# referenced again
del test_features

In [None]:
# combine the test feature groups column-wise, in the same order used for the
# training matrix, to prepare the final test feature matrix
all_test_features = np.concatenate(
    (
        test_summary_features,    # TF-IDF of cleaned_text
        test_service_features,    # TF-IDF of service1
        test_ohe_features_array,  # one-hot encoded categorical columns
        test_float_features,      # numeric columns
        test_feature_vectors,     # vectors loaded from the pickle file
    ),
    axis=1,
)

In [None]:
# get predictions on test data

In [None]:
# get the probability scores of the test data features; predict_proba
# returns one column per class, and column 1 is used for scoring afterwards
test_pred_values = xgb.predict_proba(all_test_features)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# calculate AUROC score for the test dataset

In [None]:
# AUROC on the held-out test split, scored with the probabilities in column 1
# of the predict_proba output
# NOTE(review): column 1 is presumably the positive class (TARGET == 1) — confirm
roc_auc_score(
    y_true=test_target,
    y_score=test_pred_values[:, 1],
)

0.7338059925640901

Thus we obtain an AUROC score of **0.7338059925640901** on the test set, which is 0.02 lower than the state-of-the-art score.