In [59]:
# load cleaned data file (adm_ds2.csv) which is obtained after preprocessing

In [60]:
import pandas as pd
import numpy as np
import os, sys, math, csv, datetime, pickle, json, time

In [61]:
# load the libraries used for Machine Learning Analysis and training
import gc
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
from sklearn.decomposition import TruncatedSVD

In [63]:
from sklearn.preprocessing import OneHotEncoder

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
from nltk.tokenize import word_tokenize

In [66]:
import nltk

In [67]:
# load csv into a dataframe
df = pd.read_csv('adm_ds2.csv')

In [68]:
# separate features and TARGET columns
feature_df = df[df.columns[~df.columns.isin(['TARGET'])]]
target_df = df.TARGET

In [69]:
# get 20% of total data into test dataset (train test split)
train_features, test_features, train_target, test_target = train_test_split(feature_df, target_df, random_state=42, test_size=0.2)

In [70]:
train_features.shape, test_features.shape, train_target.shape, test_target.shape

((42180, 41), (10546, 41), (42180,), (10546,))

In [71]:
del feature_df, target_df

In [72]:
# get maximum 3000 features of summary text
tfidf_summary = TfidfVectorizer(max_features=3000)

In [73]:
# use all the service tfidf feature
tfidf_service = TfidfVectorizer()

In [74]:
# get maximum 2000 features of diagnosis text
tfidf_diagnosis = TfidfVectorizer(max_features=2000)

In [75]:
# get tfidf features of processed cleaned summary and service and diagnosis

In [76]:
summary_features = tfidf_summary.fit_transform(train_features['cleaned_text'].fillna('')).toarray()

In [77]:
service_features = tfidf_service.fit_transform(train_features['service1']).toarray()

In [78]:
diagnosis_features = tfidf_diagnosis.fit_transform(train_features['DIAGNOSIS']).toarray()

In [79]:
# load train vectos for summary features (train_vectors_cleaned_text_cbow.pkl)

In [80]:
import pickle

# load train vectors from pickled file
with open('train_vectors_cleaned_text_cbow.pkl', 'rb') as fin:
  train_vectors = pickle.load(fin)

In [81]:
train_features.columns

Index(['Unnamed: 0', 'ROW_ID_x', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
       'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR',
       'TEXT', 'ROW_ID_y', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'NEXT_ADMITTIME', 'DAYS_IN_HOSPITAL',
       'DAYS_WITHIN_NEXT_ADMIT', 'TIME_IN_EMERGENCY_WARD',
       'DIFF_TIME_DEATH_DISCH', 'in_failure', 'FOLLOW_UP', 'service1',
       'cleaned_text', 'sentiment', 'subjectivty', 'cleaned_text1'],
      dtype='object')

In [82]:
ohe_cols = [
    'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
    'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DESCRIPTION'
]
ohe_objects = {}
ohe_features = {}

# create one-hot features, get one hot feature representation of above columns

for each_col in ohe_cols:
  ohe_objects[each_col] = OneHotEncoder(handle_unknown='ignore')
  ohe_features[each_col] = ohe_objects[each_col].fit_transform(train_features[each_col].values.reshape((-1, 1))).toarray()
  print(ohe_features[each_col].shape)

(42180, 4)
(42180, 9)
(42180, 16)
(42180, 5)
(42180, 69)
(42180, 19)
(42180, 7)
(42180, 41)
(42180, 2)


In [83]:
# use the combined one hot representation with other features obtained below
ohe_features_array = np.concatenate(list(ohe_features.values()), axis=1)

In [84]:
del ohe_features

In [85]:
ohe_features_array.shape

(42180, 172)

In [86]:
# combine the float / numeric columns
float_cols = [
    'TIME_IN_EMERGENCY_WARD', 'in_failure', 'FOLLOW_UP', 'sentiment', 'subjectivty',
    'HOSPITAL_EXPIRE_FLAG', 'DAYS_IN_HOSPITAL'
]

float_features = train_features[float_cols].values

In [87]:
del train_features

In [88]:
# combine all the feature columns here to get complete training features
all_train_features = np.concatenate(
    [
     summary_features,
     service_features,
    #  diagnosis_features,
     ohe_features_array,
     float_features,
     train_vectors
    ],
    axis=1
)

Create an XGBoost Classifier for training the features obtained above (combination of different features)

In [89]:
# create the xgboost classifier for training
xgb = XGBClassifier()

In [90]:
# train the XGBOost classifier on train dataset
xgb.fit(all_train_features, train_target)

XGBClassifier()

In [91]:
# del all_train_features, train_target

In [92]:
# transform test data to correct features - test data feature engineering

In [93]:
# get test data summaries features
test_summary_features = tfidf_summary.transform(test_features['cleaned_text'].fillna('')).toarray()

In [94]:
# get test data service features
test_service_features = tfidf_service.transform(test_features['service1']).toarray()

In [95]:
# get test data diagnosis features
test_diagnosis_features = tfidf_diagnosis.transform(test_features['DIAGNOSIS']).toarray()

In [96]:
# ohe_cols = [
#     'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
#     'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DESCRIPTION'
# ]

test_ohe_features = {}

# get the test data one hot feature vectors

for each_col in ohe_cols:
  test_ohe_features[each_col] = ohe_objects[each_col].transform(test_features[each_col].values.reshape((-1, 1))).toarray()
  print(test_ohe_features[each_col].shape)

(10546, 4)
(10546, 9)
(10546, 16)
(10546, 5)
(10546, 69)
(10546, 19)
(10546, 7)
(10546, 41)
(10546, 2)


In [97]:
# combine the test data one hot feature representations
test_ohe_features_array = np.concatenate(list(test_ohe_features.values()), axis=1)

In [98]:
del test_ohe_features

In [99]:
# float_cols = [
#     'TIME_IN_EMERGENCY_WARD', 'in_failure', 'FOLLOW_UP', 'sentiment', 'subjectivty',
#     'HOSPITAL_EXPIRE_FLAG', 'DAYS_IN_HOSPITAL'
# ]

test_float_features = test_features[float_cols].values

In [100]:
# get the numerics test data and combine them (test_vectors_cleaned_text_cbow.pkl)

In [101]:
# import pickle

# load the test data feature vectors from its pickled file

with open('test_vectors_cleaned_text_cbow.pkl', 'rb') as fin:
  test_vectors = pickle.load(fin)

In [102]:
del test_features

In [103]:
# combine all the above test data feature vectors to prepare final test data features
all_test_features = np.concatenate(
    [
     test_summary_features,
     test_service_features,
    #  diagnosis_features,
     test_ohe_features_array,
     test_float_features,
     test_vectors
    ],
    axis=1
)

get predictions on test data

In [104]:
# get predictions on test data
# get the probability scores of the test data features
test_pred_values = xgb.predict_proba(all_test_features)

In [105]:
from sklearn.metrics import roc_auc_score

In [106]:
# calculate AUROC score for the test dataset
roc_auc_score(test_target, test_pred_values[:, 1])

0.7265264518695307

We obtain an AUROC score of **0.7265264518695307** using XGBoost classfier with given feature settings

In [107]:
# use diagnosis features for training on LR, Naive Bayes
all_train_features = np.concatenate(
    [
     summary_features,
     service_features,
     diagnosis_features,
     ohe_features_array,
     float_features,
     train_vectors
    ],
    axis=1
)

In [108]:
# train features with the diganosis features as well
all_train_features.shape

(42180, 5566)

In [109]:
from sklearn.linear_model import LogisticRegression

In [110]:
# get the Logistic Regression model
lr = LogisticRegression(random_state=42)

In [111]:
# train the Logistic Regression model
lr.fit(all_train_features, train_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [112]:
# prepare the test data feature including the diagnosis features as well
all_test_features = np.concatenate(
    [
     test_summary_features,
     test_service_features,
     test_diagnosis_features,
     test_ohe_features_array,
     test_float_features,
     test_vectors
    ],
    axis=1
)

In [113]:
test_service_features.shape

(10546, 287)

In [114]:
# do predictions on the Logistic Regression trained
lr_preds = lr.predict_proba(all_test_features)

In [115]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_target, lr_preds[:, 1])

0.7167985810075819

We obtain a score of **0.7167985810075819** using Logistic Regression method using the given features

In [116]:
# train a decision tree classifier using the above features only
from sklearn.tree import DecisionTreeClassifier

In [117]:
# train a DecisionTree Classifier
dtc = DecisionTreeClassifier(random_state=42)

In [118]:
dtc.fit(all_train_features, train_target)

DecisionTreeClassifier(random_state=42)

In [119]:
# get the probability of the test features on the decision tree classifier
dtc_preds = dtc.predict_proba(all_test_features)

In [120]:
# check AUROC score on DecisionTreeClassifier using above features
roc_auc_score(test_target, dtc_preds[:, 1])

0.5196631516568441

In [121]:
dtc.feature_importances_

array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
       0.00028606])

In [122]:
# AUROC score of the mean of the predictions from Decision Tree and Logistic Regression
roc_auc_score(test_target, np.mean([dtc_preds, lr_preds], axis=0)[:, 1])

0.7100267075804069

In [123]:
# train Bernoulli Naive Bayes model on the same features as of Lr, and Decision Tree
from sklearn.naive_bayes import BernoulliNB

In [124]:
bnb = BernoulliNB()

In [125]:
bnb.fit(all_train_features, train_target)

BernoulliNB()

In [126]:
# get the probability score of the Bernoulli Naive Bayes classifier
bnb_preds = bnb.predict_proba(all_test_features)

In [127]:
# check AUROC score on Bernoulli Naive Bayes using above features
roc_auc_score(test_target, bnb_preds[:, 1])

0.6423762643482214

In [128]:
roc_auc_score(test_target, np.mean([dtc_preds, lr_preds, bnb_preds], axis=0)[:, 1])

0.6948504700209439

Hence we conclude that XGBoost works the best among different classifier tried and different feature engineering including