In [1]:
# load cleaned data file (adm_ds2.csv) which is obtained after preprocessing

In [2]:
import pandas as pd
import numpy as np

import os, sys, math, csv, datetime, pickle, json, time
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load the libraries used for Machine Learning Analysis and training
import gc
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# load csv into a dataframe
df = pd.read_csv('adm_ds2.csv')

In [8]:
# Split the loaded frame into model inputs (every column except TARGET) and the label.
# drop(columns=...) raises a KeyError if TARGET is absent, and bracket access avoids
# attribute-style lookup, which can be shadowed by DataFrame attributes.
feature_df = df.drop(columns=['TARGET'])
target_df = df['TARGET']

In [9]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
# NOTE(review): the pickled text vectors loaded further down presumably correspond to
# exactly this split (same seed and test size) - confirm before changing either value.
train_features, test_features, train_target, test_target = train_test_split(
    feature_df,
    target_df,
    test_size=0.2,
    random_state=42,
)

In [10]:
del feature_df, target_df, df

In [11]:
gc.collect()

91

In [12]:
# use 5000 max tfidf features for the summary text
tfidf_summary = TfidfVectorizer(max_features=5000)

In [13]:
# use all the service tfidf feature
tfidf_service = TfidfVectorizer()

In [14]:
# get tfidf features of processed cleaned summary and service

In [15]:
# Fit the summary TF-IDF vocabulary on the training text (missing text becomes '')
# and convert the sparse result to a dense array.
summary_features = (
    tfidf_summary
    .fit_transform(train_features['cleaned_text1'].fillna(''))
    .toarray()
)

In [16]:
# Fit the service TF-IDF vocabulary on the training split and convert to a dense array.
# Missing values are replaced with '' (as is done for the summary text) because
# TfidfVectorizer rejects NaN documents.
service_features = tfidf_service.fit_transform(train_features['service1'].fillna('')).toarray()

In [17]:
import pickle

In [19]:
# load train vectors for summary features (train_vectors_cleaned_text1_cbow.pkl)

In [21]:
# Load the precomputed training vectors for the summary text from the pickled file.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by
# this project.
with open('train_vectors_cleaned_text1_cbow.pkl', 'rb') as fin:
  train_feature_vectors = pickle.load(fin)

# These vectors are later concatenated column-wise with the other training features,
# so there must be exactly one row per training sample.
# NOTE(review): this checks the row count only; row order is presumably that of the
# train/test split above - confirm against whatever produced the pickle.
if len(train_feature_vectors) != len(train_features):
  raise ValueError(
      'train_vectors_cleaned_text1_cbow.pkl has %d rows but the training split has %d'
      % (len(train_feature_vectors), len(train_features))
  )

In [22]:
# Categorical columns that get a one-hot representation.
ohe_cols = [
    'ADMISSION_TYPE',
    'ADMISSION_LOCATION',
    'DISCHARGE_LOCATION',
    'INSURANCE',
    'LANGUAGE',
    'RELIGION',
    'MARITAL_STATUS',
    'ETHNICITY',
    'DESCRIPTION',
]

# One encoder per column, kept so the test split can be transformed with the categories
# learned here. With handle_unknown='ignore', scikit-learn encodes categories unseen
# during fit as all-zero rows instead of raising.
ohe_objects = {col: OneHotEncoder(handle_unknown='ignore') for col in ohe_cols}

# Dense one-hot matrix per column for the training split.
ohe_features = {}

for col_name, encoder in ohe_objects.items():
  # The encoder expects a 2-D input, hence the reshape to a single-column matrix.
  ohe_features[col_name] = encoder.fit_transform(
      train_features[col_name].values.reshape((-1, 1))
  ).toarray()
  print(ohe_features[col_name].shape)

(42180, 4)
(42180, 9)
(42180, 16)
(42180, 5)
(42180, 69)
(42180, 19)
(42180, 7)
(42180, 41)
(42180, 2)


In [23]:
# Stack the per-column one-hot matrices side by side, in `ohe_cols` order, into a single
# array that is later merged with the remaining feature groups.
ohe_features_array = np.concatenate([ohe_features[col] for col in ohe_cols], axis=1)

In [24]:
del ohe_features

In [25]:
# Columns used as-is (no encoding) as numeric features.
# NOTE(review): 'subjectivty' presumably matches the column header in the CSV, so the
# spelling must not be corrected here.
float_cols = [
    'TIME_IN_EMERGENCY_WARD',
    'in_failure',
    'FOLLOW_UP',
    'sentiment',
    'subjectivty',
    'HOSPITAL_EXPIRE_FLAG',
    'DAYS_IN_HOSPITAL',
]

# Pull those columns out of the training split as a plain array.
float_features = train_features.loc[:, float_cols].values

In [26]:
del train_features

In [27]:
# Build the full training matrix by placing every feature group side by side
# (axis=1 keeps one row per training sample). The test matrix must be assembled
# with the groups in this same order.
all_train_features = np.concatenate(
    [
        summary_features,       # TF-IDF of the cleaned summary text
        service_features,       # TF-IDF of the service field
        ohe_features_array,     # one-hot encoded categorical columns
        float_features,         # numeric columns
        train_feature_vectors,  # vectors loaded from the pickled file
    ],
    axis=1,
)

# np.concatenate copies its inputs, so the individual groups are now redundant.
# Drop them so their memory can be reclaimed before the model is trained.
del summary_features, service_features, ohe_features_array, float_features, train_feature_vectors

In [28]:
all_train_features.shape

(42180, 5566)

Create an XGBoost classifier and train it on the combined feature matrix built above.

In [29]:
# create the xgboost classifier for training
xgb = XGBClassifier()

In [30]:
# Fit the XGBoost classifier on the combined training matrix and its labels.
xgb.fit(X=all_train_features, y=train_target)

XGBClassifier()

In [31]:
del all_train_features, train_target

In [32]:
gc.collect()

40

In [33]:
# transform test data to correct features - test data feature engineering

In [34]:
# Summary-text features for the test split, using the vocabulary fitted on the training
# split (transform only, no refit). Missing text becomes ''.
test_summary_features = (
    tfidf_summary
    .transform(test_features['cleaned_text1'].fillna(''))
    .toarray()
)

In [35]:
del tfidf_summary

In [36]:
# Service features for the test split, using the vocabulary fitted on the training split
# (transform only, no refit). Missing values are replaced with '' (as is done for the
# summary text) because TfidfVectorizer rejects NaN documents.
test_service_features = tfidf_service.transform(test_features['service1'].fillna('')).toarray()

In [37]:
del tfidf_service

In [38]:
# One-hot features for the test split. This reuses `ohe_cols` and the encoders in
# `ohe_objects` that were fitted on the training split, so the column layout matches
# the training one-hot features.
test_ohe_features = {}

for col_name in ohe_cols:
  fitted_encoder = ohe_objects[col_name]
  # transform only: the categories were learned from the training data.
  test_ohe_features[col_name] = fitted_encoder.transform(
      test_features[col_name].values.reshape((-1, 1))
  ).toarray()
  print(test_ohe_features[col_name].shape)

(10546, 4)
(10546, 9)
(10546, 16)
(10546, 5)
(10546, 69)
(10546, 19)
(10546, 7)
(10546, 41)
(10546, 2)


In [39]:
# Stack the per-column test one-hot matrices side by side, in `ohe_cols` order.
test_ohe_features_array = np.concatenate(
    [test_ohe_features[col] for col in ohe_cols],
    axis=1,
)

In [40]:
del test_ohe_features, ohe_objects

In [41]:
gc.collect()

180

In [42]:
# Numeric test features: the `float_cols` columns defined for the training split,
# taken from the test split in the same order.
test_float_features = test_features.loc[:, float_cols].values

In [43]:
# load the test data feature vectors (test_vectors_cleaned_text1_cbow.pkl)

In [44]:
# Load the precomputed test vectors for the summary text from the pickled file.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by
# this project.
with open('test_vectors_cleaned_text1_cbow.pkl', 'rb') as fin:
  test_feature_vectors = pickle.load(fin)

# These vectors are later concatenated column-wise with the other test features,
# so there must be exactly one row per test sample.
# NOTE(review): this checks the row count only; row order is presumably that of the
# train/test split above - confirm against whatever produced the pickle.
if len(test_feature_vectors) != len(test_features):
  raise ValueError(
      'test_vectors_cleaned_text1_cbow.pkl has %d rows but the test split has %d'
      % (len(test_feature_vectors), len(test_features))
  )

In [45]:
del test_features

In [46]:
# Build the full test matrix with the feature groups in the same order as the training
# matrix, so every column means the same thing to the trained model.
all_test_features = np.concatenate(
    [
        test_summary_features,    # TF-IDF of the cleaned summary text
        test_service_features,    # TF-IDF of the service field
        test_ohe_features_array,  # one-hot encoded categorical columns
        test_float_features,      # numeric columns
        test_feature_vectors,     # vectors loaded from the pickled file
    ],
    axis=1,
)

# np.concatenate copies its inputs, so the individual groups are now redundant.
# Drop them so their memory can be reclaimed before prediction.
del test_summary_features, test_service_features, test_ohe_features_array, test_float_features, test_feature_vectors

In [47]:
# get predictions on test data

In [48]:
# get the probability scores of the test data features
test_pred_values = xgb.predict_proba(all_test_features)

In [49]:
from sklearn.metrics import roc_auc_score

In [50]:
# calculate AUROC score for the test dataset

In [51]:
# AUROC on the held-out test set, scored on column 1 of the predict_proba output.
# NOTE(review): column 1 is presumably the positive class - confirm via xgb.classes_.
roc_auc_score(y_true=test_target, y_score=test_pred_values[:, 1])

0.7310730318380335

Thus we obtain an AUROC score of **0.7310730318380335**, which is 0.02 lower than the state-of-the-art score.
