In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


train_df = pd.read_csv('/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv')
test_df = pd.read_csv('/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv')

# drop features that have no use for the task
train_df = train_df.drop(['case_id'], axis=1)
train_df = train_df.drop(['Admission_Deposit'], axis=1)
test_df = test_df.drop(['case_id'], axis=1)
test_df = test_df.drop(['Admission_Deposit'], axis=1)

# remove null values
train_df = train_df.dropna()
test_df = test_df.dropna()

# folds in repeated patient ids instances  into N number of visits 
train_df['Visits'] = train_df.groupby(['patientid'])['patientid'].transform('count')
train_df = train_df.drop(['patientid'], axis=1) 
test_df['Visits'] = test_df.groupby(['patientid'])['patientid'].transform('count')
test_df.drop(['patientid'], axis=1, inplace = True)
test_df = test_df.reset_index(drop = True)


# create a larger dataset to work with
# train_df['Dataset_source'] = 'train'
# test_df['Dataset_source'] = 'test'
# combined_df = pd.concat([train_df, test_df])

In [2]:
def normalize_column(dataframe, column, threshold):

    normalized_values = dataframe[column].value_counts(normalize= True)
    values_below_threshold = normalized_values.where(normalized_values < threshold).dropna().index.values
    dataframe[column] = np.where(dataframe[column].isin(values_below_threshold), '0', dataframe[column])
    dataframe.reset_index(drop = True)
    

normalize_column(train_df, 'Hospital_code', 0.05)
normalize_column(train_df, 'City_Code_Patient', 0.05)

# prepare columns to ease the encoding of categorical values
target=['Hospital_code', 'City_Code_Hospital',
        'City_Code_Patient', 'Severity of Illness',
        'Hospital_type_code', 'Hospital_region_code',
        'Department', 'Type of Admission', 'Stay',
        'Ward_Facility_Code',  'Ward_Type', 'Age', 'Bed Grade'] 
train_df[target] = train_df[target].astype('category')

# Encoding

In [3]:
def integer_encode_columns(dataframe, column_list):
    encoder = LabelEncoder()
    df = dataframe.copy()
    for column in column_list:
        col_name = column + '_lbl'
        df[col_name] = encoder.fit_transform(df[column])
    
    df = df.drop(column_list, axis = 1)
    df = df.reset_index(drop = True)
    return df


encode_targets = ['Hospital_code','Hospital_type_code','City_Code_Hospital', 
                  'Hospital_region_code','Department','Ward_Type','Ward_Facility_Code',
                  'City_Code_Patient', 'City_Code_Hospital', 'Type of Admission', 
                  'Severity of Illness', 'Age', 'Stay', 'Bed Grade']
encoded_df = integer_encode_columns(train_df, encode_targets)

# note-toself: in future, use for-loop or something fancier
encoded_df.rename(columns = {'Hospital_code_lbl':'Hospital_code', 
                             'Hospital_type_code_lbl':'Hospital_type_code',
                             'City_Code_Hospital_lbl':'City_Code_Hospital',
                            'Hospital_region_code_lbl':'Hospital_region_code',
                            'Department_lbl': 'Department',
                             'Ward_Type_lbl': 'Ward_Type',
                            'Ward_Facility_Code_lbl': 'Ward_Facility_Code', 
                            'City_Code_Patient_lbl': 'City_Code_Patient',
                            'City_Code_Hospital_lbl': 'City_Code_Hospital',
                            'Type of Admission_lbl': 'Admission_Type',
                            'Severity of Illness_lbl': 'Illness_Severity',
                            'Age_lbl': 'Age', 'Stay_lbl': 'Stay', 
                            'Bed Grade_lbl': 'Bed_Grade'}, inplace = True)

In [4]:
def binary_encode_column(dataframe, column, inplace = True):
    if inplace:
        dataframe.loc[dataframe[column]!= 0] = 1
        dataframe = dataframe.reset_index(drop = True)
        return
    else:
        df = dataframe.copy()
        df.loc[df[column]!= 0]= 1
        df = df.reset_index(drop = True)
        return df
    
binary_encode_column(encoded_df, 'Stay') 

# Training

In [5]:
target = encoded_df.pop('Stay')

In [6]:
numeric_features = encoded_df.copy()
tf.convert_to_tensor(numeric_features)
# creates a normalization layer
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(numeric_features)
# normalize all rows
normalizer(numeric_features.iloc[:])


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

<tf.Tensor: shape=(313793, 15), dtype=float32, numpy=
array([[ 2.7054884 ,  1.8982663 ,  8.850588  , ..., -4.662241  ,
         4.135621  , -0.1507379 ],
       [-0.24824435, -0.24902238, -0.22416845, ..., -0.12707493,
        -0.23466481, -0.1507379 ],
       [-0.24824435, -0.24902238, -0.22416845, ..., -0.12707493,
        -0.23466481, -0.1507379 ],
       ...,
       [-0.24824435, -0.24902238, -0.22416845, ..., -0.12707493,
        -0.23466481, -0.1507379 ],
       [-0.24824435, -0.24902238, -0.22416845, ..., -0.12707493,
        -0.23466481, -0.1507379 ],
       [ 5.6592216 ,  1.8982663 , -0.22416845, ..., -0.12707493,
        -0.23466481, -0.1507379 ]], dtype=float32)>

# Model

In [7]:
def get_basic_model():
    model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='relu' )
  ])
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [8]:
model = get_basic_model()
model.summary()
model.fit(numeric_features, target, epochs=15, batch_size=500)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 15)                31        
_________________________________________________________________
dense (Dense)                (None, 128)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 130       
Total params: 26,977
Trainable params: 26,946
Non-trainable params: 31
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 

<keras.callbacks.History at 0x7f83b5e93250>

In [9]:
from keras.models import load_model
model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'

In [10]:
# training_set, validation_set, test_set = np.split(encoded_df.sample(frac=1), [int(0.7*len(encoded_df)), int(0.85*len(encoded_df))])
# print(f'Length of Training set is {len(training_set)}')
# print(f'Length of Validation set is {len(validation_set)}')
# print(f'Length of Test set is {len(test_set)}')
# print(f'Shape of Training set is {training_set.shape}')
# print(f'Shape of Validation set is {training_set.shape}')
# print(f'Shape of Test set is {training_set.shape}')

In [11]:
# alternative model to tackle this single-label, binary classification problem
# model = Sequential()
# model.add(Dense(128, input_dim = training_set.shape[1] , activation = 'relu'))
# model.add(Dense(128, activation = 'relu'))
# model.add(Dense(128, activation = 'relu'))
# model.add(Dense(128, activation = 'relu'))
# model.add(Dense(2, activation = 'softmax'))

# model.summary()
# model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# model.fit(training_set,validation_set,validation_split = 0.2, batch_size = 225, epochs = 5, shuffle = True, verbose = 2)