# Package Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_selector

from sklearn.metrics import classification_report,roc_auc_score,ConfusionMatrixDisplay,f1_score,accuracy_score
from sklearn.exceptions import NotFittedError

import tensorflow as tf

sns.set_style('darkgrid')

%matplotlib inline

# Data Import

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nehaprabhavalkar/av-healthcare-analytics-ii")

print("Path to dataset files:", path)

input_file=path + '/healthcare/train_data_dictionary.csv'
df_des=pd.read_csv(input_file)
df_des

Path to dataset files: /Users/markmerkouchev/.cache/kagglehub/datasets/nehaprabhavalkar/av-healthcare-analytics-ii/versions/1


Unnamed: 0,Column,Description
0,case_id,Case_ID registered in Hospital
1,Hospital_code,Unique code for the Hospital
2,Hospital_type_code,Unique code for the type of Hospital
3,City_Code_Hospital,City Code of the Hospital
4,Hospital_region_code,Region Code of the Hospital
5,Available Extra Rooms in Hospital,Number of Extra rooms available in the Hospital
6,Department,Department overlooking the case
7,Ward_Type,Code for the Ward type
8,Ward_Facility_Code,Code for the Ward Facility
9,Bed Grade,Condition of Bed in the Ward


In [4]:
input_file= path + '/healthcare/train_data.csv'
df=pd.read_csv(input_file)
df.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [5]:
target='Stay'
target

'Stay'

In [6]:
col_drop_list=[]
col_drop_list.append('patientid')

# EDA (Describe,Info)

In [7]:
df.describe()

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Admission_Deposit
count,318438.0,318438.0,318438.0,318438.0,318325.0,318438.0,313906.0,318438.0,318438.0
mean,159219.5,18.318841,4.771717,3.197627,2.625807,65747.579472,7.251859,3.284099,4880.749392
std,91925.276847,8.633755,3.102535,1.168171,0.873146,37979.93644,4.745266,1.764061,1086.776254
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1800.0
25%,79610.25,11.0,2.0,2.0,2.0,32847.0,4.0,2.0,4186.0
50%,159219.5,19.0,5.0,3.0,3.0,65724.5,8.0,3.0,4741.0
75%,238828.75,26.0,7.0,4.0,3.0,98470.0,8.0,4.0,5409.0
max,318438.0,32.0,13.0,24.0,4.0,131624.0,38.0,32.0,11008.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3139

# Null Check

In [9]:
plt.figure(figsize=(15,6))
sns.heatmap(df.isna(),yticklabels=False)
plt.tight_layout()

In [10]:
df.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [11]:
df=df.dropna()

In [12]:
df.isna().sum()

case_id                              0
Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313793 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            313793 non-null  int64  
 1   Hospital_code                      313793 non-null  int64  
 2   Hospital_type_code                 313793 non-null  object 
 3   City_Code_Hospital                 313793 non-null  int64  
 4   Hospital_region_code               313793 non-null  object 
 5   Available Extra Rooms in Hospital  313793 non-null  int64  
 6   Department                         313793 non-null  object 
 7   Ward_Type                          313793 non-null  object 
 8   Ward_Facility_Code                 313793 non-null  object 
 9   Bed Grade                          313793 non-null  float64
 10  patientid                          313793 non-null  int64  
 11  City_Code_Patient                  313793 no

# Duplicate Check

In [14]:
# df=df.drop_duplicates()

In [15]:
# df.info()

# Feature Investigation

In [16]:
df.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313793 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            313793 non-null  int64  
 1   Hospital_code                      313793 non-null  int64  
 2   Hospital_type_code                 313793 non-null  object 
 3   City_Code_Hospital                 313793 non-null  int64  
 4   Hospital_region_code               313793 non-null  object 
 5   Available Extra Rooms in Hospital  313793 non-null  int64  
 6   Department                         313793 non-null  object 
 7   Ward_Type                          313793 non-null  object 
 8   Ward_Facility_Code                 313793 non-null  object 
 9   Bed Grade                          313793 non-null  float64
 10  patientid                          313793 non-null  int64  
 11  City_Code_Patient                  313793 no

In [18]:
for col in df.columns:
    print("-"*80)
    print(df[col].value_counts())
print("-"*80)

--------------------------------------------------------------------------------
case_id
1         1
211830    1
211837    1
211836    1
211835    1
         ..
105690    1
105689    1
105688    1
105687    1
318438    1
Name: count, Length: 313793, dtype: int64
--------------------------------------------------------------------------------
Hospital_code
26    32681
23    26112
19    21035
6     20016
11    17107
28    16947
14    16917
27    14109
9     11381
12    11149
29    11092
32    10561
25     9642
10     9271
15     9134
21     8047
24     7835
3      7030
17     5373
1      5188
13     5178
5      5143
2      5050
30     4917
22     4237
31     3932
16     3631
8      3611
18     3592
20     1390
7      1269
4      1216
Name: count, dtype: int64
--------------------------------------------------------------------------------
Hospital_type_code
a    141071
b     68125
c     45352
e     24286
d     20161
f     10561
g      4237
Name: count, dtype: int64
----------------------

**Target unique values**

In [19]:
df[target].nunique()

11

In [20]:
col_drop_list.append('case_id')
col_drop_list

['patientid', 'case_id']

# Col drop

In [21]:
df=df.drop(col_drop_list,axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313793 entries, 0 to 318437
Data columns (total 16 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Hospital_code                      313793 non-null  int64  
 1   Hospital_type_code                 313793 non-null  object 
 2   City_Code_Hospital                 313793 non-null  int64  
 3   Hospital_region_code               313793 non-null  object 
 4   Available Extra Rooms in Hospital  313793 non-null  int64  
 5   Department                         313793 non-null  object 
 6   Ward_Type                          313793 non-null  object 
 7   Ward_Facility_Code                 313793 non-null  object 
 8   Bed Grade                          313793 non-null  float64
 9   City_Code_Patient                  313793 non-null  float64
 10  Type of Admission                  313793 non-null  object 
 11  Severity of Illness                313793 no

# EDA Corr

In [22]:
#plt.figure(figsize=(20,8))
#sns.heatmap(df.corr(),cmap='cividis',annot=True)
#plt.tight_layout()

# Feature Transformation & Segregation

In [23]:
def feature_type_transform(df,col_list,col_type):
    for col in col_list:
        df[col]=df[col].astype(col_type)
    return df

In [24]:
def col_unique_val_check(df):
    for i,col in enumerate(df.columns):
        print(f"{col:40} ----> {df[col].nunique():10} unique values   with dtype {str(df[col].dtype):10} at index {i}")  

In [25]:
def num_col_unique_val_check(df,target):
    for i,col in enumerate(df.select_dtypes(exclude='object')):
        if(col!=target):
            print(f"{col:40} ----> {df[col].nunique():10} unique values   with dtype {str(df[col].dtype):10}")

**Check Unique val**

In [26]:
col_unique_val_check(df)

Hospital_code                            ---->         32 unique values   with dtype int64      at index 0
Hospital_type_code                       ---->          7 unique values   with dtype object     at index 1
City_Code_Hospital                       ---->         11 unique values   with dtype int64      at index 2
Hospital_region_code                     ---->          3 unique values   with dtype object     at index 3
Available Extra Rooms in Hospital        ---->         18 unique values   with dtype int64      at index 4
Department                               ---->          5 unique values   with dtype object     at index 5
Ward_Type                                ---->          6 unique values   with dtype object     at index 6
Ward_Facility_Code                       ---->          6 unique values   with dtype object     at index 7
Bed Grade                                ---->          4 unique values   with dtype float64    at index 8
City_Code_Patient                    

In [27]:
num_col_unique_val_check(df,target)

Hospital_code                            ---->         32 unique values   with dtype int64     
City_Code_Hospital                       ---->         11 unique values   with dtype int64     
Available Extra Rooms in Hospital        ---->         18 unique values   with dtype int64     
Bed Grade                                ---->          4 unique values   with dtype float64   
City_Code_Patient                        ---->         37 unique values   with dtype float64   
Visitors with Patient                    ---->         28 unique values   with dtype int64     
Admission_Deposit                        ---->       7283 unique values   with dtype float64   


## **Cat col creation**

In [28]:
cat_col=[]
for col in df.select_dtypes(include='object'):
    if(df[col].nunique()<30 and col!=target):
        print(f"{col:30} ----> {df[col].nunique():10} unique values")
        cat_col.append(col)

Hospital_type_code             ---->          7 unique values
Hospital_region_code           ---->          3 unique values
Department                     ---->          5 unique values
Ward_Type                      ---->          6 unique values
Ward_Facility_Code             ---->          6 unique values
Type of Admission              ---->          3 unique values
Severity of Illness            ---->          3 unique values
Age                            ---->         10 unique values


## **Num cat col creation**

In [29]:
num_cat_col=[]
for col in df.select_dtypes(exclude='object'):
    if(df[col].nunique()<40 and col!=target):
        print(f"{col:40} ----> {df[col].nunique():10} unique values")
        num_cat_col.append(col)

Hospital_code                            ---->         32 unique values
City_Code_Hospital                       ---->         11 unique values
Available Extra Rooms in Hospital        ---->         18 unique values
Bed Grade                                ---->          4 unique values
City_Code_Patient                        ---->         37 unique values
Visitors with Patient                    ---->         28 unique values


## **Num col creation**

In [30]:
num_col=['Admission_Deposit']
print(f"{num_col[0]:40} ----> {df[num_col[0]].nunique():10} unique values")

Admission_Deposit                        ---->       7283 unique values


**Feature type transform**

In [31]:
df=feature_type_transform(df,num_cat_col,'int64')

In [32]:
num_col_unique_val_check(df,target)

Hospital_code                            ---->         32 unique values   with dtype int64     
City_Code_Hospital                       ---->         11 unique values   with dtype int64     
Available Extra Rooms in Hospital        ---->         18 unique values   with dtype int64     
Bed Grade                                ---->          4 unique values   with dtype int64     
City_Code_Patient                        ---->         37 unique values   with dtype int64     
Visitors with Patient                    ---->         28 unique values   with dtype int64     
Admission_Deposit                        ---->       7283 unique values   with dtype float64   


In [33]:
print(df.columns.values)

['Hospital_code' 'Hospital_type_code' 'City_Code_Hospital'
 'Hospital_region_code' 'Available Extra Rooms in Hospital' 'Department'
 'Ward_Type' 'Ward_Facility_Code' 'Bed Grade' 'City_Code_Patient'
 'Type of Admission' 'Severity of Illness' 'Visitors with Patient' 'Age'
 'Admission_Deposit' 'Stay']


In [34]:
print(cat_col,"\n",num_cat_col,"\n",num_col)

['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age'] 
 ['Hospital_code', 'City_Code_Hospital', 'Available Extra Rooms in Hospital', 'Bed Grade', 'City_Code_Patient', 'Visitors with Patient'] 
 ['Admission_Deposit']


In [35]:
len(df.columns)

16

In [36]:
len(cat_col),len(num_cat_col),len(num_col),[len(cat_col)+len(num_cat_col)+len(num_col)]

(8, 6, 1, [15])

# EDA

In [37]:
plt.figure(figsize=(12,4))
sns.countplot(x=target,data=df,palette='tab10')
plt.tight_layout()

In [38]:
plt.figure(figsize=(20,18))
j=1
for i,columns in enumerate(num_cat_col):
    plt.subplot(3,2,j)
    sns.countplot(x=columns,data=df,palette='deep')
    j+=1
plt.tight_layout()

In [39]:
plt.figure(figsize=(20,18))
j=1
for i,columns in enumerate(cat_col):
    plt.subplot(4,2,j)
    sns.countplot(x=columns,data=df,palette='deep')
    j+=1
plt.tight_layout()

In [40]:
plt.figure(figsize=(18,18))
j=1
for i,columns in enumerate(['Bed Grade','Severity of Illness','Type of Admission','Department','Hospital_region_code','Age']):
    plt.subplot(3,2,j)
    sns.countplot(x=columns,data=df,palette='deep',hue=target)
    j+=1
plt.tight_layout()

In [41]:
plt.figure(figsize=(12,4))
sns.histplot(x=num_col[0],hue=target,data=df,palette='tab10')
plt.tight_layout()

# Data preprocess pipeline

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313793 entries, 0 to 318437
Data columns (total 16 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Hospital_code                      313793 non-null  int64  
 1   Hospital_type_code                 313793 non-null  object 
 2   City_Code_Hospital                 313793 non-null  int64  
 3   Hospital_region_code               313793 non-null  object 
 4   Available Extra Rooms in Hospital  313793 non-null  int64  
 5   Department                         313793 non-null  object 
 6   Ward_Type                          313793 non-null  object 
 7   Ward_Facility_Code                 313793 non-null  object 
 8   Bed Grade                          313793 non-null  int64  
 9   City_Code_Patient                  313793 non-null  int64  
 10  Type of Admission                  313793 non-null  object 
 11  Severity of Illness                313793 no

In [43]:
y_oh=OneHotEncoder()
l_eh=LabelEncoder()

oh=OneHotEncoder(handle_unknown='ignore')
ms,ss=MinMaxScaler(),StandardScaler()

ct=ColumnTransformer([
                        ('cat_encoder',oh,cat_col),
                        ('num_encoder',ms,num_col) ],
                        remainder='passthrough',n_jobs=-1)

ct

# Separate X and Y

In [44]:
X=df.drop([target],axis=1)
y=df[[target]]
X.head(2)

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,8,c,3,Z,3,radiotherapy,R,F,2,7,Emergency,Extreme,2,51-60,4911.0
1,2,c,5,Z,2,radiotherapy,S,F,2,7,Trauma,Extreme,2,51-60,5954.0


In [45]:
y.head(2)

Unnamed: 0,Stay
0,0-10
1,41-50


# Train Test Split

In [46]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,stratify=y,random_state=42)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.1,stratify=y_train,random_state=42)

In [47]:
X_train.shape,y_train.shape

((254171, 15), (254171, 1))

In [48]:
X_test.shape,y_test.shape

((31380, 15), (31380, 1))

In [49]:
X_test.head(2)

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
28209,29,a,4,X,2,gynecology,S,F,1,9,Trauma,Extreme,4,71-80,5201.0
250796,29,a,4,X,2,anesthesia,R,F,2,25,Emergency,Moderate,2,61-70,5655.0


In [50]:
y_test.head(2)

Unnamed: 0,Stay
28209,71-80
250796,11-20


# Data Preprocess

## **X related processing**

In [51]:
X_train=ct.fit_transform(X_train)
X_val=ct.fit_transform(X_val)
X_test=ct.transform(X_test)

In [52]:
print("-"*50)
for i in ct.transformers_:
    print(i)
    if(i[0]!='remainder'):
        print(i[1].get_feature_names_out())   
    print("-"*50)

--------------------------------------------------
('cat_encoder', OneHotEncoder(handle_unknown='ignore'), ['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age'])
['Hospital_type_code_a' 'Hospital_type_code_b' 'Hospital_type_code_c'
 'Hospital_type_code_d' 'Hospital_type_code_e' 'Hospital_type_code_f'
 'Hospital_type_code_g' 'Hospital_region_code_X' 'Hospital_region_code_Y'
 'Hospital_region_code_Z' 'Department_TB & Chest disease'
 'Department_anesthesia' 'Department_gynecology' 'Department_radiotherapy'
 'Department_surgery' 'Ward_Type_P' 'Ward_Type_Q' 'Ward_Type_R'
 'Ward_Type_S' 'Ward_Type_T' 'Ward_Type_U' 'Ward_Facility_Code_A'
 'Ward_Facility_Code_B' 'Ward_Facility_Code_C' 'Ward_Facility_Code_D'
 'Ward_Facility_Code_E' 'Ward_Facility_Code_F'
 'Type of Admission_Emergency' 'Type of Admission_Trauma'
 'Type of Admission_Urgent' 'Severity of Illness_Extreme'
 'Severity of Illness_Minor' 'Sever

In [53]:
ct

## **Y related processing**

In [54]:
y_train.value_counts()

Stay              
21-30                 69915
11-20                 62446
31-40                 43993
51-60                 27915
0-10                  18833
41-50                  9348
71-80                  8177
More than 100 Days     5304
81-90                  3857
91-100                 2198
61-70                  2185
Name: count, dtype: int64

In [55]:
y_oh.fit(y_train)
output_shape=len(y_oh.get_feature_names_out())

y_train_copy=l_eh.fit_transform(y_train)
y_val_copy=l_eh.transform(y_val)
y_test_copy=l_eh.transform(y_test)

y_train_tf_copy = tf.keras.utils.to_categorical(y_train_copy, num_classes=output_shape)
y_val_tf_copy = tf.keras.utils.to_categorical(y_val_copy, num_classes=output_shape)
y_test_tf_copy = tf.keras.utils.to_categorical(y_test_copy, num_classes=output_shape)

In [56]:
dict(enumerate(l_eh.classes_))

{0: '0-10',
 1: '11-20',
 2: '21-30',
 3: '31-40',
 4: '41-50',
 5: '51-60',
 6: '61-70',
 7: '71-80',
 8: '81-90',
 9: '91-100',
 10: 'More than 100 Days'}

In [57]:
X_train.shape,y_train.shape

((254171, 50), (254171, 1))

In [58]:
X_test.shape,y_test.shape

((31380, 50), (31380, 1))

In [59]:
tup1=None;tup2=None
try:
    tup1=(X_test[0].toarray(),y_test_tf_copy[0]) 
    print("Sparse Matrix to Dense Array")
except:
    tup2=(X_test[0],y_test_tf_copy[0]) 
    print("Normal Matrix to Dense Array")

tup1 if(tup1) else tup2

Normal Matrix to Dense Array


(array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.38016622, 29.        ,
         4.        ,  2.        ,  1.        ,  9.        ,  4.        ]),
 array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]))

# Tensorflow batch settings

In [60]:
batch_size=1024

In [61]:
# def convert_sparse_matrix_to_sparse_tensor(X):
#     coo = X.tocoo()
#     indices = np.mat([coo.row, coo.col]).transpose()
#     return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

# X_train=convert_sparse_matrix_to_sparse_tensor(X_train)
# X_val=convert_sparse_matrix_to_sparse_tensor(X_val)
# X_test=convert_sparse_matrix_to_sparse_tensor(X_test)

In [62]:
data_tf_tr=tf.data.Dataset.from_tensor_slices((X_train, y_train_tf_copy))
data_train_batches = data_tf_tr.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

data_tf_val=tf.data.Dataset.from_tensor_slices((X_val, y_val_tf_copy))
data_val_batches = data_tf_val.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

data_tf_te=tf.data.Dataset.from_tensor_slices((X_test, y_test_tf_copy))
data_test_x_y_batches = data_tf_te.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

In [63]:
#Only for X_test
data_tf_tre=tf.data.Dataset.from_tensor_slices(X_test)
data_test_x_batches = data_tf_tre.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

# Model fitting

## Neural Network Settings

In [64]:
n,m=5,4

In [65]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Input,PReLU,LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

l_relu=LeakyReLU()
para_relu = PReLU()
e=EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True,verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.001)

In [66]:
y_train_tf_copy.shape

(254171, 11)

In [67]:
input_shape_val=X_train.shape[1]
X_train.shape,input_shape_val

((254171, 50), 50)

In [None]:
model=Sequential()

#Input Layer
model.add(Input(shape=(input_shape_val,)))

#Hidden Layer
model.add(Dense((m*n)*input_shape_val , activation=para_relu ))
model.add(Dense((m*n)*input_shape_val , activation=para_relu ))
model.add(Dense((m*n)*input_shape_val , activation='relu'))
model.add(Dense((m*n)*input_shape_val , activation=para_relu ))
model.add(Dense((m*n)*input_shape_val , activation=para_relu ))

#Output Layer
model.add(Dense(output_shape,activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics = [ 'accuracy',
                         tf.keras.metrics.AUC(name='AUC_ROC',curve='ROC',num_thresholds=10000) ,
                         tf.metrics.F1Score(name='F1_Score',average='macro',threshold=0.5)
                        ]
             )

In [69]:
model.summary()

In [70]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png',show_shapes=True,show_dtype=True,show_layer_activations=True)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [71]:
%%time
history=model.fit(data_train_batches,epochs=200,callbacks=[e,reduce_lr],
                  validation_data=data_val_batches,verbose=1)

Epoch 1/200
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - AUC_ROC: 0.7884 - F1_Score: 6.2934e-05 - accuracy: 0.2850 - loss: 1.9431 - val_AUC_ROC: 0.8303 - val_F1_Score: 3.4468e-04 - val_accuracy: 0.2976 - val_loss: 1.7852 - learning_rate: 0.0100
Epoch 2/200
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - AUC_ROC: 0.8325 - F1_Score: 0.0041 - accuracy: 0.3294 - loss: 1.7735 - val_AUC_ROC: 0.8443 - val_F1_Score: 0.0140 - val_accuracy: 0.3417 - val_loss: 1.7318 - learning_rate: 0.0100
Epoch 3/200
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - AUC_ROC: 0.8467 - F1_Score: 0.0186 - accuracy: 0.3550 - loss: 1.7055 - val_AUC_ROC: 0.8442 - val_F1_Score: 0.0227 - val_accuracy: 0.3382 - val_loss: 1.7222 - learning_rate: 0.0100
Epoch 4/200
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - AUC_ROC: 0.8516 - F1_Score: 0.0247 - accuracy: 0.3626 - loss: 1.6807 - val_AUC_ROC: 0.84

# Testing

In [72]:
model.evaluate(data_test_x_y_batches)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - AUC_ROC: 0.8779 - F1_Score: 0.0722 - accuracy: 0.4136 - loss: 1.5389


[1.5343323945999146,
 0.41459527611732483,
 0.8787515163421631,
 0.07257645577192307]

In [73]:
score_dict={};j=0

In [74]:
def tester_func(model_list,X_test,y_test):
    for i,clf in enumerate(model_list):
       
        pred_proba=None;pred=None
        clf_name = clf.__class__.__name__

        try:
            pred_proba=clf.predict(X_test)
            pred=np.argmax(clf.predict(X_test), axis=-1)
        except NotFittedError as e:
            raise(e)

        print("-"*80)
        print(f"Predicting {clf_name} done")
        
        if(str(df[target].dtype)!='object'):
            print("Non object mode testing .........")
            score_dict[clf_name]={
                'roc_auc_score':roc_auc_score(y_test.values,pred_proba,multi_class='ovr'),
                'f1_score':f1_score(y_test.values,pred,average='macro'),
                'accuracy_score':accuracy_score(y_test.values,pred),
                'model_index':int(i)
            }
            
        else:
            print("Object mode testing .........")
            score_dict[clf_name]={
                'roc_auc_score':roc_auc_score(y_test,pred_proba,multi_class='ovr'),
                'f1_score':f1_score(y_test,pred,average='macro'),
                'accuracy_score':accuracy_score(y_test,pred),
                'model_index':int(i)
            }
    
        j=i
    
    print("-"*80)
    return j

In [75]:
indexer=tester_func([model],X_test,y_test_copy)

[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
--------------------------------------------------------------------------------
Predicting Sequential done
Object mode testing .........
--------------------------------------------------------------------------------


In [76]:
score_pd=pd.DataFrame(score_dict).transpose().sort_values('f1_score',ascending=False)
score_pd

Unnamed: 0,roc_auc_score,f1_score,accuracy_score,model_index
Sequential,0.794667,0.214576,0.414595,0.0


# Confusion matrix for Top model

In [77]:
prediction=np.argmax(model.predict(X_test), axis=-1)
prediction=l_eh.inverse_transform(prediction)

[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [78]:
print(classification_report( y_test.values , prediction ))

                    precision    recall  f1-score   support

              0-10       0.38      0.06      0.10      2325
             11-20       0.40      0.56      0.47      7710
             21-30       0.42      0.60      0.49      8632
             31-40       0.44      0.21      0.29      5431
             41-50       0.00      0.00      0.00      1154
             51-60       0.40      0.57      0.47      3446
             61-70       0.00      0.00      0.00       270
             71-80       0.50      0.00      0.00      1010
             81-90       0.27      0.05      0.09       476
            91-100       0.00      0.00      0.00       271
More than 100 Days       0.49      0.42      0.46       655

          accuracy                           0.41     31380
         macro avg       0.30      0.22      0.21     31380
      weighted avg       0.39      0.41      0.37     31380



In [79]:
sns.set_style('white')
ConfusionMatrixDisplay.from_predictions( y_test.values , prediction )
plt.tight_layout()

# Submission

In [81]:
test_df=pd.read_csv(path +'/healthcare/test_data.csv')
id_col=test_df['case_id']
test_df.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0


In [None]:
test_df.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
patientid                               0
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64

In [82]:
test_df.drop(col_drop_list,axis=1,inplace=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137057 entries, 0 to 137056
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Hospital_code                      137057 non-null  int64  
 1   Hospital_type_code                 137057 non-null  object 
 2   City_Code_Hospital                 137057 non-null  int64  
 3   Hospital_region_code               137057 non-null  object 
 4   Available Extra Rooms in Hospital  137057 non-null  int64  
 5   Department                         137057 non-null  object 
 6   Ward_Type                          137057 non-null  object 
 7   Ward_Facility_Code                 137057 non-null  object 
 8   Bed Grade                          137022 non-null  float64
 9   City_Code_Patient                  134900 non-null  float64
 10  Type of Admission                  137057 non-null  object 
 11  Severity of Illness                1370

In [83]:
X_test_df=test_df.copy()
X_test_df=ct.transform(X_test_df)

In [84]:
pred=np.argmax(model.predict(X_test_df), axis=-1)
pred=l_eh.inverse_transform(pred)

[1m4284/4284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step


In [86]:
sub_demo=pd.read_csv(path +'/healthcare/sample_sub.csv')
sub_demo.head()

Unnamed: 0,case_id,Stay
0,318439,0-10
1,318440,0-10
2,318441,0-10
3,318442,0-10
4,318443,0-10


In [87]:
sub=pd.DataFrame(data={sub_demo.columns.values[0]:id_col,sub_demo.columns.values[1]:pred.ravel()})
sub

Unnamed: 0,case_id,Stay
0,318439,0-10
1,318440,51-60
2,318441,21-30
3,318442,21-30
4,318443,51-60
...,...,...
137052,455491,11-20
137053,455492,11-20
137054,455493,11-20
137055,455494,11-20


In [88]:
sub.to_csv('submission.csv',index=False)

In [89]:
model.save('model.h5')



In [None]:
%reset -f

# Please UPVOTE if you like the notebook.