In [35]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [36]:
# Load datasets
train_dir = "train.csv"
test_dir = "test.csv"

# Read the data
train_data = pd.read_csv(train_dir)
test_data = pd.read_csv(test_dir)

In [37]:
train_data.drop(columns=[col for col in ['case_id', 'patientid'] if col in train_data.columns], inplace=True)

In [38]:
stay_mapping = {
    '0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41-50': 4,
    '51-60': 5, '61-70': 6, '71-80': 7, '81-90': 8, '91-100': 9,
    '100 Days': 9, 'More than 100 Days': 10
}
train_data['Stay'] = train_data['Stay'].map(stay_mapping)

In [39]:
# Fill bed grade and city code nulls with most common value (mode) for that feature
if 'Bed Grade' in train_data.columns and train_data['Bed Grade'].isnull().sum() > 0:
    train_data['Bed Grade'].fillna(train_data['Bed Grade'].mode()[0], inplace=True)
if 'City_Code_Patient' in train_data.columns and train_data['City_Code_Patient'].isnull().sum() > 0:
    train_data['City_Code_Patient'].fillna(train_data['City_Code_Patient'].mode()[0], inplace=True)

# Ensure Bed Grade and City Code are integers
if 'Bed Grade' in train_data.columns:
    train_data['Bed Grade'] = train_data['Bed Grade'].astype(int)
if 'City_Code_Patient' in train_data.columns:
    train_data['City_Code_Patient'] = train_data['City_Code_Patient'].astype(int)

In [40]:
# Convert categorical variables using the specified mappings
if 'Hospital_type_code' in train_data.columns:
    train_data['Hospital_type_code'] = train_data['Hospital_type_code'].map(lambda x: ord(x.lower()) - ord('a') + 1)
if 'Ward_Facility_Code' in train_data.columns:
    train_data['Ward_Facility_Code'] = train_data['Ward_Facility_Code'].map(lambda x: ord(x) - ord('A') + 1)
if 'Hospital_region_code' in train_data.columns:
    region_mapping = {'X': 1, 'Y': 2, 'Z': 3}
    train_data['Hospital_region_code'] = train_data['Hospital_region_code'].map(region_mapping)
if 'Ward_Type' in train_data.columns:
    train_data['Ward_Type'] = train_data['Ward_Type'].map(lambda x: ord(x) - ord('P') + 1)
if 'Type of Admission' in train_data.columns:
    admission_mapping = {'Trauma': 1, 'Urgent': 2, 'Emergency': 3}
    train_data['Type of Admission'] = train_data['Type of Admission'].map(admission_mapping)
if 'Severity of Illness' in train_data.columns:
    severity_mapping = {'Minor': 1, 'Moderate': 2, 'Extreme': 3}
    train_data['Severity of Illness'] = train_data['Severity of Illness'].map(severity_mapping)

In [41]:
label_encoders = {}

# Label Encode Age by decade
if 'Age' in train_data.columns:
    le = LabelEncoder()
    train_data['Age'] = le.fit_transform(train_data['Age'])
    label_encoders['Age'] = le

# Label Encode Department
if 'Department' in train_data.columns:
    le = LabelEncoder()
    train_data['Department'] = le.fit_transform(train_data['Department'])
    label_encoders['Department'] = le

In [42]:
#split train into two data sets, so that Stay is isolated
X = train_data.drop(columns=['Stay'])  # Creates a dataset without the 'Stay' column
y = train_data['Stay']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Define the number of input features and output classes
num_features = X_train.shape[1]  # Assuming X_train is already preprocessed
num_classes = y_train.nunique()  # Number of unique length-of-stay categories

In [43]:
print(f"Unique values in y_train: {np.unique(y_train)}")
print(f"Unique values in y_test: {np.unique(y_test)}")

Unique values in y_train: [ 0  1  2  3  4  5  6  7  8  9 10]
Unique values in y_test: [ 0  1  2  3  4  5  6  7  8  9 10]


In [44]:
# Build the model
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(num_features,)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),

    layers.Dense(num_classes, activation='softmax')  # Softmax for multi-class classification
])

# Compile the model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # Use 'categorical_crossentropy' if one-hot encoded
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train, y_train, 
    validation_data=(X_train, y_train), 
    epochs=50, 
    batch_size=32, 
    class_weight=None,  # Adjust if imbalance exists
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7961/7961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4ms/step - accuracy: 0.2536 - loss: 2.0176 - val_accuracy: 0.2748 - val_loss: 1.8990
Epoch 2/50
[1m7961/7961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 6ms/step - accuracy: 0.2723 - loss: 1.9049 - val_accuracy: 0.2748 - val_loss: 1.9001
Epoch 3/50
[1m7961/7961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 5ms/step - accuracy: 0.2746 - loss: 1.9018 - val_accuracy: 0.2748 - val_loss: 1.8990
Epoch 4/50
[1m7961/7961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 5ms/step - accuracy: 0.2750 - loss: 1.8989 - val_accuracy: 0.2748 - val_loss: 1.8985
Epoch 5/50
[1m7961/7961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 6ms/step - accuracy: 0.2737 - loss: 1.9027 - val_accuracy: 0.2454 - val_loss: 1.9002
Epoch 6/50
[1m7961/7961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 7ms/step - accuracy: 0.2738 - loss: 1.8976 - val_accuracy: 0.2748 - val_loss: 1.8985
Epoch 7/50
[1m7961/7