In [1]:
import sys
import os
import joblib

sys.path.append(os.path.abspath(".."))

import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from src.data import download_data, build_preprocessor, DiabetesDataset
from src.models import MLPModel, LinearRegressionModel, RandomForestModel
from src.train import train_torch_model

# Download & Load Data

In [2]:
# Make sure the raw dataset is available locally before it is read below.
# The recorded output shows the helper skips the download when the data already exists.
download_data()

Dataset already exists, skipping download.


In [3]:
# Load the raw CSV into a DataFrame. The path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
diabetes_data = pd.read_csv("../data/diabetes/diabetic_data.csv")

# Preprocess Data

In [4]:
# Separate the prediction target from the feature columns.
target_column = "readmitted"

y = diabetes_data[target_column]
X = diabetes_data.drop(columns=[target_column])


In [5]:
# Fit the feature preprocessor once and cache it on disk; later runs reuse the
# cached, already-fitted object so the feature encoding stays stable.
preprocessor_path = "../preprocessor/preprocessor.joblib"

if os.path.exists(preprocessor_path):
    # joblib files are pickles: only load a cache file that this project wrote.
    preprocessor = joblib.load(preprocessor_path)
    X_processed = preprocessor.transform(X)

    print("Loaded existing preprocessor")
else:
    preprocessor = build_preprocessor()
    # NOTE(review): the preprocessor is fitted on all of X, i.e. before the
    # train/test split performed later in the notebook. If it learns statistics
    # or categories from the data, the held-out rows influence it -- confirm
    # this is intended.
    X_processed = preprocessor.fit_transform(X)

    # joblib.dump does not create missing parent directories, so create the
    # cache folder first; otherwise the first run on a fresh checkout fails.
    os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
    joblib.dump(preprocessor, preprocessor_path)

    print("Created & saved new preprocessor")

# One-hot encode the target column. The fitted encoder is kept so that the
# column order of y_processed (label_encoder.categories_) can be looked up and
# model outputs can be mapped back to the original labels.
label_encoder = OneHotEncoder(sparse_output=False)
y_processed = label_encoder.fit_transform(y.values.reshape(-1, 1))

Loaded existing preprocessor


In [6]:
# Inspect the one-hot encoded targets: one row per sample, one column per class.
y_processed

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], shape=(101766, 3))

In [7]:
# 80/20 hold-out split with a fixed seed so the partition is repeatable.
split_parts = train_test_split(
    X_processed,
    y_processed,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Seed torch's global RNG before any dataset or loader is created.
torch.manual_seed(42)
batch_size = 32

# Training data: reshuffled every epoch.
train_dataset = DiabetesDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Held-out data: kept in its original order.
test_dataset = DiabetesDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Train Models

## MLP Model

In [9]:
######### HYPERPARAMETERS #########
# Layer sizes: the input and output widths follow the preprocessed arrays.
mlp_input_dim = X_train.shape[1]
mlp_output_dim = y_train.shape[1]
mlp_hidden_dims = [512, 128, 32]

# Optimisation settings.
mlp_num_epochs = 5
mlp_learning_rate = 1e-3

# Pick the compute device: MPS takes priority over CUDA, CPU is the fallback.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
###################################

In [10]:
# Gather the architecture settings once, then build and display the network.
mlp_config = {
    "input_dim": mlp_input_dim,
    "hidden_dims": mlp_hidden_dims,
    "output_dim": mlp_output_dim,
}
mlp_model = MLPModel(**mlp_config)
print(mlp_model)

MLPModel(
  (network): Sequential(
    (0): Linear(in_features=2446, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=3, bias=True)
  )
)


In [11]:
# Adam updates every parameter of the MLP at the configured learning rate.
mlp_optimizer = torch.optim.Adam(params=mlp_model.parameters(), lr=mlp_learning_rate)

# Cross-entropy criterion; the targets prepared earlier are one-hot rows.
mlp_loss = torch.nn.CrossEntropyLoss()

In [12]:
# Train the MLP on the training loader for the configured number of epochs,
# using the loss, optimizer and device set up in the cells above.
# NOTE(review): in the recorded run the loss drops from ~2034 in epoch 1 to
# ~0.945 and then stops improving -- worth checking the input feature scale
# and learning rate, and evaluating on test_loader before trusting the model.
train_torch_model(
    model=mlp_model,
    train_loader=train_loader,
    criterion=mlp_loss,
    optimizer=mlp_optimizer,
    num_epochs=mlp_num_epochs,
    device=device,
)

Epoch 1/5, Loss: 2034.4400
Epoch 2/5, Loss: 3.6412
Epoch 3/5, Loss: 0.9687
Epoch 4/5, Loss: 0.9450
Epoch 5/5, Loss: 0.9451
