In [None]:
import sys
import os

sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

from src.data import download_data, build_preprocessor, DiabetesDataset
from src.models import MLPModel, LinearRegressionModel, RandomForestModel
from src.train import train_torch_model

# Download & Load Data

In [None]:
# Fetch the dataset through the project helper from src.data.
# NOTE(review): presumably this populates ../data/diabetes/, which the next
# cell reads from -- confirm against src.data.download_data.
download_data()

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's
# working directory.
raw_csv_path = "../data/diabetes/diabetic_data.csv"
diabetes_data = pd.read_csv(raw_csv_path)

# Preprocess Data

In [None]:
# Columns excluded from modelling.
# NOTE(review): encounter_id / patient_nbr look like row identifiers and the
# other three are presumably too sparse to be useful -- confirm.
columns_to_drop = [
    "weight",
    "payer_code",
    "medical_specialty",
    "encounter_id",
    "patient_nbr",
]

# Rebind the name instead of mutating in place, and tolerate columns that are
# already gone, so re-running this cell on the same kernel does not raise
# KeyError.
diabetes_data = diabetes_data.drop(
    columns=columns_to_drop, errors="ignore"
).drop_duplicates()

# Binary target: 1 when readmitted == "<30", 0 for every other value.
# The guard skips the mapping once the column is already numeric; applying it
# twice would compare integers with "<30" and collapse every label to 0.
if not pd.api.types.is_numeric_dtype(diabetes_data["readmitted"]):
    diabetes_data = diabetes_data.assign(
        readmitted=diabetes_data["readmitted"].eq("<30").astype("int64")
    )

In [None]:
# Separate the binary target from the feature columns.
target_column = "readmitted"
y = diabetes_data[target_column]
X = diabetes_data.drop(target_column, axis=1)


In [None]:
# Build the project's feature preprocessor and fit it on the full feature table.
# NOTE(review): the fit happens before the train/test split in the next cell,
# so anything the preprocessor learns from the data also sees the test rows.
# Confirm this is acceptable, or fit on the training split only.
preprocessor = build_preprocessor()
X_processed = preprocessor.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
split_parts = train_test_split(X_processed, y, test_size=0.2, random_state=21)
X_train, X_test, y_train, y_test = split_parts

# Convert the label Series to (n_samples, 1) column vectors.
y_train = y_train.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

In [None]:
# Seed PyTorch's global RNG before any loader is created or iterated.
torch.manual_seed(21)
batch_size = 32

# Wrap both splits in the project's Dataset class.
train_dataset, test_dataset = (
    DiabetesDataset(X_train, y_train),
    DiabetesDataset(X_test, y_test),
)

# Only the training loader shuffles; the test loader keeps row order so the
# predictions line up with y_test.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Train models

## MLP Model

In [None]:
# ---------------- MLP hyperparameters ----------------
mlp_input_dim = X_train.shape[1]  # number of columns in the training matrix
mlp_hidden_dims = [256, 128, 32]
mlp_output_dim = 1

mlp_num_epochs = 30
mlp_learning_rate = 1e-3

# Device preference: MPS when available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# ------------------------------------------------------

In [None]:
# Instantiate the MLP from the hyperparameters above and print its repr.
mlp_architecture = {
    "input_dim": mlp_input_dim,
    "hidden_dims": mlp_hidden_dims,
    "output_dim": mlp_output_dim,
}
mlp_model = MLPModel(**mlp_architecture)
print(mlp_model)

In [None]:
# Inverse-frequency class weights, normalised to sum to 1, so the rarer class
# gets the larger weight.
class_counts = np.bincount(y_train.ravel())
inverse_frequency = 1.0 / class_counts
weights = inverse_frequency / inverse_frequency.sum()

# float32 copy of the weights on the training device.
weights_tensor = torch.tensor(weights, dtype=torch.float32)
weights_tensor = weights_tensor.to(device)

In [None]:
def mlp_loss(outputs, targets):
    """Class-weighted binary cross-entropy for the MLP.

    ``torch.nn.BCELoss(weight=...)`` interprets ``weight`` as a rescaling
    factor per batch element that must broadcast against the input; it is not
    a per-class weight. The 2-element class-weight vector therefore cannot be
    handed to it directly. Here each sample is weighted by the entry of
    ``weights_tensor`` selected by its own 0/1 label.

    NOTE(review): assumes ``train_torch_model`` invokes the criterion as
    ``criterion(outputs, targets)`` -- confirm against src.train.

    Args:
        outputs: Predicted probabilities in [0, 1], same shape as ``targets``.
        targets: Ground-truth labels with values 0 or 1.

    Returns:
        Scalar tensor holding the mean weighted binary cross-entropy.
    """
    # Index the per-class weights with the labels -> same shape as targets.
    sample_weights = weights_tensor.to(targets.device)[targets.long()]
    return torch.nn.functional.binary_cross_entropy(
        outputs,
        targets.to(outputs.dtype),
        weight=sample_weights,
    )


mlp_optimizer = torch.optim.Adam(mlp_model.parameters(), lr=mlp_learning_rate)

In [None]:
# Train the MLP with the project's training helper.
mlp_training_args = dict(
    model=mlp_model,
    train_loader=train_loader,
    criterion=mlp_loss,
    optimizer=mlp_optimizer,
    num_epochs=mlp_num_epochs,
    device=device,
)
train_torch_model(**mlp_training_args)

In [None]:
# Collect hard 0/1 predictions for every row of the test set.
mlp_predictions = []
mlp_model.eval()  # switch the model to evaluation mode
with torch.no_grad():  # no gradients are needed for inference
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        outputs = mlp_model(inputs)
        # The model has a single output unit (mlp_output_dim = 1), so an
        # argmax over dim 1 (`torch.max(outputs, 1)`) always yields index 0
        # and every row would be predicted as class 0. Threshold the score
        # instead.
        # NOTE(review): assumes the model emits probabilities in [0, 1], as
        # BCELoss requires -- confirm against src.models.MLPModel.
        predicted = (outputs >= 0.5).long().reshape(-1)
        mlp_predictions.extend(predicted.cpu().numpy())

In [None]:
# Preview only the first few predictions; echoing the full list floods the
# cell output with one entry per test row.
mlp_predictions[:20]

## Linear Regression model

In [None]:
# ---------------- Linear model hyperparameters ----------------
lr_input_dim = X_train.shape[1]  # number of columns in the training matrix
lr_output_dim = 1

lr_num_epochs = 30
lr_learning_rate = 1e-3

# Start from CPU and upgrade; MPS is checked last, so it wins when available.
device_name = "cpu"
if torch.cuda.is_available():
    device_name = "cuda"
if torch.backends.mps.is_available():
    device_name = "mps"
device = torch.device(device_name)
# ---------------------------------------------------------------


In [None]:
# Instantiate the single-layer model from the hyperparameters above and print
# its repr.
lr_model = LinearRegressionModel(input_dim=lr_input_dim, output_dim=lr_output_dim)
print(lr_model)

In [None]:
# Recompute the inverse-frequency class weights (normalised to sum to 1) so
# this section does not depend on the MLP section having been run.
class_counts = np.bincount(y_train.reshape(-1))
weights = np.divide(1.0, class_counts)
weights = weights / weights.sum()
weights_tensor = torch.as_tensor(weights, dtype=torch.float32).to(device)

In [None]:
def lr_loss(outputs, targets):
    """Class-weighted binary cross-entropy for the linear model.

    ``torch.nn.BCELoss(weight=...)`` expects a rescaling weight per batch
    element that broadcasts against the input, not one weight per class, so
    the 2-element class-weight vector cannot be passed to it as-is. Each
    sample is instead weighted by the ``weights_tensor`` entry matching its
    own 0/1 label.

    NOTE(review): assumes ``train_torch_model`` invokes the criterion as
    ``criterion(outputs, targets)`` -- confirm against src.train.

    Args:
        outputs: Predicted probabilities in [0, 1], same shape as ``targets``.
        targets: Ground-truth labels with values 0 or 1.

    Returns:
        Scalar tensor holding the mean weighted binary cross-entropy.
    """
    # Index the per-class weights with the labels -> same shape as targets.
    sample_weights = weights_tensor.to(targets.device)[targets.long()]
    return torch.nn.functional.binary_cross_entropy(
        outputs,
        targets.to(outputs.dtype),
        weight=sample_weights,
    )


lr_optimizer = torch.optim.Adam(lr_model.parameters(), lr=lr_learning_rate)


In [None]:
# Train the linear model with the same helper used for the MLP.
lr_training_args = {
    "model": lr_model,
    "train_loader": train_loader,
    "criterion": lr_loss,
    "optimizer": lr_optimizer,
    "num_epochs": lr_num_epochs,
    "device": device,
}
train_torch_model(**lr_training_args)


In [None]:
# Collect hard 0/1 predictions for every row of the test set.
lr_predictions = []
lr_model.eval()  # switch the model to evaluation mode
with torch.no_grad():  # no gradients are needed for inference
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        outputs = lr_model(inputs)
        # With a single output unit (lr_output_dim = 1), an argmax over dim 1
        # (`torch.max(outputs, 1)`) is always index 0, so every row would be
        # predicted as class 0. Threshold the score instead.
        # NOTE(review): assumes the model emits probabilities in [0, 1], as
        # BCELoss requires -- confirm against src.models.LinearRegressionModel.
        predicted = (outputs >= 0.5).long().reshape(-1)
        lr_predictions.extend(predicted.cpu().numpy())


In [None]:
# Preview only the first few predictions; echoing the full list floods the
# cell output with one entry per test row.
lr_predictions[:20]

## Random Forest model

In [None]:
# ---------------- Random forest hyperparameters ----------------
# Both values are forwarded to RandomForestModel in the next cell.
rf_max_depth = None
rf_n_estimators = 100
# ----------------------------------------------------------------

In [None]:
# Build the random forest from the hyperparameters above.
rf_model = RandomForestModel(
    n_estimators=rf_n_estimators,
    max_depth=rf_max_depth,
)
# y_train was reshaped to an (n, 1) column vector for the torch datasets.
# Pass 1-D labels to the forest instead: scikit-learn estimators expect shape
# (n_samples,) and emit a DataConversionWarning for column vectors.
# NOTE(review): assumes RandomForestModel delegates to a scikit-learn forest
# -- confirm against src.models.
rf_model.fit(X_train, y_train.ravel())

In [None]:
# Predict labels for the held-out rows with the fitted forest.
rf_predictions = rf_model.predict(X_test)

# Evaluate

In [None]:
# Per-class precision/recall/F1 and the confusion matrix for the MLP.
print("MLP Model Evaluation:")
mlp_report = classification_report(y_test, mlp_predictions)
print("Classification Report:\n", mlp_report)
mlp_confusion = confusion_matrix(y_test, mlp_predictions)
print("Confusion Matrix:\n", mlp_confusion)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix for the linear model.
print("Linear Regression Model Evaluation:")
lr_report = classification_report(y_test, lr_predictions)
print("Classification Report:\n", lr_report)
lr_confusion = confusion_matrix(y_test, lr_predictions)
print("Confusion Matrix:\n", lr_confusion)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix for the random forest.
print("Random Forest Model Evaluation:")
rf_report = classification_report(y_test, rf_predictions)
print("Classification Report:\n", rf_report)
rf_confusion = confusion_matrix(y_test, rf_predictions)
print("Confusion Matrix:\n", rf_confusion)