In [None]:
import sys
import os

# Make the parent of the current working directory importable so the `src`
# package imported below can be resolved. The membership check keeps the cell
# idempotent: re-running it in a live kernel no longer stacks duplicate entries
# onto sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from src.data import download_data, build_preprocessor, DiabetesDataset

# Download & Load Data

In [2]:
# Fetch the raw dataset through the project helper imported from src.data.
# The recorded cell output shows the helper skipping the download when the
# dataset already exists.
# NOTE(review): presumably the files land under ../data/diabetes/, which the
# next cell reads from — confirm against the helper's implementation.
download_data()

Dataset already exists, skipping download.


In [3]:
# Load the diabetes CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
diabetes_csv_path = "../data/diabetes/diabetic_data.csv"
diabetes_data = pd.read_csv(filepath_or_buffer=diabetes_csv_path)

# Preprocess Data

In [4]:
# Build the feature preprocessor via the project factory from src.data.
# It is fitted in a later cell through `fit_transform`.
# NOTE(review): the concrete transformer type and its steps are defined in
# src.data and are not visible from this notebook.
preprocessor = build_preprocessor()

In [5]:
# Separate the feature columns from the "readmitted" target column.
X = diabetes_data.drop(columns=["readmitted"])
y = diabetes_data["readmitted"]

# NOTE(review): the preprocessor is fitted on every row, before the train/test
# split that happens in a later cell. Any statistics it learns (scaling,
# imputation values, category vocabularies, ...) would therefore also be
# derived from rows that end up in the test set. Consider fitting on the
# training rows only — confirm against what build_preprocessor() returns.
X_preprocessed = preprocessor.fit_transform(X)

# One-hot encode the target as a dense array. The fitted encoder is kept in
# `target_encoder` instead of being discarded, so `target_encoder.categories_[0]`
# tells which label each column of `y_preprocessed` stands for and
# `target_encoder.inverse_transform` can map encoded rows back to labels.
target_encoder = OneHotEncoder(sparse_output=False)
y_preprocessed = target_encoder.fit_transform(y.values.reshape(-1, 1))

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42}
split_parts = train_test_split(X_preprocessed, y_preprocessed, **split_options)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Seed torch's global RNG before anything below is built; the loaders are not
# given a generator of their own.
torch.manual_seed(42)
batch_size = 32

# Wrap each split in the project's Dataset class.
train_dataset, test_dataset = (
    DiabetesDataset(X_train, y_train),
    DiabetesDataset(X_test, y_test),
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)