In [73]:
import os
import pandas as pd
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn, optim
from google.colab import drive
from tqdm import tqdm

In [74]:
# Mount Google Drive at /content/drive; the data path used later lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device is: {device}\n")

Device is: cuda



In [76]:
# Directory on the mounted Drive from which the pickled embedding file is read.
data_path = '/content/drive/MyDrive/biomedicine/embedded_data/'

# Hyperparameters and Control Parameters

In [77]:
# Control parameters

# When True, the classifier weights are restored from Drive and training is skipped.
LOAD_FROM_DRIVE = False
# When True, the classifier's state_dict is written to Drive.
SAVE_TO_DRIVE = False

In [78]:
# Hyperparameters

# Number of samples per batch for both the training and the test DataLoader.
BATCH_SIZE = 128
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.0001
# Number of full passes over the training data.
NUM_EPOCHS = 10

# Custom Dataset & Data Preparation

In [79]:
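# Disabled alternative mapping, kept for reference: it covers only the two
# disease severities and has no entry for 'control'.
# label_map = {
#     'mild/moderate': 0,
#     'severe/critical': 1,
# }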

In [80]:
# Maps the 'severity' string of a sample to its numeric training target.
# Both disease severities share target 1, so the task is binary:
# control (0) versus any disease severity (1).
label_map = {
    'control': 0,
    'mild/moderate': 1,
    'severe/critical': 1,
}

In [81]:
class EmbeddedDataset(Dataset):
    """Dataset of embedding vectors with one severity label per row.

    The pickled object is used as a DataFrame: the first column is read as the
    embedding vector and the ``severity`` column as the class name. Severity
    names are translated to numeric targets through the module-level
    ``label_map``.
    """

    def __init__(self, pkl_file):
        """Load the pickled DataFrame and print a short summary of it.

        Args:
            pkl_file: Path to the pickle file to load.
        """
        from collections import Counter

        # NOTE(review): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        self.data = pd.read_pickle(pkl_file)
        unique_labels = pd.unique(self.data['severity'])
        # Counter keeps first-seen order, so the printed dict matches a
        # manual counting loop over the column.
        label_counts = dict(Counter(self.data['severity']))
        print(f"unique labels: {unique_labels}")
        print(f"label counts: {label_counts}")
        print(self.data.head())

    def __len__(self):
        """Return the number of rows in the loaded DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embeddings, label)`` for row ``idx`` as float32 tensors.

        Raises:
            KeyError: If the row's severity has no entry in ``label_map``.
        """
        # Read the label by name, matching how __init__ accesses the column.
        severity = self.data['severity'].iloc[idx]
        if severity not in label_map:
            # Fail with a clear message instead of passing None to torch.tensor.
            raise KeyError(
                f"Unknown severity label {severity!r} at index {idx}; "
                f"known labels: {list(label_map)}"
            )
        embeddings = torch.tensor(self.data.iloc[idx, 0], dtype=torch.float32)
        label = torch.tensor(label_map[severity], dtype=torch.float32)
        return embeddings, label

In [82]:
# Load the embedded samples; the constructor also prints a label summary.
dataset = EmbeddedDataset(os.path.join(data_path, 'embedded_data_split0_5.pkl'))

unique labels: ['mild/moderate' 'severe/critical' 'control']
label counts: {'mild/moderate': 128980, 'severe/critical': 83187, 'control': 80373}
                                          embeddings       severity
0  [0.026112404, -0.021238996, 0.0003707884, -0.0...  mild/moderate
1  [0.03304511, -0.013378126, 0.008212402, -0.005...  mild/moderate
2  [0.020730188, -0.019310804, 0.00035625693, -0....  mild/moderate
3  [0.028404342, -0.018271472, 0.006720709, 0.000...  mild/moderate
4  [0.02587356, -0.014935524, -0.012807032, -0.00...  mild/moderate


In [83]:
# 80/20 train/test split. The seeded generator makes the split identical on
# every run, so a model restored from Drive is evaluated on the same held-out
# samples rather than on a fresh split that overlaps its training data.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [84]:
# Training batches are reshuffled every epoch; test batches keep a fixed order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [85]:
# Sanity check: print the Python types found in the first training batch.
for (embeddings, labels) in train_loader:
    print(type(embeddings[0]))
    print(type(labels[0]))
    print(type(embeddings[0][0]))
    print(type(labels[0].item()))
    # Only the first batch is inspected.
    break

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'float'>


# Model Architecture


In [86]:
class Classifier(torch.nn.Module):
    """Fully connected binary classifier over 512-dimensional inputs.

    Six linear layers with LeakyReLU(0.2) activations end in a sigmoid, so
    the network emits one value in [0, 1] per sample.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=512, out_features=1024),
            nn.LeakyReLU(0.2),
            nn.Linear(in_features=1024, out_features=2048),
            nn.LeakyReLU(0.2),
            nn.Linear(in_features=2048, out_features=2048),
            nn.LeakyReLU(0.2),
            nn.Linear(in_features=2048, out_features=1024),
            nn.LeakyReLU(0.2),
            nn.Linear(in_features=1024, out_features=1024),
            nn.LeakyReLU(0.2),
            nn.Linear(in_features=1024, out_features=1),
            # NOTE(review): an activation between the last Linear layer and the
            # Sigmoid is unusual. It is kept so that previously saved weights
            # keep producing the same outputs; consider dropping it on retrain.
            nn.LeakyReLU(0.2),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return one probability per input row.

        Args:
            x: Tensor whose last dimension has 512 features.

        Returns:
            The network output with the trailing size-1 dimension removed.
        """
        # Squeeze only the last dimension: a bare squeeze() would also drop a
        # batch dimension of size 1 and return a 0-d tensor.
        return self.layers(x).squeeze(-1)

In [87]:
# training loop

def train(model, train_loader, num_epocs, optimizer, criterion):
    """Train ``model`` for ``num_epocs`` passes over ``train_loader``.

    Each batch is moved to the module-level ``device``, run through the model,
    scored with ``criterion`` and used for one optimizer step. The progress
    bar shows the mean loss of the batches seen so far in the current epoch.

    Args:
        model: Network to optimise; moved to ``device`` and set to train mode.
        train_loader: Iterable of ``(embeddings, label)`` batches.
        num_epocs: Number of passes over ``train_loader``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss callable invoked as ``criterion(output, label)``.
    """
    for epoch_idx in range(num_epocs):
        model.to(device).train()
        loss_sum = 0.0
        progress = tqdm(train_loader, desc=f"Epoch {epoch_idx+1}/{num_epocs}")
        for batch_no, (embeddings, label) in enumerate(progress, start=1):
            embeddings = embeddings.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            loss = criterion(model(embeddings), label)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            # Mean loss over the batches processed so far in this epoch.
            progress.set_postfix({"loss": loss_sum / batch_no})

In [88]:
# Google Drive locations used for saving and restoring the trained model.
# The path has to be absolute and go through the '/content/drive' mount point;
# a relative 'content/MyDrive/...' path would resolve against the working
# directory of the runtime and never reach Drive.
directory_path = '/content/drive/MyDrive/biomedicine/models/classifier'

binary_classifier_model_path = os.path.join(directory_path, 'binary_classifier.pth')

In [89]:
# custom weights initialization
def weights_init(m):
    """Re-initialise the weight matrix of an ``nn.Linear`` module in place.

    Meant to be used as ``model.apply(weights_init)``. Weights are drawn from
    a normal distribution with mean 0 and standard deviation
    ``1 / weight.shape[1]``. Biases and all other module types are left
    untouched.

    Args:
        m: Any ``nn.Module``; only ``nn.Linear`` instances are modified.
    """
    # A type check is safer than matching 'Linear' in the class name, which
    # also hits unrelated modules that may not have a ``weight`` attribute.
    if isinstance(m, nn.Linear):
        fan_in = m.weight.shape[1]
        nn.init.normal_(m.weight, mean=0.0, std=1 / fan_in)

In [90]:
# Initialize the model

classifier_model = Classifier().to(device)

if LOAD_FROM_DRIVE:
    try:
        # map_location lets a checkpoint that was saved on a GPU be restored
        # on a CPU-only runtime (and vice versa).
        classifier_model.load_state_dict(
            torch.load(
                binary_classifier_model_path,
                map_location=device,
                weights_only=True,
            )
        )
        print("Model loaded from drive")
    except Exception as e:
        print(f"Error loading model from drive: {e}")
        # Training is skipped while LOAD_FROM_DRIVE is set, so make it obvious
        # that everything downstream runs on an untrained network.
        print("Warning: continuing with randomly initialised weights")
# else:
#     classifier_model.apply(weights_init)

# set up loss function and optimizer

# Binary cross-entropy on the sigmoid outputs of the classifier.
criterion = nn.BCELoss()
optimizer = optim.Adam(classifier_model.parameters(), lr=LEARNING_RATE)

# Training

In [91]:
# start training loop
# Training only runs when the weights are not meant to come from Drive.
if not LOAD_FROM_DRIVE:
    train(
        model=classifier_model,
        train_loader=train_loader,
        num_epocs=NUM_EPOCHS,
        optimizer=optimizer,
        criterion=criterion
    )

Epoch 1/10: 100%|██████████| 1829/1829 [00:32<00:00, 55.68it/s, loss=0.274]
Epoch 2/10: 100%|██████████| 1829/1829 [00:31<00:00, 57.20it/s, loss=0.178]
Epoch 3/10: 100%|██████████| 1829/1829 [00:32<00:00, 56.05it/s, loss=0.15]
Epoch 4/10: 100%|██████████| 1829/1829 [00:31<00:00, 57.35it/s, loss=0.133]
Epoch 5/10: 100%|██████████| 1829/1829 [00:32<00:00, 56.13it/s, loss=0.122]
Epoch 6/10: 100%|██████████| 1829/1829 [00:32<00:00, 56.86it/s, loss=0.115]
Epoch 7/10: 100%|██████████| 1829/1829 [00:32<00:00, 56.47it/s, loss=0.108]
Epoch 8/10: 100%|██████████| 1829/1829 [00:31<00:00, 57.19it/s, loss=0.103]
Epoch 9/10: 100%|██████████| 1829/1829 [00:32<00:00, 56.85it/s, loss=0.0983]
Epoch 10/10: 100%|██████████| 1829/1829 [00:32<00:00, 57.16it/s, loss=0.0958]


In [92]:
# save to drive

if SAVE_TO_DRIVE:
    # exist_ok=True makes this a no-op when the directory is already present.
    os.makedirs(directory_path, exist_ok=True)

    # Only the parameters (state_dict) are stored, not the pickled module.
    torch.save(classifier_model.state_dict(), binary_classifier_model_path)
    print("Model saved to drive")

# Evaluation

In [99]:
def evaluate(model, test_loader):
    """Print and return the classification accuracy of ``model``.

    A sample is predicted as class 1 unless its model output is strictly
    below 0.5, and the prediction is compared with the 0/1 label.

    Args:
        model: Module that maps a batch of embeddings to one probability per
            sample.
        test_loader: Iterable of ``(embeddings, label)`` batches.

    Returns:
        The accuracy in percent as a float.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    # Run the batches on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    correct = 0
    total = 0
    with torch.no_grad():
        for embeddings, label in test_loader:
            if target_device is not None:
                embeddings = embeddings.to(target_device)
                label = label.to(target_device)
            # Flatten both sides so a batch of one is handled like any other.
            probabilities = model(embeddings).reshape(-1)
            label = label.reshape(-1)
            # Vectorised form of "0 if p < 0.5 else 1".
            predictions = (~(probabilities < 0.5)).to(label.dtype)
            correct += int((predictions == label).sum().item())
            total += label.numel()

    if total == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined")

    accuracy = correct * 100 / total
    print(f"Accuracy: {accuracy:.2f} %")
    return accuracy

In [100]:
# Report the accuracy of the classifier on the held-out test split.
evaluate(classifier_model, test_loader)

Accuracy: 95.95 %
