In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from train_test_split import site_train_val_test_split

sns.set_palette("colorblind")

# Read the cleaned feature table and the outcome table from disk.
df = pd.read_csv('../data/merged_data_cleaned.csv', low_memory=False)

# Lower-case every target column name, then rename "csfractures" to "csi".
target = (
    pd.read_csv('../data/target.csv')
    .rename(columns=str.lower)
    .rename(columns={"csfractures": "csi"})
)

print(df.shape, target.shape)



(3314, 620) (3314, 2)


In [2]:
# Keep only the non-object columns, replace every -1 with 0, and drop the
# sectiongcsavailable column.
df = (
    df.select_dtypes(exclude=['object'])
    .replace(-1, 0)
    .drop(columns='sectiongcsavailable')
)

# Re-order the target rows so they follow df's studysubjectid order.
target = (
    target.set_index('studysubjectid')
    .loc[df['studysubjectid']]
    .reset_index()
)

# df_feat is built BEFORE ageinyears is removed from df, so it still carries
# that column; only the site/case/subject identifier columns are dropped from it.
df_feat = df.drop(columns=["site", "caseid", "studysubjectid"])
df = df.drop(columns=["ageinyears"])

In [3]:
# Fill every missing value with 0, then print the columns that still contain
# NaNs (an empty Series means none are left).
df = df.fillna(0)
nan_counts = df.isna().sum()
print(nan_counts[nan_counts.gt(0)])

Series([], dtype: int64)


In [4]:
# Partition features and targets into train / validation / test pieces using the
# project helper. random_state=42 is passed, presumably so the split is
# repeatable; judging by its name the helper splits by site — TODO confirm
# against train_test_split.py.
train_df, val_df, test_df, train_target, val_target, test_target = site_train_val_test_split(df, target, random_state=42)

In [20]:
import torch
from sklearn.metrics import accuracy_score, f1_score

class MLP(torch.nn.Module):
    """Feed-forward network: Linear -> Dropout -> ReLU stages, then a sigmoid output layer.

    Args:
        input_size: number of features in each input row.
        units: width of each hidden layer, in order.
        dropout_list: dropout probability per hidden layer, indexed in step
            with ``units``.
        output_size: width of the final linear layer.
    """

    def __init__(self, input_size, units, dropout_list, output_size):
        super().__init__()
        self.units = units
        in_features = input_size
        for idx, width in enumerate(units):
            # Stages are registered under the names fc<i>, dropout<i>, relu<i>.
            self.add_module(f'fc{idx}', torch.nn.Linear(in_features, width))
            self.add_module(f'dropout{idx}', torch.nn.Dropout(p=dropout_list[idx]))
            self.add_module(f'relu{idx}', torch.nn.ReLU())
            # The next stage consumes this stage's output width.
            in_features = width
        self.sigmoid = torch.nn.Sigmoid()
        self.fc_out = torch.nn.Linear(in_features, output_size)

    def forward(self, x):
        """Run ``x`` through every hidden stage and return the sigmoid of the output layer."""
        hidden = x
        for idx, _ in enumerate(self.units):
            for prefix in ('fc', 'dropout', 'relu'):
                hidden = getattr(self, f'{prefix}{idx}')(hidden)
        return self.sigmoid(self.fc_out(hidden))


# Seed torch's global RNG before anything stochastic is created, so that weight
# initialisation, dropout masks and the shuffled training batches are repeatable
# when the notebook is re-run from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Mini-batch size shared by the train / validation / test loaders.
BATCH_SIZE = 64

# Three hidden layers (32, 32, 8 units) with light dropout and one output unit.
model = MLP(input_size=train_df.shape[1], units=[32, 32, 8], dropout_list=[0.01, 0.01, 0.01], output_size=1)
# BCELoss expects probabilities; MLP.forward already ends in a sigmoid.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.01)

# Features become float32 tensors; targets are reshaped to a single column so
# they line up with the model's (batch, 1) output.
X_train_tensor = torch.tensor(train_df, dtype=torch.float32)
y_train_tensor = torch.tensor(train_target, dtype=torch.float32).view(-1, 1)

X_val_tensor = torch.tensor(val_df, dtype=torch.float32)
y_val_tensor = torch.tensor(val_target, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(test_df, dtype=torch.float32)
y_test_tensor = torch.tensor(test_target, dtype=torch.float32).view(-1, 1)

# Only the training loader shuffles; validation and test keep their row order.
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=100):
    """Train ``model`` and print validation loss, accuracy and weighted F1 after each epoch.

    Validation metrics are computed from the batches yielded by ``val_loader``
    (not from notebook-level tensors), inside ``torch.no_grad()`` so no autograd
    graph is built during evaluation.

    Args:
        model: torch module; its outputs are thresholded at 0.5 to obtain
            class predictions.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: optimizer stepping the model's parameters.
        train_loader: iterable of ``(inputs, targets)`` batches exposing a
            sized ``.dataset`` attribute.
        val_loader: iterable of ``(inputs, targets)`` batches exposing a
            sized ``.dataset`` attribute.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, val_losses)`` holding one per-sample average
        loss per epoch for each split.
    """
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so a smaller final batch does
            # not skew the epoch average (assumes the criterion returns a batch mean).
            train_loss += loss.item() * inputs.size(0)
        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        batch_preds = []
        batch_truth = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item() * inputs.size(0)
                # Reuse this forward pass for the metrics instead of running
                # the whole validation set through the model again.
                batch_preds.append(outputs > 0.5)
                batch_truth.append(targets)
        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)

        y_pred = torch.cat(batch_preds).cpu().numpy()
        y_true = torch.cat(batch_truth).cpu().numpy()
        val_accuracy = accuracy_score(y_true, y_pred)
        val_f1 = f1_score(y_true, y_pred, average="weighted")
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
        print(f'F1 score: {val_f1}')
    return train_losses, val_losses

# Train for 10 epochs (overriding train_model's default of 100) and keep the
# returned per-epoch training and validation loss lists.
train_losses, val_losses = train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10)

Epoch 1/10, Train Loss: 0.3799, Val Loss: 0.3307, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 2/10, Train Loss: 0.3245, Val Loss: 0.3314, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 3/10, Train Loss: 0.3258, Val Loss: 0.3251, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 4/10, Train Loss: 0.3202, Val Loss: 0.3257, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 5/10, Train Loss: 0.3157, Val Loss: 0.3273, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 6/10, Train Loss: 0.3178, Val Loss: 0.3278, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 7/10, Train Loss: 0.3154, Val Loss: 0.3315, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 8/10, Train Loss: 0.3162, Val Loss: 0.3304, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 9/10, Train Loss: 0.3232, Val Loss: 0.3289, Val Accuracy: 0.8990
F1 score: 0.8512414800389484
Epoch 10/10, Train Loss: 0.3164, Val Loss: 0.3309, Val Accuracy: 0.8990
F1 score: 0.8512414800389484