In [36]:
import numpy as np
import torch
import torch.nn as nn

from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold
from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingLR
from torch.utils.data import DataLoader, Subset

from src.data.dataset import CNSDataset
from src.descriptors import DescriptorGenerator, AVAILABLE_DESCRIPTORS

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.random.manual_seed(0)

<torch._C.Generator at 0x25077b67cb0>

In [37]:
def train_mlp(model, loss_fn, optimizer, scheduler, train_loader, total_epoch=300):
    model.train()
    for nepoch in range(total_epoch):
        for batch in train_loader:
            y_pred = model(batch[0])
            loss = loss_fn(y_pred, batch[1].to(device).reshape(-1, 1).float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
    return model

In [3]:
TRAIN_DATASET = "dataset\mol_train.csv"
TEST_DATASET = "dataset\mol_test.csv"

whole_dataset = CNSDataset(TRAIN_DATASET, transform=DescriptorGenerator(AVAILABLE_DESCRIPTORS))
mean = torch.mean(whole_dataset._processed_data, axis=0)
std = torch.std(whole_dataset._processed_data, axis=0)
max = torch.max(whole_dataset._processed_data, axis=0).values
min = torch.min(whole_dataset._processed_data, axis=0).values
whole_dataset.normalize(max, min)

nfeatures = whole_dataset._processed_data.shape[1]

print(f"Now the shape is {whole_dataset._processed_data.shape}")

Output()

NaN in descriptors, remove it
Now the shape is torch.Size([700, 2912])


In [54]:
total_epoch = 500
train_fs = []
train_acc = []
val_fs = []
val_acc = []
for fold, (train_idx, val_idx) in enumerate(
    KFold(n_splits=5, shuffle=True).split(whole_dataset)
):
    fold_train = Subset(whole_dataset, train_idx)
    fold_train_loader = DataLoader(fold_train, batch_size=32)
    fold_val = whole_dataset[val_idx]
    model = nn.Sequential(
        # nn.Linear(nfeatures, 3072),
        # nn.LayerNorm(3072),
        # nn.ReLU(),
        # nn.Dropout(0.8),
        # nn.Linear(3072, 2048),
        # nn.LayerNorm(2048),
        # nn.ReLU(),
        # nn.Dropout(0.6),
        nn.Linear(nfeatures, 1024),
        nn.LayerNorm(1024),
        nn.ReLU(),
        nn.Dropout(0.4),
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Dropout(0.4),
        nn.Linear(512, 128),
        nn.LayerNorm(128),
        nn.ReLU(),
        nn.Dropout(0.4),
        nn.Linear(128, 1),
        nn.Sigmoid(),
    )
    model.to(device)
    loss_fn = nn.BCELoss()
    loss_fn.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)
    # scheduler = CosineAnnealingLR(optimizer, 50, eta_min=1e-20)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=total_epoch,
        steps_per_epoch=int(700 * 0.8 / 32) + 1,
        final_div_factor=1e6,
    )
    model = train_mlp(
        model, loss_fn, optimizer, scheduler, fold_train_loader, total_epoch=total_epoch
    )

    model.eval()
    y_pred = model(whole_dataset[train_idx][0])
    y_pred[y_pred > 0.5] = 1.0
    y_pred[y_pred <= 0.5] = 0.0
    y_pred = y_pred.detach().cpu().numpy().reshape(-1)
    train_fs.append(fbeta_score(y_pred, whole_dataset[train_idx][1], beta=2))
    train_acc.append(
        np.average(y_pred == whole_dataset[train_idx][1])
    )
    y_pred = model(whole_dataset[val_idx][0])
    y_pred[y_pred > 0.5] = 1.0
    y_pred[y_pred <= 0.5] = 0.0
    y_pred = y_pred.detach().cpu().numpy().reshape(-1)
    val_fs.append(fbeta_score(y_pred, whole_dataset[val_idx][1], beta=2))
    val_acc.append(
        np.average(y_pred == whole_dataset[val_idx][1])
    )
print(f"Train F2 score: {np.mean(train_fs):.3f}, Train accuracy: {np.mean(train_acc):.3f}\n"
      f"Validation F2 score: {np.mean(val_fs):.3f}, Validation accuracy: {np.mean(val_acc):.3f}")

Train F2 score: 0.997, Train accuracy: 0.998
Validation F2 score: 0.799, Validation accuracy: 0.854
