In [None]:
import pandas as pd

import torch
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import mean_squared_error
from math import sqrt

from common import *

#### Load data

In [None]:
# Raw splits: these frames provide the regression target column.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../../data/ld50/train.csv", "../../data/ld50/test.csv")
)

# Targets are read from the "ld50" column of each split.
y_train, y_test = train_data["ld50"], test_data["ld50"]

# Model inputs come from separate embedding CSVs in the working directory
# (presumably row-aligned with the splits above -- verify against whatever
# produced these files).
x_train, x_test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_embeddings.csv", "test_embeddings.csv")
)

#### Define the regression head (MLP over the precomputed embeddings)

In [None]:
def _hidden_block(in_features, out_features):
    """Return the layers of one hidden stage: Linear -> BatchNorm1d -> LeakyReLU."""
    return [
        torch.nn.Linear(in_features, out_features),
        torch.nn.BatchNorm1d(out_features),
        torch.nn.LeakyReLU(),
    ]


# Regression head: 768 input features, two 1024-wide hidden stages, and a
# single-value output. Layers are passed positionally, so the state_dict keys
# stay numeric ("0.weight", "1.weight", ...).
model = torch.nn.Sequential(
    *_hidden_block(768, 1024),
    *_hidden_block(1024, 1024),
    torch.nn.Linear(1024, 1),
)
model

#### Head training

In [None]:
class SmilesDataset(Dataset):
    """Map-style dataset pairing one feature row with one regression target.

    Args:
        x: Feature table; row ``i`` is the input vector of sample ``i``.
        y: Target values; element ``i`` is the label of sample ``i``.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of samples.

    Each item is a ``(features, target)`` pair of ``float32`` tensors, where
    ``features`` is 1-D (one value per column of ``x``) and ``target`` is 0-D.
    """

    def __init__(self, x: pd.DataFrame, y: pd.Series):
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same number of rows, got {len(x)} and {len(y)}"
            )
        # The original pandas objects stay available under their existing names.
        self.X = x
        self.Y = y
        # Convert once up front: a pandas .iloc lookup plus a tensor construction
        # on every __getitem__ call is needlessly slow when repeated for every
        # sample of every epoch.
        self._features = torch.tensor(x.to_numpy(), dtype=torch.float32)
        self._targets = torch.tensor(y.to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.Y)

    def __getitem__(self, index: int):
        """Return the ``(features, target)`` tensors at positional ``index``."""
        return self._features[index], self._targets[index]

# Wrap the feature/target pairs and batch them for the training cell.
train_dataset = SmilesDataset(x_train, y_train)
test_dataset = SmilesDataset(x_test, y_test)

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation keeps a fixed order: the test loss is averaged over batches, and the
# last batch may be smaller than the rest, so shuffling would make the reported
# value depend on which samples happen to land in that batch.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Optimisation setup: Adam over all head parameters with an MSE regression loss.
optimizer = optim.Adam(model.parameters(), lr=12e-5)
criterion = torch.nn.MSELoss()
num_epochs = 200

# With no arguments SummaryWriter picks its default log directory (see the
# torch.utils.tensorboard documentation).
writer = SummaryWriter()

for epoch_index in range(num_epochs):
    # ---- Training pass ----
    # Re-enter train mode at the start of every epoch, because the evaluation
    # pass below switches the model to eval mode.
    model.train()
    running_loss = 0.

    for data in train_dataloader:
        # Every batch is an (inputs, labels) pair.
        inputs, labels = data

        # Gradients accumulate by default, so reset them for every batch.
        optimizer.zero_grad()

        outputs = model(inputs)

        # unsqueeze(-1) gives the labels a trailing dimension so they line up with
        # the model's single-column output instead of being broadcast against it.
        loss = criterion(outputs, labels.float().unsqueeze(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    train_loss = running_loss / len(train_dataloader)  # mean loss per batch
    # Pass the epoch as global_step so the scalar is plotted as a curve over epochs.
    writer.add_scalar('Train/loss', train_loss, epoch_index)

    # ---- Evaluation pass ----
    # eval() makes BatchNorm use its running statistics; no_grad() skips autograd.
    model.eval()
    test_loss = 0.
    with torch.no_grad():
        for data in test_dataloader:
            inputs, labels = data
            outputs = model(inputs)
            loss = criterion(outputs, labels.float().unsqueeze(-1))
            test_loss += loss.item()
    test_loss /= len(test_dataloader)  # mean loss per batch
    # Log the test loss itself (previously the train loss was written under this tag).
    writer.add_scalar('Test/loss', test_loss, epoch_index)

# Flush pending events to disk. The model is left in eval mode after the last
# epoch, which is the state inference code needs.
writer.close()

In [None]:
# Training time: 24min18s
from datetime import datetime

# isoformat() separates hours, minutes and seconds with ':', which is not a legal
# character in Windows file names; swap it for '-' so the checkpoint name is
# portable. The "nn_model_" prefix is unchanged.
timestamp = datetime.now().isoformat().replace(":", "-")
torch.save(model.state_dict(), f"nn_model_{timestamp}")

In [None]:
# Predict the whole test set in a single forward pass.
# eval() is required here: in train mode the BatchNorm layers would normalise with
# the statistics of this batch and update their running averages as a side effect.
model.eval()
with torch.no_grad():
    inputs = torch.tensor(x_test.values).float()
    outputs = model(inputs).squeeze(-1)

import matplotlib.pyplot as plt

# Plain NumPy copy of the predictions for plotting; `outputs` itself stays a tensor.
predictions = outputs.detach().cpu().numpy()

# The reference line must span both the true and the predicted values.
min_val = min(y_test.min(), predictions.min())
max_val = max(y_test.max(), predictions.max())

fig, ax = plt.subplots()
# Diagonal y = x: points on this line are predicted exactly.
ax.plot([min_val, max_val], [min_val, max_val], label="ideal (y = x)")
ax.scatter(y_test, predictions, color='r', label="test samples")
ax.set_xlabel("True LD50")
ax.set_ylabel("Predicted LD50")
ax.set_title("Predicted vs. true LD50 on the test set")
ax.legend();

In [None]:
from sklearn.metrics import r2_score

# Report R^2 and RMSE (square root of the mean squared error) of `outputs`
# against `y_test`. The `=` specifier makes the f-string echo each expression's
# source text next to its value, so the displayed string labels both metrics
# itself; editing either expression also changes the displayed label.
f"{r2_score(y_test, outputs)=} {sqrt(mean_squared_error(y_test, outputs))=}"