In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from ase import Atoms
from ase.io import read
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import to_networkx
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split
from ase.build import molecule
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import random
from torch_geometric.nn import global_add_pool, GATConv, CGConv
from torch_geometric.nn.models.schnet import GaussianSmearing
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt



  _torch_pytree._register_pytree_node(


In [3]:
bio_df = pd.read_csv("data/bio_data.csv")

In [4]:
bio_df.head()


Unnamed: 0,index,formula,data_object,node_dim,bio
0,1009,C14H23N,"Data(x=[15, 54], edge_index=[2, 15], edge_attr...",15,1
1,964,C3H6O2,"Data(x=[5, 54], edge_index=[2, 4], edge_attr=[...",5,1
2,962,C8H19NO,"Data(x=[10, 54], edge_index=[2, 9], edge_attr=...",10,1
3,955,C13H26O4,"Data(x=[17, 54], edge_index=[2, 16], edge_attr...",17,1
4,2597,C20H26O4,"Data(x=[24, 54], edge_index=[2, 26], edge_attr...",24,1


In [5]:
bio_df['bio'].value_counts()

0    1514
1     744
Name: bio, dtype: int64

In [8]:
bio_samples = bio_df[bio_df['bio'] == 1]
non_bio_samples = bio_df[bio_df['bio'] == 0]

target_size = 1514

balanced_toxic_samples = bio_samples.sample(n=target_size, replace=True, random_state=42)
balanced_non_toxic_samples = non_bio_samples.sample(n=target_size, replace=False, random_state=42)

balanced_dataset = pd.concat([balanced_toxic_samples, balanced_non_toxic_samples])

balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

print(len(balanced_non_toxic_samples), len(balanced_toxic_samples))
print(len(balanced_dataset))

1514 1514
3028


In [None]:
train_data, test_data = train_test_split(balanced_dataset, test_size=0.2, random_state=49)

In [None]:
batch_size = 8

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:

cutoff = 8.0
edge_dim = 50
node_dim = 54

class GraphEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads):
        super(GraphEncoder, self).__init__()
        self.conv1 = GATConv(input_dim+25, hidden_dim, heads=num_heads)
        self.conv2 = GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads)
        self.conv3 = CGConv(input_dim+25, edge_dim)
        self.dropout = nn.Dropout(0.4)
        self.batch = nn.BatchNorm1d(hidden_dim*num_heads)

    def forward(self, x, edge_index, edge_attr, positions):
        #x = torch.cat([x, positions], dim=1)
        x = self.conv1(x, edge_index, edge_attr)
        x = nn.GELU()(x)
        x = self.dropout(x)
        x = self.batch(x)
        #x = self.conv2(x, edge_index, edge_attr)
        #x = nn.ReLU()(x)
        #x = self.dropout(x)
        #x = self.batch(x)
        #x = self.conv3(x, edge_index, edge_attr)
        #x = nn.ReLU()(x)
        #x = self.dropout(x)
        #x = self.batch(x)
        return x

class BioClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads):
        super(BioClassifier, self).__init__()
        self.pre_graph = [
            nn.Linear(input_dim+25, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, input_dim+25),
            nn.ReLU(),
        ]
        self.encoder = GraphEncoder(input_dim, hidden_dim, num_heads)
        self.fc = nn.Linear(hidden_dim * num_heads, 12)

    def forward(self, data):
        x, edge_index, edge_attr, positions = data.x, data.edge_index, data.edge_attr, data.positions
        x = x.float()

        #gaussian_smearing = GaussianSmearing(0.0, cutoff, edge_dim)

        #row, col = edge_index
        #edge_distances = torch.norm(positions[row] - positions[col], dim=1)
        #edge_attr = torch.cat([edge_attr, gaussian_smearing(edge_distances)], dim=1)

        #for layer in self.pre_graph:
        #    x = layer(x)
        x = self.encoder(x, edge_index, edge_attr, positions)
        x = global_add_pool(x, data.batch)
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x.squeeze(dim=1)

In [None]:
# Initialize model
hidden_dim = 2
model = BioClassifier(node_dim, hidden_dim, 2)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [None]:
# Train model
num_epochs = 200
losses = []
for epoch in range(num_epochs):
    model.train()
    for graph in train_loader:
        optimizer.zero_grad()
        output = model(graph)

        loss = criterion(output.to(torch.float), (graph.tox).to(torch.float))
        loss.requires_grad = True
        loss.backward()
        optimizer.step()
        #scheduler.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    if epoch % 5 == 0:
        losses.append(loss.item())


In [None]:
# Evaluate model
model.eval()

y_true = []
y_pred = []
with torch.no_grad():
    correct = 0
    correct_0s = 0
    correct_1s = 0
    total_1s = 0
    total_0s = 0
    total = 0
    tote_0 = 0
    tote_1 = 0
    for graph in test_data:
        output = model(graph)
        predicted = torch.round(output)
        total += 1
        y_true.append(int(graph.tox))
        y_pred.append(predicted)
        if predicted == 0:
            tote_0 += 1
        if predicted == 1:
            tote_1 += 1
        if predicted == int(graph.tox):
            correct += 1
        if int(graph.tox) == 1:
            total_1s += 1
            if predicted == 1:
                correct_1s += 1
        if int(graph.tox) == 0:
            total_0s += 1
            if predicted == 0:
                correct_0s += 1

    accuracy = correct / total
    print(f'Accuracy: {accuracy}')
    print(correct_0s, " correct 0 predictions out of ", total_0s)
    print(correct_1s, " correct 1 predictions out of ", total_1s)
    print("Total 0s Predicted: ", tote_0)
    print("Total 1s Predicted: ", tote_1)

In [None]:
from sklearn.metrics import roc_auc_score
auc_roc = roc_auc_score(y_true, y_pred)
print("ROC AUC Score:", auc_roc)

In [None]:
loss_list=[]
for i in losses:
    loss_list.append(i)

epochs = range(0, 40)

plt.plot(epochs, loss_list, label='Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.legend()
plt.show()