In [11]:
import pandas as pd
import numpy as np
import torch
import random
from typing import Union

import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from pathlib import Path
from Simple_Model.features import smiles_to_graph
from Simple_Model.models.gnn import GNNModel
from Simple_Model.models.message_passing import MPNN
from Simple_Model.models.hgnn import HGNNModel
from Simple_Model.models.ginn import GINModel
from Simple_Model.models.gat import GATModel

In [12]:
import os

# Root folder containing the NR-ER-train / NR-ER-test sub-directories.
data_path = "D:/PropPredictionModel/PropPredictionModel/tests/toxicity/data"
# The listing is not displayed (it is not the cell's last expression), but the
# call still raises early if the folder does not exist.
os.listdir(data_path)

# SMILES strings and labels live in two separate CSV files of the train split.
train_df_Smiles = pd.read_csv(f"{data_path}/NR-ER-train/names_smiles.csv")
train_df_Labels = pd.read_csv(f"{data_path}/NR-ER-train/names_labels.csv")

# Inner-join the two tables on the shared 'IDs' column, then preview the result.
train_df = train_df_Smiles.merge(train_df_Labels, on='IDs')
train_df.head()

Unnamed: 0,IDs,SMILES,Labels
0,NCGC00260230-01,F[P-](F)(F)(F)(F)F.CCCC[N+]1=CC=CC(C)=C1,0
1,NCGC00184995-01,[H][C@@]12CC[C@H](OP(O)(O)=O)[C@@]1(C)CC[C@]3(...,1
2,NCGC00260471-01,[O-][N+](=O)C1=CC=C2NN=CC2=C1,0
3,NCGC00256746-01,CCC1=NC=CN=C1C,0
4,NCGC00183024-01,CCCN(CCC)C(=O)C(CCC(=O)OCCCN1CCN(CCOC(=O)CC2=C...,1


In [17]:
train_df.to_csv(data_path+'/train_tox_data.csv')
test_df.to_csv(data_path+'/test_tox_data.csv')

In [13]:
test_df_Smiles = pd.read_csv(data_path+'/NR-ER-test/names_smiles.csv')#,header = True)
test_df_Labels = pd.read_csv(data_path+'/NR-ER-test/names_labels.csv')#,header = True)
test_df = pd.merge(test_df_Smiles, test_df_Labels, on='IDs')
test_df.head()

Unnamed: 0,IDs,SMILES,Labels
0,NCGC00261443-01,CNC1=C2N=CN([C@@H]3O[C@H](CO)C(O)[C@H]3O)C2=NC=N1,0
1,NCGC00261600-01,OC1=CC(\C=C\C2=CC=C(O)C(O)=C2)=CC(O)=C1,1
2,NCGC00260926-01,[Cl-].COC1=CC=C2C3=CC=C4C=C5OCOC5=CC4=C3[N+](C...,0
3,NCGC00261266-01,Br.CC1=C(CC(N)C(O)=O)C(O)=NO1,0
4,NCGC00261559-01,C1C(CC2=C1C=CC=C2)N3CCN(CC3)C4=CC=CC5=C4OCCO5,0


In [15]:
# Row counts of the train and test splits, shown as a (train, test) tuple.
train_df.shape[0], test_df.shape[0]

(7697, 265)

In [4]:
# Turn every (SMILES, label) row of the training frame into a labelled graph.
graphs = []
smiles_label_pairs = zip(train_df["SMILES"], train_df["Labels"])
for smiles, label in smiles_label_pairs:
    try:
        mol_graph = smiles_to_graph(smiles)
        # Store the label as a one-element float tensor on the graph
        # (the training cells feed it to BCEWithLogitsLoss, which needs floats).
        mol_graph.y = torch.tensor([label], dtype=torch.float)
        graphs.append(mol_graph)
    except ValueError as e:
        # Report the failing molecule and keep going; it is left out of `graphs`.
        print(f"Error processing SMILES: {smiles}. Error: {e}")


[00:17:07] Explicit valence for atom # 2 Cl, 2, is greater than permitted


Error processing SMILES: [NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-]. Error: Invalid SMILES string


[00:17:07] Explicit valence for atom # 3 Si, 8, is greater than permitted


Error processing SMILES: [Na+].[Na+].F[Si--](F)(F)(F)(F)F. Error: Invalid SMILES string


[00:17:08] Explicit valence for atom # 0 Cl, 2, is greater than permitted


Error processing SMILES: [Cl-][Pt]1([Cl-])NCCN1. Error: Invalid SMILES string




In [5]:
# Graph-aware DataLoader. `torch_geometric.data.DataLoader` is a deprecated
# alias; the class lives in `torch_geometric.loader`.
# Note: this rebinds the name `DataLoader` that the first cell imported from
# torch.utils.data, so from here on `DataLoader` means the PyG loader.
from torch_geometric.loader import DataLoader

# Batch the molecule graphs; shuffle so each epoch sees a different order.
loader = DataLoader(graphs, batch_size=64, shuffle=True)

# Pull a single batch to sanity-check the collated tensor shapes.
loader_iter = iter(loader)
batch = next(loader_iter)

# Print the batch summary
print(batch)

DataBatch(x=[1040, 1], edge_index=[2, 2110], edge_attr=[2110, 3], y=[64], batch=[1040], ptr=[65])




In [6]:
# Train GNNModel for 10 epochs on `loader`, reporting loss and accuracy.
# A single output unit + BCEWithLogitsLoss -> binary classification on logits.
model = GNNModel(in_node_features=1, in_edge_features=3,
                 hidden_dim=64, num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()

print(model)
for epoch in range(10):
    total_loss = 0            # sum of per-batch mean losses
    total_sample_loss = 0.0   # sum of per-graph losses (batch mean * batch size)
    correct = 0
    for batch in loader:
        optimizer.zero_grad()
        logits = model(batch)  # Forward pass
        # Column vector of float targets, matching the loss's expected dtype.
        target = batch.y.view(-1, 1).float()

        # Compute loss and update weights
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()

        # Accumulate loss. The criterion returns the mean over the batch, so
        # it is re-weighted by the batch size to get a true per-sample total
        # (batches are not guaranteed to have the same size).
        total_loss += loss.item()
        total_sample_loss += loss.item() * target.size(0)

        # Accuracy: sigmoid -> probability, round -> hard 0/1 prediction
        preds = torch.round(torch.sigmoid(logits))
        correct += (preds == target).sum().item()

    # Calculate metrics
    avg_loss_per_batch = total_loss / len(loader)  # Average of the batch means
    # Previously this divided the sum of batch means by the number of samples,
    # which is not a per-sample average.
    avg_loss_per_sample = total_sample_loss / len(loader.dataset)
    accuracy = correct / len(loader.dataset)  # Fraction of correct predictions

    # NOTE(review): the recorded run shows accuracy flat at 0.8782 in every
    # epoch, which looks like a constant single-class prediction - confirm
    # against the label distribution before trusting this metric.
    print(f"Epoch {epoch + 1}, Avg Loss per Batch: {avg_loss_per_batch:.4f}, Accuracy: {accuracy:.4f}")


GNNModel(
  (node_embedding): Linear(in_features=1, out_features=64, bias=True)
  (conv1): GINEConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ))
  (conv2): GINEConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ))
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)
Epoch 1, Avg Loss per Batch: 0.3864, Accuracy: 0.8782
Epoch 2, Avg Loss per Batch: 0.3735, Accuracy: 0.8782
Epoch 3, Avg Loss per Batch: 0.3655, Accuracy: 0.8782
Epoch 4, Avg Loss per Batch: 0.3624, Accuracy: 0.8782
Epoch 5, Avg Loss per Batch: 0.3634, Accuracy: 0.8782
Epoch 6, Avg Loss per Batch: 0.3573, Accuracy: 0.8782
Epoch 7, Avg Loss per Batch: 0.3559, Accuracy: 0.8782
Epoch 8, Avg Loss per Batch: 0.3569, Accuracy: 0.8782
Epoch 9, Avg Lo

In [7]:
# Train the MPNN model for 10 epochs on `loader`, reporting loss and accuracy.
model = MPNN(in_node_features=1, in_edge_features=3, hidden_dim=64, num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()

for epoch in range(10):
    total_loss = 0            # sum of per-batch mean losses
    total_sample_loss = 0.0   # sum of per-graph losses (batch mean * batch size)
    correct = 0  # Reset correct predictions for each epoch

    for batch in loader:
        optimizer.zero_grad()
        logits = model(batch)  # Forward pass
        target = batch.y.view(-1, 1).float()  # Reshape target for BCEWithLogitsLoss
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()

        # Accumulate loss. The criterion returns the mean over the batch, so
        # it is re-weighted by the batch size to get a true per-sample total
        # (batches are not guaranteed to have the same size).
        total_loss += loss.item()
        total_sample_loss += loss.item() * target.size(0)

        # Accuracy
        preds = torch.round(torch.sigmoid(logits))  # Convert logits to probabilities, then round
        correct += (preds == target).sum().item()  # Accumulate correct predictions

    # Calculate metrics
    avg_loss_per_batch = total_loss / len(loader)  # Average of the batch means
    # Previously this divided the sum of batch means by the number of samples,
    # which is not a per-sample average.
    avg_loss_per_sample = total_sample_loss / len(loader.dataset)
    accuracy = correct / len(loader.dataset)  # Accuracy (normalize by dataset size)

    # Print metrics
    print(f"Epoch {epoch + 1}, Avg Loss per Batch: {avg_loss_per_batch:.4f}, Accuracy: {accuracy:.4f}")


Epoch 1, Avg Loss per Batch: 0.3940, Accuracy: 0.8707
Epoch 2, Avg Loss per Batch: 0.3786, Accuracy: 0.8770
Epoch 3, Avg Loss per Batch: 0.3755, Accuracy: 0.8770
Epoch 4, Avg Loss per Batch: 0.3703, Accuracy: 0.8778
Epoch 5, Avg Loss per Batch: 0.3676, Accuracy: 0.8782
Epoch 6, Avg Loss per Batch: 0.3709, Accuracy: 0.8782
Epoch 7, Avg Loss per Batch: 0.3692, Accuracy: 0.8782
Epoch 8, Avg Loss per Batch: 0.3685, Accuracy: 0.8782
Epoch 9, Avg Loss per Batch: 0.3656, Accuracy: 0.8782
Epoch 10, Avg Loss per Batch: 0.3639, Accuracy: 0.8782


In [8]:
# HGNNModel: same 10-epoch binary-classification loop, node features only.
model = HGNNModel(in_node_features=1, hidden_dim=64, num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(10):
    # Running totals, reset at the start of every epoch.
    total_loss, correct = 0, 0

    for batch in loader:
        # Float column vector of targets, as required by the loss.
        target = batch.y.view(-1, 1).float()

        # One optimisation step on this batch.
        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()

        # Hard 0/1 predictions from the logits, then update the totals.
        preds = torch.sigmoid(logits).round()
        total_loss += loss.item()
        correct += (preds == target).sum().item()

    # Mean of the batch losses and fraction of correctly classified graphs.
    avg_loss = total_loss / len(loader)
    accuracy = correct / len(loader.dataset)
    print(f"Epoch {epoch + 1}, Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 1, Avg Loss: 0.3973, Accuracy: 0.8782
Epoch 2, Avg Loss: 0.3697, Accuracy: 0.8782
Epoch 3, Avg Loss: 0.3698, Accuracy: 0.8782
Epoch 4, Avg Loss: 0.3707, Accuracy: 0.8782
Epoch 5, Avg Loss: 0.3715, Accuracy: 0.8782
Epoch 6, Avg Loss: 0.3692, Accuracy: 0.8782
Epoch 7, Avg Loss: 0.3693, Accuracy: 0.8782
Epoch 8, Avg Loss: 0.3695, Accuracy: 0.8782
Epoch 9, Avg Loss: 0.3697, Accuracy: 0.8782
Epoch 10, Avg Loss: 0.3676, Accuracy: 0.8782


In [9]:
# GINModel: same 10-epoch binary-classification loop, node features only.
model = GINModel(in_node_features=1, hidden_dim=64, num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(10):
    # Running totals, reset at the start of every epoch.
    total_loss, correct = 0, 0

    for batch in loader:
        # Float column vector of targets, as required by the loss.
        target = batch.y.view(-1, 1).float()

        # One optimisation step on this batch.
        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()

        # Hard 0/1 predictions from the logits, then update the totals.
        preds = torch.sigmoid(logits).round()
        total_loss += loss.item()
        correct += (preds == target).sum().item()

    # Mean of the batch losses and fraction of correctly classified graphs.
    avg_loss = total_loss / len(loader)
    accuracy = correct / len(loader.dataset)
    print(f"Epoch {epoch + 1}, Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 1, Avg Loss: 0.3828, Accuracy: 0.8782
Epoch 2, Avg Loss: 0.3863, Accuracy: 0.8782
Epoch 3, Avg Loss: 0.3848, Accuracy: 0.8782
Epoch 4, Avg Loss: 0.3836, Accuracy: 0.8782
Epoch 5, Avg Loss: 0.3821, Accuracy: 0.8782
Epoch 6, Avg Loss: 0.3776, Accuracy: 0.8782
Epoch 7, Avg Loss: 0.3786, Accuracy: 0.8782
Epoch 8, Avg Loss: 0.3797, Accuracy: 0.8782
Epoch 9, Avg Loss: 0.3768, Accuracy: 0.8782
Epoch 10, Avg Loss: 0.3775, Accuracy: 0.8782


In [10]:
# GATModel: same 10-epoch binary-classification loop, node features only.
model = GATModel(in_node_features=1, hidden_dim=64, num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(10):
    # Running totals, reset at the start of every epoch.
    total_loss, correct = 0, 0

    for batch in loader:
        # Float column vector of targets, as required by the loss.
        target = batch.y.view(-1, 1).float()

        # One optimisation step on this batch.
        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()

        # Hard 0/1 predictions from the logits, then update the totals.
        preds = torch.sigmoid(logits).round()
        total_loss += loss.item()
        correct += (preds == target).sum().item()

    # Mean of the batch losses and fraction of correctly classified graphs.
    avg_loss = total_loss / len(loader)
    accuracy = correct / len(loader.dataset)
    print(f"Epoch {epoch + 1}, Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 1, Avg Loss: 0.3747, Accuracy: 0.8782
Epoch 2, Avg Loss: 0.3664, Accuracy: 0.8782
Epoch 3, Avg Loss: 0.3674, Accuracy: 0.8782
Epoch 4, Avg Loss: 0.3688, Accuracy: 0.8782
Epoch 5, Avg Loss: 0.3692, Accuracy: 0.8782
Epoch 6, Avg Loss: 0.3676, Accuracy: 0.8782
Epoch 7, Avg Loss: 0.3678, Accuracy: 0.8782
Epoch 8, Avg Loss: 0.3680, Accuracy: 0.8782
Epoch 9, Avg Loss: 0.3697, Accuracy: 0.8782
Epoch 10, Avg Loss: 0.3666, Accuracy: 0.8782
