In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import torch
import zipfile
import os

# Unpack the serialized graph list next to the notebook.
with zipfile.ZipFile("processed_graphs.pt.zip", "r") as zip_ref:
    zip_ref.extractall(".")

# The file stores a Python list of graph objects, not a plain tensor/state_dict,
# so it has to be unpickled: request that explicitly instead of relying on the
# changing default of `weights_only`.
# SECURITY: unpickling can execute arbitrary code - only load trusted files.
data_list = torch.load("processed_graphs.pt", weights_only=False)
print("Loaded graphs:", len(data_list))
print("Example Graph Object:\n", data_list[0])

  data_list = torch.load("processed_graphs.pt")


Loaded graphs: 679269
Example Graph Object:
 Data(x=[25, 6], edge_index=[2, 54], edge_attr=[54, 4], y=[1], global_features=[1028])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_add_pool

class AttentionFusion(nn.Module):
    """Fuse a graph embedding with a fingerprint vector via a residual attention step.

    The graph embedding is projected to a query, the fingerprint to a key and a
    value (all ``fusion_dim`` wide). The attended value is added to the
    unprojected graph embedding, so ``fusion_dim`` has to be broadcast-compatible
    with ``graph_dim`` (normally equal).

    Note: for 2-D inputs of shape (batch, dim), query and key each carry a single
    token, so the softmax is taken over exactly one score and always equals 1.
    The result is then ``graph_repr + value_fp(fingerprint)``.
    """

    def __init__(self, graph_dim, fingerprint_dim, fusion_dim):
        super().__init__()
        # Attribute names and creation order are kept so checkpoints stay loadable.
        self.query_graph = nn.Linear(in_features=graph_dim, out_features=fusion_dim)
        self.key_fp = nn.Linear(in_features=fingerprint_dim, out_features=fusion_dim)
        self.value_fp = nn.Linear(in_features=fingerprint_dim, out_features=fusion_dim)

    def forward(self, graph_repr, fingerprint):
        """Return ``graph_repr`` plus the attention-weighted fingerprint value."""
        # Insert a length-1 "sequence" axis at position 1 for the batched matmuls.
        query = torch.unsqueeze(self.query_graph(graph_repr), 1)
        key = torch.unsqueeze(self.key_fp(fingerprint), 1)
        value = torch.unsqueeze(self.value_fp(fingerprint), 1)

        # Scaled dot-product scores, normalised over the key axis.
        scale = key.size(-1) ** 0.5
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale
        weights = scores.softmax(dim=-1)

        attended = torch.matmul(weights, value).squeeze(1)
        return graph_repr + attended

class MolGraphormer(nn.Module):
    """Graph-attention regressor that fuses node embeddings with per-graph global features.

    Pipeline (see ``forward``): two edge-aware ``GATConv`` layers, each followed
    by ``LayerNorm`` and ReLU; sum pooling over the nodes of each graph;
    ``AttentionFusion`` with the per-graph ``global_features``; then two heads
    that both read the fused vector.

    Args:
        node_dim: Width of the node feature matrix ``data.x``.
        edge_dim: Width of the edge feature matrix ``data.edge_attr``.
        hidden_dim: Width of the node embeddings and of the pooled graph vector.
        fingerprint_dim: Number of global features per graph.
        fusion_dim: Output width of the fusion projections. The fusion adds its
            result to the pooled vector and both heads take ``hidden_dim``
            inputs, so this is expected to equal ``hidden_dim``.
        output_dim: Width of the prediction head output.
        heads: Attention heads per ``GATConv`` layer (``concat=False``, so the
            layer output stays ``hidden_dim`` wide).
        dropout: Dropout probability inside the prediction head.
    """

    def __init__(self, node_dim=6, edge_dim=4, hidden_dim=128, fingerprint_dim=1028,
                 fusion_dim=128, output_dim=1, heads=4, dropout=0.2):
        super().__init__()
        self.conv1 = GATConv(node_dim, hidden_dim, heads=heads, concat=False, edge_dim=edge_dim)
        self.conv2 = GATConv(hidden_dim, hidden_dim, heads=heads, concat=False, edge_dim=edge_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.pool = global_add_pool
        self.att_fusion = AttentionFusion(hidden_dim, fingerprint_dim, fusion_dim)

        # Auxiliary head: one sigmoid score in (0, 1) per graph.
        self.outlier_detector = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # Main prediction head.
        self.output_head = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, output_dim)
        )

    def forward(self, data):
        """Run the model on a batched graph object.

        Args:
            data: Batch exposing ``x``, ``edge_index``, ``edge_attr``, ``batch``
                and ``global_features``.

        Returns:
            Tuple ``(output, outlier_score)`` with one row per graph:
            ``output`` has ``output_dim`` columns, ``outlier_score`` has one.
        """
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch

        x = self.norm1(self.conv1(x, edge_index, edge_attr))
        x = F.relu(x)
        x = self.norm2(self.conv2(x, edge_index, edge_attr))
        x = F.relu(x)

        # Sum node embeddings into one vector per graph.
        graph_repr = self.pool(x, batch)

        batch_size = graph_repr.size(0)
        if data.global_features.dim() == 2:
            fingerprints = data.global_features
        else:
            # Anything that is not already 2-D (e.g. per-graph 1-D vectors
            # concatenated by batching) is laid out as one row per graph.
            # reshape also copes with non-contiguous tensors, unlike view.
            fingerprints = data.global_features.reshape(batch_size, -1)

        fused_repr = self.att_fusion(graph_repr, fingerprints)

        outlier_score = self.outlier_detector(fused_repr)
        output = self.output_head(fused_repr)

        return output, outlier_score



In [None]:
from torch_geometric.data import Batch

# Smoke test: collate the first 32 graphs into a single mini-batch and push it
# through a freshly initialised model to check the shapes of both outputs.
sample_batch = Batch.from_data_list(data_list[:32])
model = MolGraphormer()
output, outlier_score = model(sample_batch)
print("Output:", output.shape)
print("Outlier Score:", outlier_score.shape)


Output: torch.Size([32, 1])
Outlier Score: torch.Size([32, 1])


In [None]:
import torch.nn as nn
import torch.optim as optim

# Mean-squared-error loss for the regression target.
task_loss_fn = nn.MSELoss()
# Binary cross-entropy loss object; not referenced by any code visible in this notebook.
outlier_loss_fn = nn.BCELoss()

def combined_loss_fn(task_output, target, outlier_score, outlier_weight=0.1):
    """Task loss plus a penalty on the mean outlier score.

    Args:
        task_output: Model predictions.
        target: Ground-truth values, shaped like ``task_output``.
        outlier_score: Outlier scores produced by the model for the same batch.
        outlier_weight: Multiplier of the mean-outlier-score penalty. Defaults
            to 0.1, the value that was previously hard-coded.

    Returns:
        Scalar tensor ``task_loss_fn(task_output, target) + outlier_weight * mean(outlier_score)``.
    """
    # task_loss_fn is the module-level loss object defined alongside this function.
    task_loss = task_loss_fn(task_output, target)
    # Pushes the average outlier score down; there is no supervised outlier label here.
    outlier_reg = torch.mean(outlier_score)
    return task_loss + outlier_weight * outlier_reg

In [None]:
# Fresh model with default hyper-parameters; this rebinds the name `model`.
model = MolGraphormer()
# AdamW over all model parameters (lr=1e-4, weight decay=1e-5).
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
# StepLR: scales the learning rate by gamma=0.7 every step_size=10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.7)

In [None]:
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20 % of the graphs, then halve that hold-out into
# validation and test. random_state is fixed so the partition is repeatable.
train_data, temp_data = train_test_split(data_list, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Only the training loader shuffles; validation/test keep the default order.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np

def train_epoch(model, loader, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Each batch is moved to ``device``, scored with ``combined_loss_fn`` and used
    for one optimizer step.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for graphs in loader:
        graphs = graphs.to(device)
        optimizer.zero_grad()
        prediction, score = model(graphs)
        # Targets as a float column vector.
        target = graphs.y.view(-1, 1).float()
        batch_loss = combined_loss_fn(prediction, target, score)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

def eval_epoch(model, loader, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Returns:
        Tuple ``(mean_loss, mae, rmse, r2)`` where ``mean_loss`` is the sum of
        per-batch ``combined_loss_fn`` values divided by ``len(loader)`` and the
        metrics are computed over all flattened targets and predictions.
    """
    model.eval()
    targets, predictions = [], []
    batch_losses = []
    with torch.no_grad():
        for graphs in loader:
            graphs = graphs.to(device)
            prediction, score = model(graphs)
            batch_loss = combined_loss_fn(prediction, graphs.y.view(-1, 1).float(), score)
            batch_losses.append(batch_loss.item())
            # Collect plain Python floats, one per graph.
            targets += graphs.y.view(-1).tolist()
            predictions += prediction.view(-1).tolist()

    mae = mean_absolute_error(targets, predictions)
    rmse = np.sqrt(mean_squared_error(targets, predictions))
    r2 = r2_score(targets, predictions)
    return sum(batch_losses) / len(loader), mae, rmse, r2


In [None]:
import time
import torch

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 17           # maximum number of epochs
patience = 3          # consecutive non-improving epochs tolerated before stopping
checkpoint_every = 3  # write a periodic checkpoint every N epochs

best_val_loss = float('inf')
trigger_times = 0  # consecutive epochs without validation-loss improvement

for epoch in range(1, epochs + 1):
    start_time = time.time()

    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss, val_mae, val_rmse, val_r2 = eval_epoch(model, val_loader, device)
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | MAE: {val_mae:.4f} | RMSE: {val_rmse:.4f} | R2: {val_r2:.4f} | Time: {time.time() - start_time:.2f}s")
    # Periodic checkpoint, written regardless of validation performance.
    if epoch % checkpoint_every == 0:
        torch.save(model.state_dict(), f"molgraphormer_epoch_{epoch}.pt")

    # Keep the weights with the lowest validation loss seen so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        torch.save(model.state_dict(), "best_molgraphormer.pt")
        print("Saved new best model")
    else:
        trigger_times += 1
        print(f"No improvement for {trigger_times} epochs")

    # Early stopping once `patience` consecutive epochs brought no improvement.
    if trigger_times >= patience:
        print(f"\n Early stopping at epoch {epoch} (patience: {patience})")
        break


In [None]:
from torch_geometric.loader import DataLoader
import torch

# Local import so this cell also runs in a fresh session that skipped the
# training cells.
from sklearn.model_selection import train_test_split

torch.manual_seed(42)

num_samples = len(data_list)
# Nominal split ratios, kept for reference; the split itself is defined below.
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

# Rebuild exactly the shuffled partition that was used for training (same
# test_size values and random_state). Slicing data_list sequentially instead
# would put graphs the model was trained on into the validation/test sets and
# inflate the reported metrics.
train_graphs, holdout_graphs = train_test_split(data_list, test_size=0.2, random_state=42)
val_graphs, test_graphs = train_test_split(holdout_graphs, test_size=0.5, random_state=42)

train_size, val_size, test_size = len(train_graphs), len(val_graphs), len(test_graphs)
assert train_size + val_size + test_size == num_samples, "split does not cover the dataset"

train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=32, shuffle=False)
test_loader = DataLoader(test_graphs, batch_size=32, shuffle=False)

print(f"Train samples: {len(train_graphs)}")
print(f"Validation samples: {len(val_graphs)}")
print(f"Test samples: {len(test_graphs)}")


Train samples: 543415
Validation samples: 67926
Test samples: 67928


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture hyper-parameters; they must match the ones the checkpoint was
# trained with, otherwise load_state_dict fails on mismatching shapes.
node_dim = 6
edge_dim = 4
hidden_dim = 128
fingerprint_dim = 1028
fusion_dim = 128
output_dim = 1

model = MolGraphormer(node_dim=node_dim, edge_dim=edge_dim, hidden_dim=hidden_dim,
                      fingerprint_dim=fingerprint_dim, fusion_dim=fusion_dim, output_dim=output_dim)

# The checkpoint is a plain state_dict (tensors only), so restrict unpickling
# with weights_only=True instead of relying on the changing default.
model.load_state_dict(torch.load("best_molgraphormer.pt", map_location=device, weights_only=True))
model.to(device)

# Inference mode (disables dropout); as the last expression this also displays
# the module summary.
model.eval()

  model.load_state_dict(torch.load("best_molgraphormer.pt", map_location=device))


MolGraphormer(
  (conv1): GATConv(6, 128, heads=4)
  (conv2): GATConv(128, 128, heads=4)
  (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (att_fusion): AttentionFusion(
    (query_graph): Linear(in_features=128, out_features=128, bias=True)
    (key_fp): Linear(in_features=1028, out_features=128, bias=True)
    (value_fp): Linear(in_features=1028, out_features=128, bias=True)
  )
  (outlier_detector): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
    (3): Sigmoid()
  )
  (output_head): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

def evaluate_model(model, loader, device):
    """Run ``model`` over ``loader`` and print RMSE, MAE and R² of its predictions.

    Predictions and labels are flattened to 1-D before any metric is computed.
    Without that, column-shaped predictions (N, 1) subtracted from flat labels
    (N,) broadcast to an (N, N) matrix, which yields a wrong RMSE and a very
    large temporary array.

    Args:
        model: Module returning ``(output, outlier_score)`` for a batch.
        loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(y_true, y_pred)`` of aligned 1-D NumPy arrays.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            output, _ = model(batch)
            # Flatten both sides so they line up element by element.
            all_preds.append(output.reshape(-1).cpu().numpy())
            all_labels.append(batch.y.reshape(-1).cpu().numpy())

    if not all_preds:
        raise ValueError("loader yielded no batches; nothing to evaluate")

    y_true = np.concatenate(all_labels)
    y_pred = np.concatenate(all_preds)

    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))

    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"Test Set RMSE: {rmse:.4f}")
    print(f"Test Set MAE: {mae:.4f}")
    print(f"Test Set R² Score: {r2:.4f}")

    return y_true, y_pred


In [None]:
!pip install -U scikit-learn

