<a href="https://colab.research.google.com/github/Luly7/RT/blob/main/GNN_final_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install correct version of PyG (PyTorch Geometric) step-by-step
!pip install torch torchvision torchaudio  # (If you haven't yet)

# Install torch-scatter and torch-sparse first (required dependencies)
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html

# Now install torch-geometric
!pip install torch-geometric


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_scatter-2.1.2%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.0/494.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt20cpu
Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_sparse-0.6.18%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt20cpu
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# Full fix
!pip install torch torchvision torchaudio
!pip install torch-scatter torch-sparse torch-geometric -f https://data.pyg.org/whl/torch-2.0.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html


In [2]:
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool




In [3]:
# 1. Install / Fix dependencies
!pip install numpy==1.23.5
!pip install torch torchvision torchaudio
!pip install torch-scatter torch-sparse torch-geometric
!pip install rdkit-pypi tqdm scikit-learn

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import mean_absolute_error, r2_score
from rdkit import Chem
from tqdm import tqdm
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

# 2. Load your SMILES dataset
csv_path = '/content/drive/MyDrive/CS6480/output_rt_fixed_smiles_unique.csv'
data = pd.read_csv(csv_path)

assert 'fixed_smiles' in data.columns
assert 'rt' in data.columns

smiles_list = data['fixed_smiles'].tolist()
retention_times = data['rt'].tolist()

# 3. Convert SMILES into Graph
def smiles_to_graph(smiles, rt):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    x = torch.tensor([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.float).view(-1, 1)
    edges = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in mol.GetBonds()]
    edges += [(j, i) for (i, j) in edges]  # undirected
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    y = torch.tensor([rt], dtype=torch.float)
    return Data(x=x, edge_index=edge_index, y=y)

dataset = [smiles_to_graph(smi, rt) for smi, rt in tqdm(zip(smiles_list, retention_times), total=len(smiles_list)) if smiles_to_graph(smi, rt) is not None]

# 4. Split into train/test
train_size = int(0.8 * len(dataset))
train_dataset = dataset[:train_size]
test_dataset = dataset[train_size:]

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 5. Define Better GNN
class BetterGNN(nn.Module):
    def __init__(self):
        super(BetterGNN, self).__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 32)
        self.conv3 = GCNConv(32, 32)
        self.fc1 = nn.Linear(32, 16)
        self.fc2 = nn.Linear(16, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))
        x = global_mean_pool(x, batch)
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x).view(-1)

# 6. Setup training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BetterGNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_fn = nn.MSELoss()

# 7. Training loop
n_epochs = 50
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = loss_fn(out, data.y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch}, Train Loss: {avg_loss:.4f}')

# 8. Final Evaluation
model.eval()
preds, trues = [], []
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        preds.append(model(data).cpu().numpy())
        trues.append(data.y.view(-1).cpu().numpy())

preds = np.concatenate(preds)
trues = np.concatenate(trues)

# 9. Calculate RMSE, MAE, R²
rmse = np.sqrt(np.mean((preds - trues) ** 2))
mae = mean_absolute_error(trues, preds)
r2 = r2_score(trues, preds)

print(f"\n✅ Test RMSE: {rmse:.4f} seconds")
print(f"✅ Test MAE: {mae:.4f} seconds")
print(f"✅ Test R² Score: {r2:.4f}")

# 10. Save predictions to CSV
results_df = pd.DataFrame({
    'True_RetentionTime': trues,
    'Predicted_RetentionTime': preds
})
results_csv_path = '/content/drive/MyDrive/CS6480/GNN_final/predictions.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"✅ Results saved to: {results_csv_path}")




100%|██████████| 79938/79938 [01:10<00:00, 1135.27it/s]


Epoch 1, Train Loss: 131588.3013
Epoch 2, Train Loss: 67246.4246
Epoch 3, Train Loss: 66873.7124
Epoch 4, Train Loss: 66729.8201
Epoch 5, Train Loss: 66732.0777
Epoch 6, Train Loss: 66478.2426
Epoch 7, Train Loss: 66894.2624
Epoch 8, Train Loss: 66620.1245
Epoch 9, Train Loss: 66535.5601
Epoch 10, Train Loss: 66416.0430
Epoch 11, Train Loss: 66267.5904
Epoch 12, Train Loss: 66549.6707
Epoch 13, Train Loss: 66995.2303
Epoch 14, Train Loss: 66531.6238
Epoch 15, Train Loss: 66251.5559
Epoch 16, Train Loss: 66379.2739
Epoch 17, Train Loss: 66590.5506
Epoch 18, Train Loss: 66282.7896
Epoch 19, Train Loss: 66118.0736
Epoch 20, Train Loss: 66647.5482
Epoch 21, Train Loss: 65835.2086
Epoch 22, Train Loss: 65680.3213
Epoch 23, Train Loss: 66474.3029
Epoch 24, Train Loss: 65795.5825
Epoch 25, Train Loss: 66294.9482
Epoch 26, Train Loss: 66271.4397
Epoch 27, Train Loss: 66292.7539
Epoch 28, Train Loss: 65987.2856
Epoch 29, Train Loss: 65846.9943
Epoch 30, Train Loss: 65418.4637
Epoch 31, Train Lo

correct error and improve RMSE