In [None]:
# Install the third-party packages this notebook imports (quiet mode).
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases; consider `%pip install -q <pkg>==<version>` for reproducible runs.
!pip install torch torch-geometric -q
!pip install rdkit -q
print("âœ… Installation complete")

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, f1_score, precision_score, recall_score
)
import time

In [None]:
# Mount Google Drive at /content/drive; BASE_PATH and SMILES_FILE below point
# into this mount, so this cell must run before any data is read or written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive: the data splits are read from <BASE_PATH>data/
# and the best checkpoint / test predictions are written directly under it.
BASE_PATH = '/content/drive/MyDrive/MLHygnn/BaseLine/GCN_Chemical/'
# CSV providing the 'DrugBank_ID' and 'SMILES' columns read in the loading cell.
SMILES_FILE = '/content/drive/MyDrive/MLHygnn/BaseLine/Drugs_with_Smiles.csv'

# Hyperparameters of the GCN baseline.
config = {
    'learning_rate': 0.005,  # learning rate handed to the Adam optimizer
    'hidden_units': 128,     # embedding width of the GCN layer and predictor hidden layer
    'dropout': 0.5,          # dropout value handed to the GCN constructor
    'num_epochs': 500,       # upper bound on training epochs
    'patience': 200,         # epochs without validation-loss improvement before stopping
    'seed': 42               # seed handed to torch.manual_seed below
}

# Seed PyTorch's RNG and pick the GPU when one is available.
torch.manual_seed(config['seed'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f" Using device: {device}")

In [None]:
# ============================================================================
# LOAD ALL DATA SPLITS (SAME AS YOUR MODEL)
# ============================================================================
print("Loading data splits...")

# One CSV of positive pairs and one of negative pairs per split. The dict keeps
# the original read order (train, test, val); the "postive" spelling is part of
# the file names referenced here.
split_paths = {
    'train_pos': f'{BASE_PATH}data/train_postive.csv',
    'train_neg': f'{BASE_PATH}data/train_negatives.csv',
    'test_pos': f'{BASE_PATH}data/test_postive.csv',
    'test_neg': f'{BASE_PATH}data/test_negatives.csv',
    'val_pos': f'{BASE_PATH}data/val_postive.csv',
    'val_neg': f'{BASE_PATH}data/val_negatives.csv',
}
train_pos, train_neg, test_pos, test_neg, val_pos, val_neg = (
    pd.read_csv(csv_path) for csv_path in split_paths.values()
)

# DrugBank ID -> SMILES string lookup.
smiles_df = pd.read_csv(SMILES_FILE)
smiles_dict = {
    drug_id: smiles
    for drug_id, smiles in zip(smiles_df['DrugBank_ID'], smiles_df['SMILES'])
}

print(f" Data loaded:")
print(f"   Train positive: {len(train_pos)}, negative: {len(train_neg)}")
print(f"   Val positive: {len(val_pos)}, negative: {len(val_neg)}")
print(f"   Test positive: {len(test_pos)}, negative: {len(test_neg)}")

In [None]:

# ============================================================================
# CREATE DRUG-TO-INDEX MAPPING AND NODE FEATURES
# ============================================================================
# Node set = every drug ID that has an entry in the SMILES table; the position
# of a drug in this list is its node index in the graph.
all_drugs = list(smiles_dict.keys())
drug_to_idx = {drug: idx for idx, drug in enumerate(all_drugs)}
num_drugs = len(all_drugs)
# NOTE(review): a drug that appears in a split CSV but not in the SMILES table
# has no index and will raise KeyError during edge construction.

print(f"Total drugs: {num_drugs}")

# FAIR COMPARISON: Use one-hot identity features like your HyGNN
# (dense num_drugs x num_drugs identity matrix, one row per node).
x = torch.eye(num_drugs, dtype=torch.float).to(device)
print(f" Node features (one-hot): {x.shape}")

# ============================================================================
# CREATE GRAPH EDGES FOR ALL SPLITS
# ============================================================================
def create_edge_index_and_labels(pos_df, neg_df, drug_mapping, directed=False):
    """Build an edge-index tensor and matching 0/1 labels from drug-pair tables.

    Args:
        pos_df: DataFrame with 'Drug1_ID' and 'Drug2_ID' columns; every row
            becomes an edge labelled 1.
        neg_df: DataFrame with the same two columns; every row becomes an
            edge labelled 0.
        drug_mapping: dict mapping a drug ID to its integer node index.
        directed: NOTE - the name reads backwards and is kept only for
            compatibility with existing callers. When True, every pair
            (a, b) is immediately followed by its reverse (b, a) with the
            same label, i.e. the edge list is symmetrised. When False, only
            the pairs as listed are emitted.

    Returns:
        (edge_index, labels): a long tensor of shape (2, E) and a float
        tensor of shape (E,), both moved to the module-level ``device``.
        Positive edges come first (in row order), then negative edges.
        With no input rows, edge_index has shape (2, 0).

    Raises:
        KeyError: if a drug ID is not a key of ``drug_mapping``, or if one of
            the two ID columns is missing.
    """
    edges = []
    labels = []

    for pairs_df, label in ((pos_df, 1), (neg_df, 0)):
        # Walk the two ID columns directly: DataFrame.iterrows() materialises
        # a Series per row, which is needlessly slow for large pair tables.
        for drug1, drug2 in zip(pairs_df['Drug1_ID'], pairs_df['Drug2_ID']):
            src = drug_mapping[drug1]
            dst = drug_mapping[drug2]
            edges.append((src, dst))
            labels.append(label)
            if directed:  # also emit the reverse edge (symmetrise)
                edges.append((dst, src))
                labels.append(label)

    # reshape(-1, 2) keeps the tensor 2-D even when there are no edges, so the
    # transpose always yields shape (2, E) instead of a 1-D empty tensor.
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().to(device)
    labels = torch.tensor(labels, dtype=torch.float).to(device)

    return edge_index, labels

print("Building graphs...")

# Training edges: directed=True makes the helper append the reverse of every
# pair, so the training edge list is symmetric (i.e. undirected).
# The returned edge list contains the negative pairs as well as the positives.
train_edge_index, train_labels = create_edge_index_and_labels(
    train_pos, train_neg, drug_to_idx, directed=True
)

# Validation edges: directed=False keeps exactly one edge per listed pair
# (no reverse edges are added).
val_edge_index, val_labels = create_edge_index_and_labels(
    val_pos, val_neg, drug_to_idx, directed=False
)

# Test edges: directed=False keeps exactly one edge per listed pair
# (no reverse edges are added).
test_edge_index, test_labels = create_edge_index_and_labels(
    test_pos, test_neg, drug_to_idx, directed=False
)

print(f"âœ… Graph construction complete:")
print(f"   Training edges: {train_edge_index.shape[1]} (undirected)")
print(f"   Validation edges: {val_edge_index.shape[1]} (directed)")
print(f"   Test edges: {test_edge_index.shape[1]} (directed)")


✅ Total drugs: 1709
✅ Node features (one-hot): torch.Size([1709, 1709])
Building graphs...
✅ Graph construction complete:
   Training edges: 614004 (undirected)
   Validation edges: 38374 (directed)
   Test edges: 38378 (directed)


In [None]:

# ============================================================================
# MODEL DEFINITION
# ============================================================================
class GCN(nn.Module):
    """Single-layer GCN encoder that produces one embedding per node.

    Args:
        input_dim: size of each input node feature vector.
        hidden_dim: size of each output node embedding.
        dropout: dropout probability applied to the embeddings while the
            module is in training mode (no effect in eval mode).
    """

    def __init__(self, input_dim, hidden_dim, dropout):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Encode nodes.

        Args:
            x: node feature matrix passed to the GCNConv layer.
            edge_index: edge-index tensor of the message-passing graph.

        Returns:
            The GCNConv output, with dropout applied when ``self.training``.
        """
        x = self.conv1(x, edge_index)
        # The configured dropout rate was previously stored but never used, so
        # the hyperparameter had no effect. F.dropout is the identity in eval
        # mode, so validation/test embeddings and saved checkpoints are
        # unaffected.
        x = F.dropout(x, p=self.dropout, training=self.training)
        return x

class LinkPredictor(nn.Module):
    """Two-layer MLP that scores an edge from its two endpoint embeddings.

    Args:
        hidden_dim: width of each node embedding; the first linear layer takes
            the concatenation of two embeddings (2 * hidden_dim inputs).
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.W1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.W2 = nn.Linear(hidden_dim, 1)

    def forward(self, z, edge_index):
        """Return one raw logit per edge.

        Args:
            z: node embedding matrix, indexed by node id along dim 0.
            edge_index: tensor whose row 0 holds source node ids and row 1
                holds destination node ids.

        Returns:
            1-D tensor with one logit per column of ``edge_index``.
        """
        src = z[edge_index[0]]
        dst = z[edge_index[1]]
        h = torch.cat([src, dst], dim=1)
        h = F.relu(self.W1(h))
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the edge axis when there is exactly one edge, returning a
        # 0-dim tensor that no longer lines up with a (1,)-shaped label tensor.
        return self.W2(h).squeeze(-1)

# Encoder: one-hot node features (num_drugs columns) -> hidden_units-wide embeddings.
model = GCN(
    input_dim=num_drugs,
    hidden_dim=config['hidden_units'],
    dropout=config['dropout'],
).to(device)

# Edge scorer operating on pairs of encoder embeddings.
predictor = LinkPredictor(hidden_dim=config['hidden_units']).to(device)

# One Adam optimizer updates the encoder and the scorer jointly.
trainable_params = [*model.parameters(), *predictor.parameters()]
optimizer = torch.optim.Adam(trainable_params, lr=config['learning_rate'])

print("âœ… Model created")


In [None]:
import psutil
import os
def calculate_ram_usage():
    """Return the resident set size (RSS) of the current process in GiB (bytes / 1024**3)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    bytes_per_gib = 1024 ** 3
    return rss_bytes / bytes_per_gib

In [None]:

# ============================================================================
# TRAINING WITH VALIDATION AND EARLY STOPPING
# ============================================================================
# Banner marking the start of the training section in the cell output.
print("\n" + "="*80)
print("TRAINING GCN WITH VALIDATION (FAIR COMPARISON)")
print("="*80)

def compute_loss(pred, labels):
    """Binary cross-entropy between raw logits ``pred`` and float 0/1 ``labels``.

    Uses the default reduction of ``F.binary_cross_entropy_with_logits``.
    """
    loss = F.binary_cross_entropy_with_logits(input=pred, target=labels)
    return loss

def compute_metrics(pred, labels):
    """Score raw logits against binary labels.

    Logits go through a sigmoid; probabilities strictly above 0.5 count as a
    positive prediction for the threshold-based metrics, while ROC-AUC and
    average precision use the probabilities themselves.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc, pr_auc)``.
    """
    y_true = labels.cpu().numpy()
    y_score = torch.sigmoid(pred).cpu().numpy()
    y_pred = (y_score > 0.5).astype(int)

    # Metrics computed on hard 0/1 predictions.
    thresholded = [
        metric(y_true, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    ]
    # Metrics computed on the predicted probabilities.
    ranked = [
        metric(y_true, y_score)
        for metric in (roc_auc_score, average_precision_score)
    ]
    return tuple(thresholded + ranked)

# Get RAM usage before training
ram_before = calculate_ram_usage()
print(f"ðŸ“Š RAM usage before training: {ram_before:.2f} GB")

# Early-stopping state. best_val_loss is kept as a plain Python float so it can
# be compared, formatted and stored without holding on to a tensor.
best_val_loss = float('inf')
patience_counter = 0
best_epoch = 0
start_time = time.time()

# Store training history (one float per epoch)
train_losses = []
val_losses = []

for epoch in range(config['num_epochs']):
    # Training phase (full-batch: every training edge is scored each step)
    model.train()
    predictor.train()

    optimizer.zero_grad()
    # NOTE(review): train_edge_index is used both as the message-passing graph
    # and as the set of supervised pairs, and it contains the negative pairs as
    # well as the positive ones - so the GCN also propagates along negative
    # pairs. The final-test cell encodes with the same graph; change both
    # together if message passing should be restricted to positive edges.
    z = model(x, train_edge_index)
    pred = predictor(z, train_edge_index)
    train_loss = compute_loss(pred, train_labels)
    train_loss.backward()
    optimizer.step()

    # Validation phase
    model.eval()
    predictor.eval()
    with torch.no_grad():
        z_val = model(x, train_edge_index)  # Use same training graph for embeddings
        pred_val = predictor(z_val, val_edge_index)
        val_loss = compute_loss(pred_val, val_labels)

    # Convert each loss to a float exactly once per epoch.
    train_loss_value = train_loss.item()
    val_loss_value = val_loss.item()
    train_losses.append(train_loss_value)
    val_losses.append(val_loss_value)

    # Early stopping check
    if val_loss_value < best_val_loss:
        best_val_loss = val_loss_value
        patience_counter = 0
        best_epoch = epoch

        # Save best model (overwritten on every improvement so that the
        # checkpoint on disk always matches best_epoch)
        torch.save({
            'model_state_dict': model.state_dict(),
            'predictor_state_dict': predictor.state_dict(),
            'epoch': epoch,
            'val_loss': val_loss_value
        }, f'{BASE_PATH}gcn_best_model.pth')
    else:
        patience_counter += 1

    if patience_counter >= config['patience']:
        print(f"Early stopping at epoch {epoch}")
        break

    # Progress line every 20 epochs. The validation metrics that used to be
    # computed here were never reported, so that wasted work has been dropped.
    if epoch % 20 == 0:
        print(f"Epoch {epoch:3d}: Train Loss = {train_loss_value:.4f}, Val Loss = {val_loss_value:.4f} , Best-val-loss: {best_val_loss:.4f}")

train_time = time.time() - start_time
# Get RAM usage after training
ram_after = calculate_ram_usage()
ram_used = ram_after - ram_before

print(f"RAM usage before training: {ram_before:.2f} GB")
print(f"RAM usage after training:  {ram_after:.2f} GB")
print(f"RAM used during training:  {ram_used:.2f} GB")
print(f"âœ… Training complete: {train_time/60:.2f} minutes")
print(f"Best epoch: {best_epoch}, Best validation loss: {best_val_loss:.4f}")



TRAINING GCN WITH VALIDATION (FAIR COMPARISON)
📊 RAM usage before training: 0.91 GB
Epoch   0: Train Loss = 0.6935, Val Loss = 0.6925 , Best-val-loss: 0.6925
Epoch  20: Train Loss = 0.4832, Val Loss = 0.5136 , Best-val-loss: 0.5114
Epoch  40: Train Loss = 0.4505, Val Loss = 0.4868 , Best-val-loss: 0.4868
Epoch  60: Train Loss = 0.4378, Val Loss = 0.4830 , Best-val-loss: 0.4825
Epoch  80: Train Loss = 0.4315, Val Loss = 0.4777 , Best-val-loss: 0.4777
Epoch 100: Train Loss = 0.4268, Val Loss = 0.4730 , Best-val-loss: 0.4730
Epoch 120: Train Loss = 0.4102, Val Loss = 0.4551 , Best-val-loss: 0.4551
Epoch 140: Train Loss = 0.3692, Val Loss = 0.4017 , Best-val-loss: 0.4017
Epoch 160: Train Loss = 0.3400, Val Loss = 0.3816 , Best-val-loss: 0.3770
Epoch 180: Train Loss = 0.3294, Val Loss = 0.3668 , Best-val-loss: 0.3668
Epoch 200: Train Loss = 0.3242, Val Loss = 0.3616 , Best-val-loss: 0.3616
Epoch 220: Train Loss = 0.3207, Val Loss = 0.3598 , Best-val-loss: 0.3572
Epoch 240: Train Loss = 

In [None]:
# ============================================================================
# FINAL TESTING WITH BEST MODEL
# ============================================================================
print("\n" + "="*80)
print("FINAL TESTING WITH BEST MODEL")
print("="*80)

# Load best model. map_location makes a checkpoint written on a GPU runtime
# loadable on a CPU-only runtime (and vice versa).
checkpoint = torch.load(f'{BASE_PATH}gcn_best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
predictor.load_state_dict(checkpoint['predictor_state_dict'])

model.eval()
predictor.eval()

with torch.no_grad():
    # Embeddings come from the training graph; only the scored pairs differ.
    z_test = model(x, train_edge_index)
    pred_test = predictor(z_test, test_edge_index)
    pred_proba = torch.sigmoid(pred_test).cpu().numpy()
    pred_labels = (pred_proba > 0.5).astype(int)
    true_labels = test_labels.cpu().numpy()

# Calculate final metrics with the same helper used during training, so the
# thresholding and metric definitions cannot drift apart.
accuracy, precision, recall, f1, roc_auc, pr_auc = compute_metrics(pred_test, test_labels)

# ============================================================================
# PRINT COMPREHENSIVE RESULTS
# ============================================================================
print("\n" + "="*80)
print("GCN - FINAL RESULTS (FAIR COMPARISON)")
print("="*80)
print(f"Best Epoch:        {best_epoch}")
print(f"Accuracy:          {accuracy:.4f}")
print(f"Precision:         {precision:.4f}")
print(f"Recall:            {recall:.4f}")
print(f"F1-Score:          {f1:.4f}")
print(f"ROC-AUC:           {roc_auc:.4f}")
print(f"PR-AUC:            {pr_auc:.4f}")
print(f"Training Time:     {train_time/60:.2f} minutes")
# `epoch` is the 0-based index of the last epoch that ran, so the count is +1.
print(f"Total Epochs:      {epoch + 1}")
print("="*80)

# ============================================================================
# COMPARISON WITH YOUR MODEL'S RESULTS
# ============================================================================
print("\n" + "="*80)
print("COMPARISON WITH YOUR HyGNN MODEL")
print("="*80)
# Reference numbers are hard-coded from a separate HyGNN run.
print("Your HyGNN Results:")
print("  Accuracy:  0.9313, Precision: 0.9196, Recall: 0.9469")
print("  F1-Score:  0.9330, ROC-AUC: 0.9842, PR-AUC: 0.9841")
# "\n" (not "\\n"): the escaped form printed a literal backslash-n.
print("\nGCN Results:")
print(f"  Accuracy:  {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}, ROC-AUC: {roc_auc:.4f}, PR-AUC: {pr_auc:.4f}")

# Save detailed results for comparison. Row order relies on the test edges
# having been built as all positive pairs followed by all negative pairs, one
# edge per pair (no reverse edges).
results_df = pd.DataFrame({
    'Drug1_ID': test_pos['Drug1_ID'].tolist() + test_neg['Drug1_ID'].tolist(),
    'Drug2_ID': test_pos['Drug2_ID'].tolist() + test_neg['Drug2_ID'].tolist(),
    'True_Label': true_labels,
    'Predicted_Label': pred_labels,
    'Prediction_Score': pred_proba
})

results_df.to_csv(f'{BASE_PATH}gcn_test_predictions.csv', index=False)
print(f"\nDetailed predictions saved to: {BASE_PATH}gcn_test_predictions.csv")


FINAL TESTING WITH BEST MODEL

GCN - FINAL RESULTS (FAIR COMPARISON)
Best Epoch:        499
Accuracy:          0.8765
Precision:         0.8785
Recall:            0.8740
F1-Score:          0.8762
ROC-AUC:           0.9499
PR-AUC:            0.9516
Training Time:     60.63 minutes
Total Epochs:      499

COMPARISON WITH YOUR HyGNN MODEL
Your HyGNN Results:
  Accuracy:  0.9313, Precision: 0.9196, Recall: 0.9469
  F1-Score:  0.9330, ROC-AUC: 0.9842, PR-AUC: 0.9841
\nGCN Results:
  Accuracy:  0.8765, Precision: 0.8785, Recall: 0.8740
  F1-Score:  0.8762, ROC-AUC: 0.9499, PR-AUC: 0.9516
\nDetailed predictions saved to: /content/drive/MyDrive/MLHygnn/BaseLine/GCN_Chemical/gcn_test_predictions.csv


In [None]:
# Colab/Notebook: System spec snapshot (no extra installs needed)
import os, sys, json, platform, shutil, subprocess, re
from datetime import datetime

def run(cmd):
    """Run ``cmd`` (an argv list) and return its output as stripped text.

    Returns stdout when it is non-empty after stripping, otherwise stderr.
    Never raises for launch failures: any exception is rendered into a
    "(error running ...)" string instead.
    """
    try:
        completed = subprocess.run(cmd, capture_output=True, text=True, check=False)
        stdout_text = completed.stdout.strip()
        if stdout_text:
            return stdout_text
        return completed.stderr.strip()
    except Exception as exc:
        command_line = ' '.join(cmd)
        return f"(error running {command_line}: {exc})"

def grep(text, pattern, default=None, flags=re.I):
    """Return the stripped first capture group of ``pattern`` found in ``text``.

    ``text`` may be None or empty (treated as ""). When the pattern does not
    match, ``default`` is returned. Matching is case-insensitive unless
    ``flags`` says otherwise.
    """
    match = re.search(pattern, text or "", flags)
    if match is None:
        return default
    return match.group(1).strip()

def parse_lscpu():
    """Run ``lscpu`` and pick out a fixed set of CPU fields.

    Returns:
        (raw_output, fields): the text returned by ``run`` and a dict with the
        keys model_name, architecture, cpus, threads_per_core,
        cores_per_socket, sockets, base_mhz, max_mhz and vendor. A field whose
        label is absent from the output is None.
    """
    raw = run(["bash","-lc","lscpu"])

    # "Label:   value" lines -> {label: value}; lines without a colon are skipped.
    parsed = {}
    for row in raw.splitlines():
        label, colon, value = row.partition(":")
        if colon:
            parsed[label.strip()] = value.strip()

    # (output key, lscpu label) in the order they appear in the summary.
    wanted = (
        ("model_name", "Model name"),
        ("architecture", "Architecture"),
        ("cpus", "CPU(s)"),
        ("threads_per_core", "Thread(s) per core"),
        ("cores_per_socket", "Core(s) per socket"),
        ("sockets", "Socket(s)"),
        ("base_mhz", "CPU MHz"),
        ("max_mhz", "CPU max MHz"),
        ("vendor", "Vendor ID"),
    )
    return raw, {out_key: parsed.get(label) for out_key, label in wanted}

def parse_free():
    """Run ``free -h`` and extract the first column after "Mem:" (total RAM).

    Returns:
        (raw_output, {"ram_total_human": <str or None>}).
    """
    raw = run(["bash","-lc","free -h"])
    ram_total = grep(raw, r"Mem:\s+(\S+)", None)
    summary = {"ram_total_human": ram_total}
    return raw, summary

def parse_df():
    """Run ``df -h /`` and read total and available space of the root filesystem.

    The second output line is split on whitespace; column 1 is taken as the
    total size and column 3 as the available space. Missing lines or columns
    yield None.

    Returns:
        (raw_output, {"disk_root_total_human": ..., "disk_root_available_human": ...}).
    """
    raw = run(["bash","-lc","df -h /"])
    rows = raw.splitlines()
    # rows[0] is the header; rows[1] is the entry for "/".
    columns = rows[1].split() if len(rows) > 1 else []
    size = columns[1] if len(columns) > 1 else None
    available = columns[3] if len(columns) > 3 else None
    return raw, {"disk_root_total_human": size, "disk_root_available_human": available}

def nvidia_smi():
    """Describe the NVIDIA GPU setup by scraping ``nvidia-smi`` output.

    Returns:
        ``{"present": False}`` when the ``nvidia-smi`` executable is not on
        PATH. Otherwise a dict with keys ``present`` (True), ``list`` (output
        of ``nvidia-smi -L``), ``driver``, ``cuda_version``, ``vram_total``
        (e.g. "15360 MiB", first GPU only, or None if not found) and
        ``raw_table`` (the full ``nvidia-smi`` output).
    """
    if not shutil.which("nvidia-smi"):
        return {"present": False}

    out = run(["bash","-lc","nvidia-smi -L"])
    full = run(["bash","-lc","nvidia-smi"])
    driver = grep(full, r"Driver Version:\s*([^\s]+)")
    cuda = grep(full, r"CUDA Version:\s*([^\s]+)")

    # Try VRAM for first GPU. The original "<n> MiB Total" pattern is kept for
    # compatibility, but the default nvidia-smi table reports memory as
    # "<used>MiB / <total>MiB", which that pattern never matched - so fall back
    # to the total on the right-hand side of the slash.
    vram = None
    m = (re.search(r"(\d+)\s*MiB\s+Total", full)
         or re.search(r"\d+\s*MiB\s*/\s*(\d+)\s*MiB", full))
    if m:
        vram = f"{m.group(1)} MiB"
    return {"present": True, "list": out, "driver": driver, "cuda_version": cuda, "vram_total": vram, "raw_table": full}

def detect_env():
    """Collect basic facts about the Python runtime and host OS.

    Returns:
        dict with keys in_colab, python, python_build, os, os_release,
        platform, machine and timestamp. ``in_colab`` is True when
        ``google.colab`` imports; otherwise it falls back to whether the
        COLAB_GPU or KAGGLE_KERNEL_RUN_TYPE environment variable is set.
        ``timestamp`` is the current UTC time as ISO-8601 with a "Z" suffix.
    """
    # Local import: the enclosing cell only imports `datetime` from the module.
    from datetime import timezone

    in_colab = False
    try:
        import google.colab  # type: ignore
        in_colab = True
    except Exception:
        in_colab = bool(os.environ.get("COLAB_GPU") or os.environ.get("KAGGLE_KERNEL_RUN_TYPE"))

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so isoformat() keeps the original shape
    # ("YYYY-MM-DDTHH:MM:SS.ffffff" + "Z") rather than ending in "+00:00Z".
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    return {
        "in_colab": in_colab,
        "python": sys.version.split()[0],
        "python_build": " ".join(platform.python_build()),
        "os": platform.system(),
        "os_release": platform.release(),
        "platform": platform.platform(),
        "machine": platform.machine(),
        "timestamp": now_utc.isoformat() + "Z"
    }

# Collect: every parser returns (raw command output, parsed fields).
lscpu_out, cpu = parse_lscpu()
free_out, ram = parse_free()
df_out, disk = parse_df()
gpu = nvidia_smi()
env = detect_env()

# The raw nvidia-smi table is printed on its own below, so it is left out of
# the JSON summary.
gpu_summary = dict(gpu)
gpu_summary.pop("raw_table", None)

summary = dict(environment=env, cpu=cpu, ram=ram, disk=disk, gpu=gpu_summary)

print("=== SUMMARY (JSON) ===")
print(json.dumps(summary, indent=2))

# Raw command outputs, each under its own heading.
raw_sections = (
    ("\n=== RAW: lscpu ===", lscpu_out),
    ("\n=== RAW: free -h ===", free_out),
    ("\n=== RAW: df -h / ===", df_out),
    ("\n=== RAW: nvidia-smi ===", gpu.get("raw_table","(no GPU / nvidia-smi not found)")),
)
for heading, raw_text in raw_sections:
    print(heading)
    print(raw_text)


=== SUMMARY (JSON) ===
{
  "environment": {
    "in_colab": true,
    "python": "3.12.12",
    "python_build": "main Oct 10 2025 08:52:57",
    "os": "Linux",
    "os_release": "6.6.105+",
    "platform": "Linux-6.6.105+-x86_64-with-glibc2.35",
    "machine": "x86_64",
    "timestamp": "2025-10-21T15:04:59.512990Z"
  },
  "cpu": {
    "model_name": "AMD EPYC 7B12",
    "architecture": "x86_64",
    "cpus": "2",
    "threads_per_core": "2",
    "cores_per_socket": "1",
    "sockets": "1",
    "base_mhz": null,
    "max_mhz": null,
    "vendor": "AuthenticAMD"
  },
  "ram": {
    "ram_total_human": "12Gi"
  },
  "disk": {
    "disk_root_total_human": "108G",
    "disk_root_available_human": "69G"
  },
  "gpu": {
    "present": false
  }
}

=== RAW: lscpu ===
Architecture:                            x86_64
CPU op-mode(s):                          32-bit, 64-bit
Address sizes:                           48 bits physical, 48 bits virtual
Byte Order:                              Little Endian

  "timestamp": datetime.utcnow().isoformat() + "Z"
