<a href="https://colab.research.google.com/github/John1495/RNA-3D/blob/main/mODEL_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
!pip install torch torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.1.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.1.0+cpu.html


In [None]:
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv
from tqdm import tqdm
import os

# === Load Data ===
# Sequences: one row per target (columns used here: target_id, sequence).
seq_df = pd.read_csv('/kaggle/cleaned_train_sequences2 (1).csv')

# Labels: one row per residue. Keep only rows whose residue name contains one
# of A/U/G/C, then derive the target id by stripping the trailing "_<number>"
# from the row ID.
label_df = (
    pd.read_csv('/kaggle/train_labels1.csv')
    .assign(resname=lambda frame: frame['resname'].str.extract(r'([AUGC])'))
    .dropna(subset=['resname'])
    .assign(target_id=lambda frame: frame['ID'].str.extract(r'(.+)_\d+'))
)

# Attach the full sequence string to every residue row.
merged = label_df.merge(seq_df[['target_id', 'sequence']], on='target_id', how='left')

# Drop targets that have 10 or fewer labelled residues.
complete_ids = merged.groupby('target_id')['resid'].count()
valid_ids = complete_ids.loc[complete_ids.gt(10)].index
merged = merged.loc[merged['target_id'].isin(valid_ids)]

# 90/10 split at the target level so no structure appears on both sides.
all_ids = merged['target_id'].unique()
train_ids, val_ids = train_test_split(all_ids, test_size=0.1, random_state=42)

# Nucleotide letter -> one-hot column index.
residue_mapping = dict(zip('AUGC', range(4)))

# === Graph Construction ===
def create_graph_from_group(df_group, scaler=None, fit_scaler=False):
    """Build a chain graph for one target from its per-residue label rows.

    Rows are ordered by ``resid``. Every residue becomes a node carrying a
    4-wide one-hot nucleotide feature, and each pair of consecutive residues
    is connected by an edge in both directions.

    Args:
        df_group: DataFrame holding the rows of a single target. The columns
            ``resid``, ``resname``, ``x_1``, ``y_1`` and ``z_1`` are read.
        scaler: Optional object exposing ``fit_transform`` / ``transform``
            (e.g. a ``StandardScaler``). When ``None`` the raw coordinates
            are used.
        fit_scaler: If true, ``scaler.fit_transform`` is called on this
            group's coordinates only; otherwise ``scaler.transform`` is used.

    Returns:
        A ``Data`` object with ``x`` (one-hot features), ``edge_index`` of
        shape ``(2, 2 * (n - 1))``, ``pos`` (float coordinates) and ``y``
        (an independent copy of ``pos`` used as the regression target).

    Raises:
        KeyError: if a ``resname`` value is not a key of ``residue_mapping``.
    """
    df_group = df_group.sort_values('resid')
    coords = df_group[['x_1', 'y_1', 'z_1']].values
    if scaler is not None:
        if fit_scaler:
            coords = scaler.fit_transform(coords)
        else:
            coords = scaler.transform(coords)

    residue_idx = [residue_mapping[r] for r in df_group['resname']]
    node_features = torch.eye(4)[residue_idx]
    pos = torch.tensor(coords, dtype=torch.float)
    y = pos.clone()

    # Forward edges (i -> i+1) first, then the reverse edges (i+1 -> i).
    # Building the two rows directly keeps the shape at (2, 0) when the group
    # has fewer than two residues; transposing an empty edge list would give a
    # 1-D tensor of shape (0,) instead.
    n = len(df_group)
    src = list(range(n - 1))
    dst = list(range(1, n))
    edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)
    return Data(x=node_features, edge_index=edge_index, pos=pos, y=y)

def prepare_data(train_ids, val_ids, merged):
    """Build the training and validation graph lists with one shared scaling.

    The coordinate scaler is fitted exactly once, on the coordinates of all
    training targets together, and then applied unchanged to every training
    and validation graph. (Refitting it on each training graph would leave the
    validation graphs scaled with the statistics of whichever training target
    happened to be processed last.)

    Args:
        train_ids: Iterable of ``target_id`` values used for training.
        val_ids: Iterable of ``target_id`` values used for validation.
        merged: Per-residue DataFrame containing ``target_id`` plus the
            columns read by ``create_graph_from_group``.

    Returns:
        ``(train_graphs, val_graphs)`` — two lists of graphs, in the order of
        ``train_ids`` and ``val_ids`` respectively.

    Raises:
        ValueError: if no row of ``merged`` belongs to ``train_ids``.
        KeyError: if an id is not present in ``merged['target_id']``.
    """
    coord_cols = ['x_1', 'y_1', 'z_1']
    train_rows = merged[merged['target_id'].isin(train_ids)]
    if train_rows.empty:
        raise ValueError(
            "prepare_data: no rows in `merged` match `train_ids`; "
            "cannot fit the coordinate scaler"
        )

    # Fit on raw numpy values, matching what create_graph_from_group passes to
    # scaler.transform.
    scaler = StandardScaler()
    scaler.fit(train_rows[coord_cols].values)

    # Group once instead of scanning the whole frame for every target id.
    by_target = merged.groupby('target_id', sort=False)

    train_graphs = [
        create_graph_from_group(by_target.get_group(tid), scaler, fit_scaler=False)
        for tid in train_ids
    ]
    val_graphs = [
        create_graph_from_group(by_target.get_group(tid), scaler, fit_scaler=False)
        for tid in val_ids
    ]
    return train_graphs, val_graphs

# === GAT Model ===
class GAT(nn.Module):
    """Two-layer graph-attention network producing ``output_dim`` values per node.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Per-head width of the first attention layer.
        output_dim: Number of values predicted for each node.
        num_heads: Number of attention heads in the first layer.
        dropout: Dropout rate handed to both ``GATConv`` layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_heads, dropout):
        super().__init__()
        # The second layer consumes hidden_dim features from each of the
        # num_heads heads of the first layer.
        first_layer_width = hidden_dim * num_heads
        self.conv1 = GATConv(input_dim, hidden_dim, heads=num_heads, dropout=dropout)
        self.conv2 = GATConv(first_layer_width, output_dim, dropout=dropout)

    def forward(self, data):
        """Run both attention layers on ``data.x`` / ``data.edge_index``.

        Returns the raw output of the second layer (no final activation).
        """
        edges = data.edge_index
        hidden = self.conv1(data.x, edges)
        hidden = F.elu(hidden)
        return self.conv2(hidden, edges)

# === Evaluation Metrics ===
def evaluate(model, val_loader, device):
    """Score ``model`` on every batch of ``val_loader``.

    The model is put in eval mode and run without gradient tracking. The
    predictions and targets (``batch.y``) of all batches are concatenated
    along the node axis before the metrics are computed.

    Args:
        model: Callable module; ``model(batch)`` must return one row of
            predicted coordinates per node.
        val_loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(rmse, mae, rmsd, tm_score)`` of Python floats:
          * ``rmse``  — root of the mean squared error over all coordinate
            components.
          * ``mae``   — mean absolute error over all coordinate components.
          * ``rmsd``  — root of the mean squared Euclidean distance per node
            (averaged over nodes, not over individual components).
          * ``tm_score`` — mean of ``exp(-d_i / (0.5 * N))`` where ``d_i`` is
            the per-node distance and ``N`` the total number of nodes.

    Raises:
        ValueError: if ``val_loader`` yields no batches, or if the errors
            contain NaN/inf values.
    """
    model.eval()
    predictions, true = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            pred_coords = model(batch)
            predictions.append(pred_coords.cpu().numpy())
            true.append(batch.y.cpu().numpy())

    if not predictions:
        raise ValueError("evaluate: val_loader yielded no batches")

    # Accumulate in float64 so the metrics do not inherit float32 rounding.
    predictions = np.concatenate(predictions, axis=0).astype(np.float64)
    true = np.concatenate(true, axis=0).astype(np.float64)
    diff = true - predictions
    if not np.isfinite(diff).all():
        raise ValueError("evaluate: predictions or targets contain NaN/inf")

    # Component-wise errors: every coordinate value counts as one sample.
    rmse = float(np.sqrt(np.mean(diff ** 2)))
    mae = float(np.mean(np.abs(diff)))

    # Node-wise deviation: squared distances are averaged over nodes, so this
    # differs from rmse by the square root of the coordinate dimensionality.
    per_node_dist = np.linalg.norm(diff, axis=1)
    rmsd = float(np.sqrt(np.mean(per_node_dist ** 2)))

    # NOTE(review): this is an ad-hoc similarity proxy, not the standard
    # TM-score (no superposition, no length-dependent d0, and N spans every
    # structure in the loader) — confirm whether a real TM-score is wanted.
    tm_score = float(np.mean(np.exp(-per_node_dist / (0.5 * true.shape[0]))))
    return rmse, mae, rmsd, tm_score

# === Optuna Objective ===
def objective(trial):
    """Train one GAT configuration and return its validation RMSE.

    Samples ``hidden_dim``, ``dropout``, ``num_heads`` and ``lr`` from the
    trial, trains a fresh ``GAT`` with Adam and an MSE loss on the training
    graphs, and scores it with ``evaluate`` on the validation graphs.

    The global torch RNG is seeded from the trial number before any data
    loader or model is created, so weight initialisation, batch shuffling and
    dropout of a given trial number are drawn from the same seed on every run.

    Args:
        trial: Optuna trial supplying the hyperparameter suggestions.

    Returns:
        The validation RMSE as returned by ``evaluate``. MAE, ``tm_score`` and
        RMSD are stored on the trial as user attributes ``"mae"``,
        ``"tm_score"`` and ``"rmsd"``.
    """
    n_epochs = 50
    base_seed = 42  # same value as the train/validation split's random_state

    hidden_dim = trial.suggest_categorical('hidden_dim', [16, 32, 64, 128])
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    num_heads = trial.suggest_categorical('num_heads', [2, 4, 8])
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)

    # Seed before building loaders/model: shuffling, parameter init and
    # dropout all draw from torch's global generator.
    torch.manual_seed(base_seed + trial.number)

    train_graphs, val_graphs = prepare_data(train_ids, val_ids, merged)
    train_loader = DataLoader(train_graphs, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_graphs, batch_size=1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GAT(input_dim=4, hidden_dim=hidden_dim, output_dim=3, num_heads=num_heads, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = nn.MSELoss()

    # Training
    model.train()
    for epoch in range(n_epochs):
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            pred = model(batch)
            loss = loss_fn(pred, batch.y)
            loss.backward()
            optimizer.step()

    rmse, mae, rmsd, tm_score = evaluate(model, val_loader, device)
    trial.set_user_attr("mae", mae)
    trial.set_user_attr("tm_score", tm_score)
    trial.set_user_attr("rmsd", rmsd)
    return rmse

# === Run Study ===
# Minimise the value returned by `objective` (validation RMSE) over 30 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

# === Report Best Trial and Save Its Params ===
# Only the hyperparameters are written to disk; no model weights are saved.
print("Best trial:")
best_trial = study.best_trial
summary_rows = [
    ("  RMSE:", study.best_value),
    ("  Params:", study.best_params),
    ("  MAE:", best_trial.user_attrs['mae']),
    ("  TM-Score:", best_trial.user_attrs['tm_score']),
    ("  RMSD:", best_trial.user_attrs['rmsd']),
]
for label, value in summary_rows:
    print(label, value)

import json

with open("best_gat_params.json", "w") as params_file:
    json.dump(study.best_params, params_file, indent=2)


[I 2025-05-02 18:25:32,715] A new study created in memory with name: no-name-c58a3cf7-11d0-4efa-b46a-a72925bf1dc9
[I 2025-05-02 18:30:46,404] Trial 0 finished with value: 27.467933408297323 and parameters: {'hidden_dim': 64, 'dropout': 0.49809328638599326, 'num_heads': 4, 'lr': 0.0005687892357420013}. Best is trial 0 with value: 27.467933408297323.
[I 2025-05-02 18:37:20,576] Trial 1 finished with value: 27.45724770730228 and parameters: {'hidden_dim': 128, 'dropout': 0.17435068678201587, 'num_heads': 4, 'lr': 0.0012286672771223048}. Best is trial 1 with value: 27.45724770730228.
[I 2025-05-02 18:41:11,119] Trial 2 finished with value: 27.46503125602684 and parameters: {'hidden_dim': 16, 'dropout': 0.2000032847691481, 'num_heads': 2, 'lr': 0.0002746661122718814}. Best is trial 1 with value: 27.45724770730228.
