In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install torch-geometric optuna torchmetrics

Collecting torch-geometric
  Downloading torch_geometric-2.5.0-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading torch_geometric-2.5.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.0


In [2]:
import torch
chunk1 = torch.load('/kaggle/input/sci-data-graph/first.pt')
chunk2 = torch.load('/kaggle/input/sci-data-graph/second (1).pt')
chunk1+=chunk2
del chunk2

In [3]:
from sklearn.model_selection import train_test_split
rand_seed = 42
X_train, X_test = train_test_split(chunk1, test_size=0.1, random_state = rand_seed)
X_train, X_val = train_test_split(X_train, test_size=0.1, random_state = rand_seed)
print(len(X_train), len(X_val), len(X_val))

14580 1620 1620


In [4]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(X_train, batch_size=32, shuffle=True)
val_loader = DataLoader(X_val, batch_size=32, shuffle=False)
test_loader = DataLoader(X_test, batch_size=32, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import SAGEConv, global_mean_pool
from torch_geometric.data import DataLoader
from torchmetrics.classification import BinaryAUROC
from torchmetrics import AUROC
import optuna


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


auroc = BinaryAUROC()

class Network(nn.Module):
    def __init__(self, c_in, c_hidden, c_out, p=0.3):
        super(Network, self).__init__()
        torch.manual_seed(123)
        self.conv1 = SAGEConv(c_in, c_hidden, aggr='mean')
        self.conv2 = SAGEConv(c_hidden, 3*c_hidden, aggr='mean')
        self.conv3 = SAGEConv(3*c_hidden, 2*c_hidden, aggr='mean')
        self.conv4 = SAGEConv(2*c_hidden, c_hidden, aggr='mean')
        self.lin1 = nn.Linear(c_hidden, 4*c_out)
        self.lin2 = nn.Linear(4*c_out, c_out)
        self.p = p

    def forward(self, x, edge_index, batch, is_train):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.conv4(x, edge_index)
        x = global_mean_pool(x, batch)

        x = F.dropout(x, p=self.p, training=is_train)
        x = self.lin1(x)
        x = x.relu()
        x = F.dropout(x, p=self.p, training=is_train)
        x = self.lin2(x)

        return x

def objective(trial):
    
    c_hidden = 32
    p = trial.suggest_uniform('p', 0.1, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


    model = Network(c_in=5, c_hidden=c_hidden, c_out=2, p=p).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    num_epochs = 10  
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
#         print(epoch)

        for idx, batch in enumerate(train_loader):
            batch = batch.to(device)

            pred = model(batch.x.float(), batch.edge_index, batch.batch, True)
            target = F.one_hot(batch.y, 2).float()
            loss = criterion(pred, target)
            epoch_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        #     break
        # break

    # Evaluate on the validation set
    _, _, val_auroc = evaluate(val_loader, model, criterion)

    return val_auroc

def evaluate(loader, model, criterion):
    model.eval()
    total_loss = 0.0
    correct = 0
    total_samples = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            batch.to(device)
            pred = model(batch.x.float(), batch.edge_index, batch.batch, False)
            target = F.one_hot(batch.y, 2).float()
            loss = criterion(pred, target)
            total_loss += loss.item()

            # Calculate accuracy
            pred_labels = torch.softmax(pred, -1).argmax(dim=-1)
            correct += (pred_labels == batch.y).sum().item()
            total_samples += len(batch.y)
            all_labels.append(batch.y)
            all_preds.append(pred_labels)

    pred = all_preds[0]
    label = all_labels[0]

    for p, l in zip(all_preds[1:], all_labels[1:]):
        pred = torch.cat([pred, p])
        label = torch.cat([label, l])
#     print(pred, label)
    return total_loss / len(loader), correct / total_samples, auroc(pred.cpu(), label.cpu())

# Optimizing hyperparameters with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=80)  


print('Best trial:')
trial = study.best_trial
print('Value: {}'.format(trial.value))
print('Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


best_p = trial.params['p']
best_learning_rate = trial.params['learning_rate']

final_model = Network(c_in=5, c_hidden=64, c_out=2, p=best_p).to(device)
final_optimizer = optim.Adam(final_model.parameters(), lr=best_learning_rate)
final_criterion = nn.BCEWithLogitsLoss()

num_epochs = 100 

for epoch in range(num_epochs):
    final_model.train()
    epoch_loss = 0

    for idx, batch in enumerate(train_loader):
        batch = batch.to(device)

        pred = final_model(batch.x.float(), batch.edge_index, batch.batch, True)
        target = F.one_hot(batch.y, 2).float()
        loss = final_criterion(pred, target)
        epoch_loss += loss.item()

        final_optimizer.zero_grad()
        loss.backward()
        final_optimizer.step()

# Evaluate on the test set
test_loss, test_accuracy, test_auroc = evaluate(test_loader, final_model, final_criterion)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test AUROC: {test_auroc:.4f}')


[I 2024-03-09 20:36:02,213] A new study created in memory with name: no-name-76ab11f1-ae36-4f1a-a547-6a739eb89974
  p = trial.suggest_uniform('p', 0.1, 0.5)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
[I 2024-03-09 20:37:33,189] Trial 0 finished with value: 0.6842309832572937 and parameters: {'p': 0.1982082705778666, 'learning_rate': 4.102608508754551e-05}. Best is trial 0 with value: 0.6842309832572937.
[I 2024-03-09 20:39:04,119] Trial 1 finished with value: 0.7005489468574524 and parameters: {'p': 0.33457499616056496, 'learning_rate': 9.583443909648773e-05}. Best is trial 1 with value: 0.7005489468574524.
[I 2024-03-09 20:40:36,316] Trial 2 finished with value: 0.7230759859085083 and parameters: {'p': 0.1281157538412595, 'learning_rate': 0.0003808361298915004}. Best is trial 2 with value: 0.7230759859085083.
[I 2024-03-09 20:42:07,756] Trial 3 finished with value: 0.6540194153785706 and parameters: {'p': 0.12884084067762905, 'learning_rate': 1.09199620518