In [1]:
########## INSTALL TORCH GEOMETRIC ##################
# https://pytorch-geometric.readthedocs.io/en/latest/
#####################################################
import torch


def format_pytorch_version(version):
    return version.split("+")[0]


TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)


def format_cuda_version(version):
    return "cu" + version.replace(".", "")


CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_scatter-2.1.2%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt21cu121
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_sparse-0.6.18%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt21cu121
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-cluster
 

In [16]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv, SAGEConv
from torch_geometric.loader import NeighborLoader, LinkNeighborLoader, LinkLoader, NodeLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
node_information = pd.read_csv('node_information.csv', header=None, index_col=0)
train_set = pd.read_csv('train.txt', header=None, sep=' ')
train_set.columns = ['node1', 'node2', 'edge']
test_set = pd.read_csv('test.txt', header=None, sep=' ')
test_set.columns = ['node1', 'node2']

mapping = {node_information.index[i]: i for i in range(len(node_information))}
node_information.index = node_information.index.map(mapping)
train_set["node1"] = train_set["node1"].map(mapping)
train_set["node2"] = train_set["node2"].map(mapping)
test_set["node1"] = test_set["node1"].map(mapping)
test_set["node2"] = test_set["node2"].map(mapping)

In [None]:
train_set["edge"].value_counts()

1    5248
0    5248
Name: edge, dtype: int64

In [23]:
def split_dataset(N, train_ratio, seed=4):
    """ Creates train/val/test masks

    Args:
        N (int): dataset size
        train_ratio (float): proportion of the training set
        seed (int, optional): Fixes random. Defaults to 4

    Return:
        [tensors]: returns boolean tensors for train/val/test set
        True indicates that a node belong to this set, False otherwise
    """

    train_size = int(train_ratio * N)
    val_size = N - train_size

    # split dataset
    subsets = torch.utils.data.random_split(range(N), lengths = [train_size, val_size, 0], generator=torch.Generator().manual_seed(seed))
    train_inds, val_inds, test_inds = [torch.Tensor(subset.indices) for subset in subsets]

    # create tensors of masks for each subset
    dataset_inds = torch.arange(N)
    train_mask = torch.isin(dataset_inds, train_inds)
    val_mask = torch.isin(dataset_inds, val_inds)
    test_mask = torch.isin(dataset_inds, test_inds)

    return train_mask, val_mask

train_mask, val_mask = split_dataset(train_set.shape[0], train_ratio=0.8)

In [33]:
degree_per_nodes = np.array([train_set[pd.Series(train_mask)].loc[(train_set["node1"] == i) | (train_set["node2"] == i), "edge"].sum() for i in range(len(node_information))])
degree_per_nodes_standardized = StandardScaler().fit_transform(degree_per_nodes.reshape(-1,1))

node_information[933] = degree_per_nodes_standardized

In [187]:
class GATModel(nn.Module):
  def __init__(self, input_size, hidden_size, hidden_size_bis, output_size_embed):
    super().__init__()
    # self.conv1 = GATv2Conv(input_size, hidden_size, heads=6)
    # self.conv2 = GATv2Conv(6 * hidden_size, hidden_size, heads=4)
    # self.conv3 = GATv2Conv(4 * hidden_size, hidden_size_bis, heads=4)
    # self.conv4 = GATv2Conv(4 * hidden_size_bis, output_size_embed, heads=6, concat=False)

    self.conv1 = SAGEConv(input_size, hidden_size, aggr="max")
    self.conv2 = SAGEConv(hidden_size, hidden_size_bis, aggr="mean")
    self.conv3 = SAGEConv(hidden_size_bis, hidden_size_bis//2, aggr="max")
    self.conv4 = SAGEConv(hidden_size_bis//2, hidden_size_bis//2, aggr="mean")
    self.conv5 = SAGEConv(hidden_size_bis//2, output_size_embed, aggr="max")

    self.lin1 = nn.Linear(2 * output_size_embed, output_size_embed)
    self.lin2 = nn.Linear(output_size_embed, output_size_embed)
    self.lin3 = nn.Linear(output_size_embed, output_size_embed//2)
    self.lin4 = nn.Linear(output_size_embed//2, 2)

  def forward(self, x, edge_index):
    x = self.conv1(x, edge_index)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.conv2(x, edge_index)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.conv3(x, edge_index)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.conv4(x, edge_index)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.conv5(x, edge_index)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)

    row, col = edge_index
    x = torch.cat([x[row], x[col]], dim=1)
    x = self.lin1(x)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.lin2(x)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.lin3(x)
    x = nn.functional.elu(x)
    x = nn.functional.dropout(x, p=0.5)
    x = self.lin4(x)

    return x

In [36]:
def evaluate(model, loss_fcn, device, dataloader):
    score_list_batch = []

    model.eval()
    for i, batch in enumerate(dataloader):
        batch = batch.to(device)
        output = model(batch.x, batch.edge_index)
        loss_test = loss_fcn(output, batch.y.float())
        predict = np.where(output.detach().cpu().numpy()[:, 1] >= 0, 1, 0)
        score = accuracy_score(batch.y.cpu().numpy()[:, 1], predict)
        score_list_batch.append(score)

    return np.array(score_list_batch).mean()

In [69]:
def train(model, loss_fcn, device, optimizer, max_epochs, train_dataloader, val_dataloader):

    epoch_list = []
    scores_list = []

    # loop over epochs
    for epoch in range(max_epochs):
        model.train()
        losses = []
        # loop over batches
        for i, train_batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            train_batch_device = train_batch.to(device)
            # logits is the output of the model
            output = model(train_batch_device.x, train_batch_device.edge_index)
            # compute the loss
            loss = loss_fcn(output, train_batch_device.y.float())
            # optimizer step
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        loss_data = np.array(losses).mean()
        print("Epoch {:05d} | Loss: {:.4f}".format(epoch + 1, loss_data))

        if epoch % 5 == 0:
            # evaluate the model on the validation set
            # computes the f1-score (see next function)
            score = evaluate(model, loss_fcn, device, val_dataloader)
            print("Accuracy Score: {:.4f}".format(score))
            scores_list.append(score)
            epoch_list.append(epoch)

    return epoch_list, scores_list

In [79]:
x = torch.tensor(node_information.values, dtype=torch.float)

y_train = torch.tensor(train_set['edge'].values[train_mask], dtype=torch.long)
y_train = torch.tensor([[1 if y_train[i]==0 else 0, 1 if y_train[i]==1 else 0] for i in range(len(y_train))])
edge_index_train = torch.tensor(train_set[['node1', 'node2']].values.T[torch.vstack([train_mask, train_mask])].reshape(2, -1), dtype=torch.long)

y_val = torch.tensor(train_set['edge'].values[val_mask], dtype=torch.long)
y_val = torch.tensor([[1 if y_val[i]==0 else 0, 1 if y_val[i]==1 else 0] for i in range(len(y_val))])
edge_index_val = torch.tensor(train_set[['node1', 'node2']].values.T[torch.vstack([val_mask, val_mask])].reshape(2, -1), dtype=torch.long)

edge_index_test = torch.tensor(test_set[['node1', 'node2']].values.T, dtype=torch.long)

data_train = Data(x=x, edge_index=edge_index_train, y=y_train)
data_val = Data(x=x, edge_index=edge_index_val, y=y_val)


batch_size = 32
train_data_loader = NeighborLoader(data_train, num_neighbors=[-1], batch_size=batch_size, shuffle=True)
val_data_loader = NeighborLoader(data_val, num_neighbors=[-1], batch_size=batch_size, shuffle=True)



In [188]:
### DEVICE GPU OR CPU : will select GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("\nDevice: ", device)

# Create the GAT model
model = GATModel(input_size=x.shape[1], hidden_size=500, hidden_size_bis=400, output_size_embed=300).to(device)

# Define the loss function and optimizer
loss_fcn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.005)

max_epochs=200


Device:  cuda


In [None]:
# Train with mini-batches
epoch_list, model_scores = train(
    model,
    loss_fcn,
    device,
    optimizer,
    max_epochs,
    train_data_loader,
    val_data_loader,
)

In [67]:
# Evaluation on val set batch

score_list_batch = []
predict_list_batch = []
y_list_batch = []

model.eval()
for i, batch in enumerate(val_data_loader):
    batch = batch.to(device)
    output = model(batch.x, batch.edge_index)
    predict = np.where(output.detach().cpu().numpy() >= 0, 1, 0)
    score = accuracy_score(batch.y.cpu().numpy()[:, 1], predict[:, 1])

    score_list_batch.append(score)
    predict_list_batch.append(predict)
    y_list_batch.append(batch.y.cpu().numpy()[:, 1].mean())

In [68]:
np.array((score_list_batch)).mean()

0.635291542545112

In [191]:
# Train without mini-batches
data_train_device = data_train.to(device)
data_val_device = data_val.to(device)

for epoch in range(max_epochs):
    model.train()
    optimizer.zero_grad()
    out = model(data_train_device.x, data_train_device.edge_index)
    loss = loss_fcn(out, data_train_device.y.float())
    loss.backward()
    optimizer.step()

    print("Epoch {:05d} | Loss: {:.4f}".format(epoch + 1, loss.item()))


    model.eval()
    output = model(data_val_device.x, data_val_device.edge_index)
    loss_test = loss_fcn(output, data_val_device.y.float())
    predict = np.where(output.detach().cpu().numpy()[:, 1] >= 0.5, 1, 0)
    score = accuracy_score(data_val_device.y.cpu().numpy()[:, 1], predict)
    print("Accuracy core: {:.4f}".format(score))

Epoch 00001 | Loss: 0.4077
Accuracy core: 0.6567
Epoch 00002 | Loss: 0.4155
Accuracy core: 0.6619
Epoch 00003 | Loss: 0.4213
Accuracy core: 0.6614
Epoch 00004 | Loss: 0.4268
Accuracy core: 0.6605
Epoch 00005 | Loss: 0.4148
Accuracy core: 0.6629
Epoch 00006 | Loss: 0.4242
Accuracy core: 0.6643
Epoch 00007 | Loss: 0.4163
Accuracy core: 0.6643
Epoch 00008 | Loss: 0.4213
Accuracy core: 0.6705
Epoch 00009 | Loss: 0.4162
Accuracy core: 0.6605
Epoch 00010 | Loss: 0.4096
Accuracy core: 0.6567
Epoch 00011 | Loss: 0.4176
Accuracy core: 0.6562
Epoch 00012 | Loss: 0.4082
Accuracy core: 0.6600
Epoch 00013 | Loss: 0.4162
Accuracy core: 0.6657
Epoch 00014 | Loss: 0.4126
Accuracy core: 0.6610
Epoch 00015 | Loss: 0.4112
Accuracy core: 0.6638
Epoch 00016 | Loss: 0.4126
Accuracy core: 0.6710
Epoch 00017 | Loss: 0.4081
Accuracy core: 0.6643
Epoch 00018 | Loss: 0.4154
Accuracy core: 0.6605
Epoch 00019 | Loss: 0.4054
Accuracy core: 0.6671
Epoch 00020 | Loss: 0.4123
Accuracy core: 0.6652
Epoch 00021 | Loss: 

In [192]:
model.eval()
output = model(x.to(device), edge_index_test.to(device))
pred_test = np.where(output.detach().cpu().numpy() >= 0, 1, 0)[:, 1]

In [195]:
pred_test

array([0, 0, 0, ..., 1, 0, 0])

In [197]:
preds = zip(np.array(range(len(test_set))), pred_test)

In [199]:
import os
import csv

os.makedirs("models", exist_ok=True)
i = len(os.listdir("models")) +1
model_path = f"models/model_{i}.csv"

with open(model_path,"w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(i for i in ["ID", "Predicted"])
    for row in preds:
         csv_out.writerow(row)
    pred.close()
