<a href="https://colab.research.google.com/github/Michaelmvh/bozdag-research-GNN/blob/main/Baseline_Graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [12]:
# pip installs
!pip install torch-geometric



In [13]:
import pandas as pd
import networkx as nx
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn.models import GCN
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import from_networkx
from sklearn.metrics import roc_auc_score

# Setup

## Constants

In [14]:
graph_location = "/content/drive/MyDrive/Colab Notebooks/Bozdag Research/Data/ChChSe-Decagon_polypharmacy.csv"

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"
print(device)

cpu


# Load Dataset

In [16]:
def load_graph_from_edge_list(file_path:str, ignore_edge_type: bool = True):
  df = pd.read_csv(file_path)
  return nx.from_pandas_edgelist(df=df, source='# STITCH 1', target='STITCH 2', edge_attr= None if ignore_edge_type else True)

nx_data = load_graph_from_edge_list(graph_location)
print(nx_data)

Graph with 645 nodes and 63473 edges


In [17]:
# This will print 2x as many edges as networkx representation because of how
# (un)directed edges are handled
data = from_networkx(nx_data)
print(data)

Data(edge_index=[2, 126946], num_nodes=645)


## Dataset information

In [18]:
def print_graph_info(G: nx.Graph):
  clustering_coeff = nx.average_clustering(G)

  def avg_degree(G:nx.graph):
    return nx.number_of_edges(G)/nx.number_of_nodes(G)*2

  print(f"Average Degree: {avg_degree(G)}\nClustering Coefficient: {clustering_coeff}")
  # nx.draw(nx_data)
  nx.draw(G)

  # print_graph_info(nx_data)

## Define Node Features

### One-hot encoding

In [19]:
num_nodes = nx_data.number_of_nodes()
data.x = torch.eye(num_nodes)

In [20]:
print(data)

Data(edge_index=[2, 126946], num_nodes=645, x=[645, 645])


### Train/test/val split

In [21]:
# Traian, val, test split
transform = RandomLinkSplit(
    is_undirected=True,
    add_negative_train_samples=True,
    split_labels=True)
train_data, val_data, test_data = transform(data)

In [22]:
print("Train: ",train_data)
print("Val: ",val_data)
print("Test: ",test_data)

Train:  Data(edge_index=[2, 88864], num_nodes=645, x=[645, 645], pos_edge_label=[44432], pos_edge_label_index=[2, 44432], neg_edge_label=[44432], neg_edge_label_index=[2, 44432])
Val:  Data(edge_index=[2, 88864], num_nodes=645, x=[645, 645], pos_edge_label=[6347], pos_edge_label_index=[2, 6347], neg_edge_label=[6347], neg_edge_label_index=[2, 6347])
Test:  Data(edge_index=[2, 101558], num_nodes=645, x=[645, 645], pos_edge_label=[12694], pos_edge_label_index=[2, 12694], neg_edge_label=[12694], neg_edge_label_index=[2, 12694])


# Graph Neural Network

## Define Model

In [23]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, out_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x  # Embeddings

# Model parameters
input_dim = train_data.x.size(1)
hidden_dim = input_dim * 2
output_dim = 32  # Embedding size

model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

## Define Decoder

In [24]:
def decode(z, edge_index):
    value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=1)
    return value

## Define Training

In [25]:
def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()

    # Compute node embeddings using (only) the training graph.
    z = model(data.x, data.pos_edge_label_index)

    # Positive edge predictions.
    pos_pred = torch.sigmoid(decode(z, data.pos_edge_label_index))
    # Negative edge predictions.
    neg_pred = torch.sigmoid(decode(z, data.neg_edge_label_index))

    # Use binary cross-entropy loss.
    loss = F.binary_cross_entropy(pos_pred, torch.ones_like(pos_pred)) + \
           F.binary_cross_entropy(neg_pred, torch.zeros_like(neg_pred))
    loss.backward()
    optimizer.step()
    return loss.item()

## Define Testing

In [26]:
def test(model, data, return_predictions=False):
    model.eval()
    with torch.no_grad():
        z = model(data.x, data.edge_index)  # Note: use full graph for message passing.

    pos_pred = decode(z, data.pos_edge_label_index)
    neg_pred = decode(z, data.neg_edge_label_index)


    if(return_predictions):
      predictions = torch.cat([pos_pred, neg_pred]).cpu().numpy()
      labels = torch.cat([torch.ones(pos_pred.size(0)), torch.zeros(neg_pred.size(0))]).cpu().numpy()
      return predictions, labels
    else:
      pos_score = torch.sigmoid(pos_pred)
      neg_score = torch.sigmoid(neg_pred)
      return ((pos_score > 0.5).sum() + (neg_score < 0.5).sum()) / (pos_score.size(0) + neg_score.size(0))

In [27]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score

def evaluation(predictions, labels):
  return {
    "Accuracy": accuracy_score(predictions, labels),
    "Precision": precision_score(predictions, labels),
    "Recall": recall_score(predictions, labels),
    "F1": f1_score(labels, predictions),
    "AUC": roc_auc_score(labels, predictions)
  }

In [29]:
for epoch in range(1, 101):  # Train for 100 epochs
    loss = train(model, train_data, optimizer)
    if epoch%10==0:
      val_acc = test(model, val_data)
      print(f'Epoch: {epoch}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}')

Epoch: 10, Loss: 1.3461, Val Acc: 0.5000
Epoch: 20, Loss: 1.2521, Val Acc: 0.5315
Epoch: 30, Loss: 1.1802, Val Acc: 0.6321
Epoch: 40, Loss: 1.1089, Val Acc: 0.5942
Epoch: 50, Loss: 1.0656, Val Acc: 0.5691
Epoch: 60, Loss: 1.0381, Val Acc: 0.5274
Epoch: 70, Loss: 1.0289, Val Acc: 0.5144
Epoch: 80, Loss: 1.0272, Val Acc: 0.4844
Epoch: 90, Loss: 1.0276, Val Acc: 0.4693
Epoch: 100, Loss: 1.0273, Val Acc: 0.4821


In [31]:
predictions, labels = test(model, test_data, return_predictions=True)
metrics_dict = evaluation(predictions, labels)
print(f'Test Metrics:\n {metrics_dict}')

ValueError: Classification metrics can't handle a mix of continuous and binary targets

# Archive - Example Models to compare to later

In [None]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, out_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x  # Embeddings


def decode(z, edge_index):
    """Compute scores for each edge pair using dot product"""
    return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model parameters
input_dim = train_data.x.size(1)
hidden_dim = 64
output_dim = 32  # Embedding size

model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

train_data = train_data.to(device)

def train(train_data: Tensor):
    model.train()
    optimizer.zero_grad()

    z = model(train_data.x, train_data.edge_index)  # Compute node embeddings

    # Compute scores for positive edges
    pos_score = torch.sigmoid(decode(z, train_data.pos_edge_label_index))
    pos_loss = -torch.log(pos_score + 1e-15).mean()

    # Compute scores for negative edges
    neg_score = torch.sigmoid(decode(z, train_data.neg_edge_label_index))
    neg_loss = -torch.log(1 - neg_score + 1e-15).mean()

    # Combine the losses
    loss = pos_loss + neg_loss
    loss.backward()
    optimizer.step()

    return loss.item()


In [None]:
@torch.no_grad()
def test(data):
    model.eval()

    z = model(data.x, data.edge_index)  # Get embeddings

    pos_score = torch.sigmoid(decode(z, data.pos_edge_label_index))
    neg_score = torch.sigmoid(decode(z, data.neg_edge_label_index))

    # Compute accuracy
    acc = ((pos_score > 0.5).sum() + (neg_score < 0.5).sum()) / (pos_score.size(0) + neg_score.size(0))

    return acc.item()

for epoch in range(1, 101):  # Train for 100 epochs
    loss = train(train_data)
    if epoch%10==0:
      val_acc = test(val_data)
      print(f'Epoch: {epoch}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(test_data)
print(f'Test Accuracy: {test_acc:.4f}')


Epoch: 10, Loss: 0.9779, Val Acc: 0.7149
Epoch: 20, Loss: 0.9778, Val Acc: 0.7129
Epoch: 30, Loss: 0.9796, Val Acc: 0.7146
Epoch: 40, Loss: 0.9770, Val Acc: 0.7155
Epoch: 50, Loss: 0.9769, Val Acc: 0.7147
Epoch: 60, Loss: 0.9799, Val Acc: 0.7155
Epoch: 70, Loss: 0.9817, Val Acc: 0.7158
Epoch: 80, Loss: 0.9766, Val Acc: 0.7152
Epoch: 90, Loss: 0.9802, Val Acc: 0.7158
Epoch: 100, Loss: 0.9878, Val Acc: 0.7149
Test Accuracy: 0.7168


### GAE Example

In [None]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(data.num_features, 128)
        self.conv2 = GCNConv(128, 64)

    def encode(self,data):
        x = self.conv1(data.x, data.train_pos_edge_index) # convolution 1
        x = x.relu()
        return self.conv2(x, data.train_pos_edge_index) # convolution 2

    def decode(self, z, pos_edge_index, neg_edge_index): # only pos and neg edges
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)  # dot product
        return logits

    def decode_all(self, z):
        prob_adj = z @ z.t() # get adj NxN
        return (prob_adj > 0).nonzero(as_tuple=False).t() # get predicted edge_list

In [None]:

model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [None]:
from torch_geometric.utils import negative_sampling
def get_link_labels(pos_edge_index, neg_edge_index):
    # returns a tensor:
    # [1,1,1,1,...,0,0,0,0,0,..] with the number of ones is equel to the lenght of pos_edge_index
    # and the number of zeros is equal to the length of neg_edge_index
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(E, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels


def train(data):
    model.train()

    # neg_edge_index = negative_sampling(
    #     edge_index=data.train_pos_edge_index, #positive edges
    #     num_nodes=data.num_nodes, # number of nodes
    #     num_neg_samples=data.train_pos_edge_index.size(1)) # number of neg_sample equal to number of pos_edges

    optimizer.zero_grad()

    z = model.encode() #encode
    link_logits = model.decode(z, data.train_pos_edge_index, data.edge_label) # decode

    link_labels = get_link_labels(data.train_pos_edge_index, data.edge_label)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss


@torch.no_grad()
def test(data):
    model.eval()
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        z = model.encode() # encode train
        link_logits = model.decode(z, pos_edge_index, neg_edge_index) # decode test or val
        link_probs = link_logits.sigmoid() # apply sigmoid

        link_labels = get_link_labels(pos_edge_index, neg_edge_index) # get link

        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu())) #compute roc_auc score
    return perfs


In [None]:

best_val_perf = test_perf = 0
for epoch in range(1, 101):
    train_loss = train(train_data)
    val_perf, tmp_test_perf = test(test_data)
    if val_perf > best_val_perf:
        best_val_perf = val_perf
        test_perf = tmp_test_perf
    log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    if epoch % 10 == 0:
        print(log.format(epoch, train_loss, best_val_perf, test_perf))



AttributeError: 'GlobalStorage' object has no attribute 'train_pos_edge_index'

In [None]:
def decode(z, edge_index):
    # Compute the inner product for each edge pair
    return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)

def train_link_pred(train_data: Tensor):
    model.train()
    optimizer.zero_grad()
    z = model(train_data.x, train_data.edge_index)
    # Get scores for positive and negative edges
    pos_score = decode(z, train_data.pos_edge_index)
    neg_score = decode(z, train_data.neg_edge_index)
    # Use binary cross-entropy loss, for instance
    pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-15).mean()
    neg_loss = -torch.log(1 - torch.sigmoid(neg_score) + 1e-15).mean()
    loss = pos_loss + neg_loss
    loss.backward()
    optimizer.step()
    return loss.item()


In [None]:
def test(mask):
    model.eval()  # In evaluation mode, dropout is deactivated
    _, pred = model(data.x, data.edge_index).max(dim=1)
    correct = pred[mask].eq(data.y[mask]).sum().item()
    acc = correct / mask.sum().item()
    return acc

for epoch in range(1, 201):
    loss = train_link_pred(train_data)
    train_acc = test(data.train_mask)
    val_acc = test(data.val_mask)
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')


AttributeError: 'GlobalStorage' object has no attribute 'pos_edge_index'