In [1]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

!pip install torchviz

2.0.1+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
Collecting torchviz
  Downloading torchviz-0.0.2.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... [?25l[?25hdone
  Created wheel for torchviz: filename=torchviz-0.0.2-py3-none-any.whl size=4130 sha256=210003f7b99e11ff13d1bbb08af739bafeeba2c13cb7d13f0578e04693d1c9c1
  Stored in directory: /root/.cache/pip/wheels/4c/97/88/a02973217949e0db0c9f4346d154085f4725f99c4f15a87094
Success

In [2]:
def hash_data(subgraph):
    x = str(subgraph.x.cpu().numpy().tolist())
    y = str(subgraph.y.cpu().numpy().tolist())
    edge_index = str(subgraph.edge_index.cpu().numpy().tolist())
    edge_types = str(subgraph.edge_types.cpu().numpy().tolist())
    return x + '\n' + y + '\n' + edge_index + '\n' + edge_types

In [3]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Dataset, download_url
import pickle
from torch_geometric.loader import DataLoader
'''This class needs to be declared to load in our own created datasets'''
class MyOwnDataset(Dataset):
    def __init__(self, data, transform=None, pre_transform=None, num_relations=None):
        super(MyOwnDataset, self).__init__('', transform, pre_transform)
        self.data_list = data
        self.num_relations = num_relations

    @property
    def raw_file_names(self):
        return []  # This dataset has no raw files

    @property
    def processed_file_names(self):
        return []  # This dataset has no processed files

    def download(self):
        pass  # This dataset has no download

    def process(self):
        pass  # This dataset doesn't need processing

    def len(self):
        return len(self.data_list)

    def get(self, idx):
        return self.data_list[idx]


# with open('/content/drive/MyDrive/gcn_subgraph_data_filtered/scierc_OR.pkl', 'rb') as f:
#   dataset = pickle.load(f)

In [4]:
def print_class_imbalance(dataset, additional_str=''):
    class_count = [0, 0]  # Assuming binary classes 0 and 1

    for data in dataset:
        # Assuming 'y' is the label attribute in your data
        # Increment the count of the corresponding class
        for label in data.y:
            class_count[int(label.item())] += 1

    total_samples = sum(class_count)
    class_proportions = [count / total_samples for count in class_count]

    print(f"Class Imbalance {additional_str}:")
    print("================")
    print(f"Class 0: {class_count[0]} samples ({class_proportions[0]*100:.2f}%)")
    print(f"Class 1: {class_count[1]} samples ({class_proportions[1]*100:.2f}%)")

In [5]:
def print_dataset_stats(dataset):
  print()
  print(f'Dataset: {dataset}:')
  print('====================')
  print(f'Number of graphs: {len(dataset)}')
  print(f'Number of features: {dataset.num_features}')
  print(f'Number of classes: {dataset.num_classes}')
  print(f'Number of relations: {dataset.num_relations}')

  data = dataset[0]  # Get the first graph object.

  print()
  print(data)
  print('=============================================================')

  # Gather some statistics about the first graph.
  print(f'Number of nodes: {data.num_nodes}')
  print(f'Number of edges: {data.num_edges}')
  print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
  print(f'Has isolated nodes: {data.has_isolated_nodes()}')
  print(f'Has self-loops: {data.has_self_loops()}')
  print(f'Is undirected: {data.is_undirected()}')
  print_class_imbalance(dataset)

  duplicate_check = set()
  for data in dataset:
    assert data.edge_index.max() < data.num_nodes, 'Edge node index is larger than number of nodes.'

  for data in dataset:
    assert hash_data(data) not in duplicate_check, 'Duplicate found'
    duplicate_check.add(hash_data(data))


# print_dataset_stats(dataset)

In [6]:
# torch.manual_seed(12345)
# dataset = dataset.shuffle()

# halfway_point = int(len(dataset)/2)
# train_dataset = dataset[:halfway_point]
# test_dataset = dataset[2000:]

# print(f'Number of training graphs: {len(train_dataset)}')
# print(f'Number of test graphs: {len(test_dataset)}')

In [7]:
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [8]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, RGCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features, num_classes):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

class RGCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_relations, num_node_features, num_classes):
        super(RGCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = RGCNConv(num_node_features, hidden_channels, num_relations, num_bases=min(num_relations, 10))
        self.conv2 = RGCNConv(hidden_channels, hidden_channels, num_relations, num_bases=min(num_relations, 10))
        self.conv3 = RGCNConv(hidden_channels, hidden_channels, num_relations, num_bases=min(num_relations, 10))
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, edge_type, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index, edge_type)
        x = x.relu()
        x = self.conv2(x, edge_index, edge_type)
        x = x.relu()
        x = self.conv3(x, edge_index, edge_type)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

In [9]:
from sklearn.metrics import f1_score, precision_score, recall_score



def train(model, use_relation_types,criterion, optimizer, train_loader):
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        # No idea why dtypes suddenly are wrong, need to remove this later because very bad for performance
         data.edge_index = data.edge_index.long()
         data.y = data.y.long()
         data.edge_types = data.edge_types.long()
         data = data.to(device)
         if use_relation_types:
            out = model(data.x, data.edge_index, data.edge_types, data.batch)  # Perform a single forward pass.
         else:
            out = model(data.x, data.edge_index, data.batch)
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader, model, use_relation_types):
     model.eval()

     all_preds = []
     all_labels = []
     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         data.edge_index = data.edge_index.long()
         data.y = data.y.long()
         data.edge_types = data.edge_types.long()
         data = data.to(device)
         if use_relation_types:
            out = model(data.x, data.edge_index, data.edge_types, data.batch)  # Perform a single forward pass.
         else:
            out = model(data.x, data.edge_index, data.batch)
         pred = out.argmax(dim=1)  # Use the class with highest probability.

         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
         all_preds.extend(pred.cpu().numpy())
         all_labels.extend(data.y.cpu().numpy())

     f1 = f1_score(all_labels, all_preds)
     test_prec = precision_score(all_labels, all_preds)
     test_recall = recall_score(all_labels, all_preds)
     return f1, (correct / len(loader.dataset)), test_prec, test_recall  # Derive ratio of correct predictions.



In [10]:
from torchviz import make_dot

In [19]:
import tqdm.notebook as tq
import copy

def run_experiment(schema='scierc', mode='OR', use_relation_types=True, use_embeddings=True, silent=False, run_num=0, key=''):
  # Load dataset from disk
  with open(f'/content/drive/MyDrive/subgraph_datasets/{schema}_{mode}.pkl', 'rb') as f:
    dataset = pickle.load(f)

  #If not use learned embeddings make all embeddings random
  if not use_embeddings:
    for data in dataset:
        # generate random features for each node
        # assuming feature size is d, you might need to adjust the size
        d = data.x.shape[1]  # get the dimension of node features
        data.x = torch.randn((data.num_nodes, d))  # generate random features


  if not silent: print_dataset_stats(dataset)



  # Separate in 3/4 train, 1/4 test
  torch.manual_seed(run_num)
  dataset = dataset.shuffle()
  split_point = int(len(dataset)/4) * 3
  train_dataset = dataset[:split_point]

  val_test_dataset = dataset[split_point:]
  split_point_test = int(len(val_test_dataset)/4) * 2
  val_dataset = val_test_dataset[:split_point_test]
  test_dataset = val_test_dataset[split_point_test:]


  if not silent:
    print(f'Number of training graphs: {len(train_dataset)}')
    print_class_imbalance(train_dataset, 'train')
    print(f'Number of test graphs: {len(val_dataset)}')
    print_class_imbalance(train_dataset, 'test')

  # Create dataloaders
  train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)

  if not use_relation_types:
    model = GCN(hidden_channels=64, num_node_features=dataset.num_node_features, num_classes= 2)
  # Ugly hack, but should be conceptually valid
  else: model = RGCN(hidden_channels=64, num_relations= dataset.num_relations+1 if schema!='covid-event' else 471, num_node_features=dataset.num_node_features, num_classes= 2)

  # for data in train_loader:
  #   temp = data
  # make_dot(temp.y.long(), params=dict(list(model.named_parameters())), show_attrs=True, show_saved=True).render("rnn_torchviz", format="png")

  model = model.to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
  optimizer.zero_grad()
  criterion = torch.nn.CrossEntropyLoss()

  best_val_f1 = 0  # Initialize the best test accuracy
  best_model_state = None  # Initialize the best model state
  patience = 30  # Number of epochs to wait for improvement
  epochs_without_improvement = 0  # Initialize counter

  tqdm_iterator = tq.tqdm(range(1, 171), leave=False)
  tqdm_iterator.set_description('evaluating: ' + ' '.join(key.split('_')) + '...')
  for epoch in tqdm_iterator:
      train(model, use_relation_types, criterion, optimizer, train_loader)
      train_f1, train_acc, train_prec, train_recall = test(train_loader, model, use_relation_types)
      val_f1, val_acc, val_prec, val_recall = test(val_loader, model, use_relation_types)

      if val_f1 > best_val_f1:
          best_val_f1 = val_f1
          best_model_state = copy.deepcopy(model.state_dict())

          epochs_without_improvement = 0  # Reset counter if performance improved
      else:
          epochs_without_improvement += 1  # Increment counter if no improvement

      if epochs_without_improvement == patience or best_val_f1 > 0.9999:
          if not silent: print('Early stopping due to no improvement in test F1.')
          tqdm_iterator.container.close()

          # Load the model that was best in this run and run appropriate tests
          model.load_state_dict(best_model_state)
          test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)
          test_f1, test_acc, test_prec, test_recall = test(test_loader, model, use_relation_types)
          return test_f1, test_prec, test_recall
          break  # Stop training process if no improvement for 'patience' epochs

      if not silent: print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}')

  test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)
  test_f1, test_acc, test_prec, test_recall = test(test_loader, model, use_relation_types)
  return test_f1, test_prec, test_recall

In [20]:
from google.colab import drive

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import pandas as pd
from IPython.display import display, clear_output

def print_results(performances, to_latex=False):
  # Parsing keys and calculating average performance
  data = []
  for k, v in performances.items():
      try:
        dataset, mode, embedding, relation, metric_name = k.split('_')
        try:
          avg_performance = sum(v) / len(v)
        except Exception as e:
          print('Error calculating avg score', k, v, traceback.format_exc())
        if mode == 'OR':
          data.append([dataset, mode, embedding, relation,metric_name, avg_performance])
      except Exception as e: print('Error ni building data table', k, v, traceback.format_exc())

  # Creating DataFrame
  df = pd.DataFrame(data, columns=['Dataset', 'Mode', 'Embeddings', 'RelationTypes', 'MetricName', 'AvgPerformance'])

  # Setting multi-index for rows and columns
  df_pivot = df.pivot_table(index=['Embeddings', 'RelationTypes', 'MetricName'],
                          columns=['Dataset'],
                          values='AvgPerformance')
  df_pivot = df_pivot.style.format(decimal='.', thousands=',', precision=3)


  if to_latex:
    print(df_pivot.to_latex())
  clear_output()
  display(df_pivot)


In [22]:
# f1 = run_experiment(silent=False)
# print(f1)

In [23]:
from collections import defaultdict
import traceback

def save_to_drive(results):
  with open('/content/drive/MyDrive/results_filtered_newer.pkl', 'wb') as f:
    pickle.dump(results, f)

# results = defaultdict(list)
try:
  with open('/content/drive/MyDrive/results_filtered_newer.pkl', 'rb') as f:
      results= pickle.load(f)
except Exception: pass


run_num = 0
for schema in tq.tqdm(['covid-event',
                      #  'ace-event','genia', 'scierc','None', 'ace05'
                       ]):
  for mode in tq.tqdm(['OR',
                        # 'AND'
                       ], leave=False):
    for use_rels in tq.tqdm([True, False], leave=False):
      for use_embeds in tq.tqdm([True, False], leave=False):
        key = schema + '_' + mode + '_' + str(use_rels) + '_' + str(use_embeds)
        if not key + '_f1' in results:
          results[key] = []
        for i in tq.tqdm(range(2 - len(results[key])), leave=False):
          try:
            f1, prec, recall = run_experiment(schema=schema,mode=mode, use_relation_types=use_rels, use_embeddings=use_embeds, run_num=run_num, silent=False, key=key)
            results[key + '_f1'].append(f1)
            results[key + '_prec'].append(prec)
            results[key + '_recall'].append(recall)
            run_num+=1
            save_to_drive(results)
            print_results(results)
            if len(results[key]) == 2:
              break
          except Exception as e:
            print(schema, mode, use_rels, use_embeds, 'ERROR: /n', traceback.format_exc())



Unnamed: 0_level_0,Unnamed: 1_level_0,Dataset,None,ace-event,ace05,covid-event,genia,scierc
Embeddings,RelationTypes,MetricName,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,False,f1,0.544,0.494,0.469,0.594,0.403,0.56
False,False,prec,0.481,0.489,0.452,0.507,0.423,0.478
False,False,recall,0.629,0.506,0.497,0.723,0.402,0.684
False,True,f1,0.878,0.737,0.864,0.906,0.375,0.884
False,True,prec,0.871,0.723,0.888,0.903,0.387,0.877
False,True,recall,0.887,0.757,0.842,0.911,0.376,0.893
True,False,f1,0.602,0.511,0.527,0.62,0.369,0.644
True,False,prec,0.536,0.462,0.486,0.518,0.354,0.556
True,False,recall,0.69,0.58,0.58,0.775,0.394,0.771
True,True,f1,0.99,0.964,0.978,0.994,0.579,0.991
