<a href="https://colab.research.google.com/github/Kitsunnneee/Specific-Task-1/blob/main/Specific_Task_1(Contrastive_Learning).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing all the neccessary dependencies

In [1]:
import torch
import os
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
!pip install h5py
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install PyGCL
!pip install dgl
!pip install pytorch_metric_learning

2.2.1+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
Collecting PyGCL
  Downloading PyGCL-0.1.2-py3-none-any.whl (32 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->PyGCL)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->PyGCL)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64

# Importing all the neccessary dependencies
Note : This project uses PyTorch Geometric Contrastive Learning(PyGCL), a PyTorch-based, library for all the Graph Contrastive learning task.

In [2]:
import numpy as np
import h5py
import tqdm
import matplotlib.pyplot as plt
from sklearn.neighbors import kneighbors_graph

import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Linear, ReLU
from torch.optim import Adam

from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

import GCL.augmentors as A
import GCL.losses as L
from GCL.models import DualBranchContrast

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


# Mounting and Loading the data from Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = "/content/drive/MyDrive/quark-gluon_data-set_n139306.hdf5" #Path to the dataset on my google drive

with h5py.File(path, 'r') as f:
  X_jets = np.array(f['X_jets'][:8000])
  labels = np.array(f['y'][:8000])

# Converting the data to Graph format and doing preprocessing

In [5]:
dataset = []
for i, x in enumerate(X_jets):
  flattened = x.reshape(-1,3)
  non_zero = np.any(flattened != (0,0,0), axis = -1) # Removing any zero element by considering only non zero ones
  node = flattened[non_zero]
  edges = kneighbors_graph(node, 2, mode = 'connectivity',include_self = True)
  edges = edges.tocoo()
  y = torch.tensor([int(labels[i])], dtype=torch.long)
  data = Data(x=torch.from_numpy(node), edge_index=torch.from_numpy(np.vstack((edges.row,edges.col))).type(torch.long), edge_attr=torch.from_numpy(edges.data.reshape(-1,1)), y=y)
  dataset.append(data)

In [6]:
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {dataset[0].num_nodes}')
print(f'Number of edges: {dataset[0].num_edges}')
print(f'Number of node features: {dataset[0].num_node_features}')
print(f'Number of edges features: {dataset[0].num_edge_features}')
print(dataset[0])

Number of graphs: 8000
Number of nodes: 884
Number of edges: 1768
Number of node features: 3
Number of edges features: 1
Data(x=[884, 3], edge_index=[2, 1768], edge_attr=[1768, 1], y=[1])


In [7]:
train_loader = DataLoader(dataset[:5000], batch_size=8, shuffle=True)  #Creating the train loader with batch = 8
test_loader = DataLoader(dataset[5000:], batch_size=8, shuffle=False) # Creating the test loader with batch = 8

In [8]:
aug = A.Compose([A.EdgeRemoving(pe=0.3), A.FeatureMasking(pf=0.3)]) # Selecing the graph augmentations

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Creating the Contrastive model

In [10]:
class GCN(nn.Module):
    def __init__(self, xavier=True):
        super(GCN, self).__init__()

        self.conv1 = GCNConv(3, 32)
        self.conv2 = GCNConv(32, 32)
        self.fc1 = Linear(32, 32)
        self.fc2 = Linear(32, 32)
        self.act = ReLU()

    def forward(self, data):
          # Performing the augmentaion twice as we use dual branch contrastive learning
          augm_1 = aug(data.x, data.edge_index)
          augm_2 = aug(data.x, data.edge_index)

          x1 = self.conv1(augm_1[0], augm_1[1])
          x1 = self.act(x1)
          x2 = self.conv2(x1, augm_1[1])
          z1 = self.act(x2)

          x1 = self.conv1(augm_2[0], augm_2[1])
          x1 = self.act(x1)
          x2 = self.conv2(x2, augm_2[1])
          z2 = self.act(x2)

          x1 = self.conv1(data.x, data.edge_index)
          x1 = self.act(x1)
          x2 = self.conv2(x1, data.edge_index)
          z = self.act(x2)

          return z, z1, z2

    def project(self, z: torch.Tensor) -> torch.Tensor:
          #Projection head to reduce the size of the embeddings
          z = F.elu(self.fc1(z))
          return self.fc2(z)

# Training the contrastive learning model

In [11]:
def train(encoder_model, contrast_model, data, optimizer):
    encoder_model.train()
    optimizer.zero_grad()
    z, z1, z2 = encoder_model(data)
    h1, h2 = [encoder_model.project(x) for x in [z1, z2]] # Creating the reduced embeddings for the contrastive learning
    loss = contrast_model(h1, h2)
    loss.backward()
    optimizer.step()
    return loss.item()

In [12]:
def test(encoder_model, contrast_mocel, data, optimizer):
    encoder_model.eval()
    z, z1, z2 = encoder_model(data)
    h1, h2 = [encoder_model.project(x) for x in [z1, z2]] # Creating the reduced embeddings for the contrastive learning
    loss = contrast_model(h1, h2)
    return loss.item()

In [13]:
encoder_model = GCN().to(device)
#Using Dual Branch Contrastive Learning with InfoNCE loss and using local-to-local mode[to learn local representation]
contrast_model = DualBranchContrast(loss=L.InfoNCE(tau=0.2), mode='L2L').to(device)
optimizer = Adam(encoder_model.parameters(), lr=0.01)

for epoch in range(30):
  total_loss = 0
  for _, data in enumerate(tqdm.tqdm(train_loader)):
      data = data.to(device)
      train_loss = train(encoder_model, contrast_model, data, optimizer)
  for _, data in enumerate(tqdm.tqdm(test_loader)):
      data = data.to(device)
      test_loss = test(encoder_model, contrast_model, data, optimizer)

  log = "Epoch {}, Train Loss: {:.3f}, Test Loss: {:.3f}"
  print(log.format(epoch, train_loss, test_loss))

#Save Model
torch.save(encoder_model.state_dict(), 'autoencoder_weights.pth')
print("Encoder weights saved successfully!")


100%|██████████| 625/625 [00:35<00:00, 17.83it/s]
100%|██████████| 375/375 [00:12<00:00, 30.08it/s]


Epoch 0, Train Loss: 8.071, Test Loss: 7.537


100%|██████████| 625/625 [00:32<00:00, 19.04it/s]
100%|██████████| 375/375 [00:12<00:00, 29.49it/s]


Epoch 1, Train Loss: 7.330, Test Loss: 6.931


100%|██████████| 625/625 [00:32<00:00, 19.10it/s]
100%|██████████| 375/375 [00:12<00:00, 30.08it/s]


Epoch 2, Train Loss: 6.880, Test Loss: 7.830


100%|██████████| 625/625 [00:32<00:00, 19.11it/s]
100%|██████████| 375/375 [00:12<00:00, 30.03it/s]


Epoch 3, Train Loss: 6.609, Test Loss: 6.658


100%|██████████| 625/625 [00:32<00:00, 19.14it/s]
100%|██████████| 375/375 [00:18<00:00, 20.75it/s]


Epoch 4, Train Loss: 6.332, Test Loss: 6.410


100%|██████████| 625/625 [00:33<00:00, 18.76it/s]
100%|██████████| 375/375 [00:12<00:00, 30.07it/s]


Epoch 5, Train Loss: 6.679, Test Loss: 7.155


100%|██████████| 625/625 [00:33<00:00, 18.80it/s]
100%|██████████| 375/375 [00:12<00:00, 29.93it/s]


Epoch 6, Train Loss: 6.109, Test Loss: 6.331


100%|██████████| 625/625 [00:32<00:00, 19.08it/s]
100%|██████████| 375/375 [00:12<00:00, 30.03it/s]


Epoch 7, Train Loss: 7.140, Test Loss: 6.191


100%|██████████| 625/625 [00:32<00:00, 18.96it/s]
100%|██████████| 375/375 [00:12<00:00, 30.09it/s]


Epoch 8, Train Loss: 6.235, Test Loss: 6.098


100%|██████████| 625/625 [00:32<00:00, 19.15it/s]
100%|██████████| 375/375 [00:12<00:00, 30.28it/s]


Epoch 9, Train Loss: 5.790, Test Loss: 5.789


100%|██████████| 625/625 [00:32<00:00, 18.94it/s]
100%|██████████| 375/375 [00:12<00:00, 29.97it/s]


Epoch 10, Train Loss: 6.492, Test Loss: 6.982


100%|██████████| 625/625 [00:32<00:00, 19.23it/s]
100%|██████████| 375/375 [00:12<00:00, 30.22it/s]


Epoch 11, Train Loss: 6.332, Test Loss: 6.913


100%|██████████| 625/625 [00:32<00:00, 19.07it/s]
100%|██████████| 375/375 [00:12<00:00, 30.26it/s]


Epoch 12, Train Loss: 6.874, Test Loss: 5.855


100%|██████████| 625/625 [00:32<00:00, 19.21it/s]
100%|██████████| 375/375 [00:12<00:00, 29.86it/s]


Epoch 13, Train Loss: 5.663, Test Loss: 6.481


100%|██████████| 625/625 [00:33<00:00, 18.64it/s]
100%|██████████| 375/375 [00:12<00:00, 29.50it/s]


Epoch 14, Train Loss: 7.655, Test Loss: 6.290


100%|██████████| 625/625 [00:32<00:00, 18.96it/s]
100%|██████████| 375/375 [00:12<00:00, 29.86it/s]


Epoch 15, Train Loss: 5.518, Test Loss: 5.478


100%|██████████| 625/625 [00:33<00:00, 18.90it/s]
100%|██████████| 375/375 [00:12<00:00, 29.48it/s]


Epoch 16, Train Loss: 5.534, Test Loss: 5.860


100%|██████████| 625/625 [00:32<00:00, 19.06it/s]
100%|██████████| 375/375 [00:12<00:00, 29.68it/s]


Epoch 17, Train Loss: 6.940, Test Loss: 5.489


100%|██████████| 625/625 [00:33<00:00, 18.84it/s]
100%|██████████| 375/375 [00:12<00:00, 29.80it/s]


Epoch 18, Train Loss: 5.568, Test Loss: 5.293


100%|██████████| 625/625 [00:32<00:00, 18.96it/s]
100%|██████████| 375/375 [00:12<00:00, 29.60it/s]


Epoch 19, Train Loss: 5.250, Test Loss: 5.292


100%|██████████| 625/625 [00:33<00:00, 18.72it/s]
100%|██████████| 375/375 [00:12<00:00, 30.05it/s]


Epoch 20, Train Loss: 6.022, Test Loss: 5.888


100%|██████████| 625/625 [00:33<00:00, 18.94it/s]
100%|██████████| 375/375 [00:12<00:00, 30.16it/s]


Epoch 21, Train Loss: 5.067, Test Loss: 5.232


100%|██████████| 625/625 [00:32<00:00, 18.94it/s]
100%|██████████| 375/375 [00:12<00:00, 30.32it/s]


Epoch 22, Train Loss: 5.423, Test Loss: 6.408


100%|██████████| 625/625 [00:32<00:00, 19.22it/s]
100%|██████████| 375/375 [00:12<00:00, 30.31it/s]


Epoch 23, Train Loss: 6.304, Test Loss: 5.331


100%|██████████| 625/625 [00:32<00:00, 19.08it/s]
100%|██████████| 375/375 [00:12<00:00, 30.29it/s]


Epoch 24, Train Loss: 5.226, Test Loss: 5.331


100%|██████████| 625/625 [00:32<00:00, 19.23it/s]
100%|██████████| 375/375 [00:12<00:00, 30.25it/s]


Epoch 25, Train Loss: 5.049, Test Loss: 5.108


100%|██████████| 625/625 [00:32<00:00, 19.12it/s]
100%|██████████| 375/375 [00:12<00:00, 29.65it/s]


Epoch 26, Train Loss: 5.934, Test Loss: 5.471


100%|██████████| 625/625 [00:32<00:00, 19.22it/s]
100%|██████████| 375/375 [00:12<00:00, 30.27it/s]


Epoch 27, Train Loss: 5.156, Test Loss: 5.101


100%|██████████| 625/625 [00:32<00:00, 19.17it/s]
100%|██████████| 375/375 [00:12<00:00, 30.36it/s]


Epoch 28, Train Loss: 5.307, Test Loss: 5.306


100%|██████████| 625/625 [00:32<00:00, 19.25it/s]
100%|██████████| 375/375 [00:12<00:00, 30.01it/s]

Epoch 29, Train Loss: 5.166, Test Loss: 6.304
Encoder weights saved successfully!





# Defining the classifcation model
Here we use the model defined for learning representation before but without the projection head as we only need the learned represntation

In [14]:
class GraphClassificationModel(nn.Module):
    def __init__(self, load=True):
        super(GraphClassificationModel, self).__init__()

        self.encoder = GCN().to(device)

        if load:
            pth = self.encoder.load_state_dict(torch.load('autoencoder_weights.pth'))
            for param in self.encoder.parameters():
                param.requires_grad = False # Freezing the learned weights of encoder

        self.classifier = nn.Linear(32, 2)


    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        embeddings,_,_ = self.encoder(data)
        z = global_mean_pool(embeddings, batch)
        pred = self.classifier(z)
        return pred

# Training and Testing of the Classification model

In [23]:
def train_classification(model, loader, optimizer, criterion):
  model.train()
  total_loss = 0
  correct = 0
  total_samples = 0
  for _, data in enumerate(tqdm.tqdm(train_loader)):
    # print(data.batch.size)
    data = data.to(device)
    optimizer.zero_grad()
    out = model(data)
    #print(out.shape)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    total_loss += loss.item() * data.num_graphs
    # Calculate train accuracy
    pred = out.argmax(dim=1)
    correct += (pred == data.y).sum().item()
    total_samples += data.num_graphs
  train_accuracy = correct / total_samples
  return total_loss / len(loader.dataset), train_accuracy



In [24]:
def test_classification(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item() #Calculating the correct predictions
            total += data.num_graphs
    accuracy = correct / total
    return accuracy

In [25]:
model = GraphClassificationModel().to(device)
optimizer_2 = Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for epoch in range(20):
    train_loss, train_accuracy = train_classification(model, train_loader, optimizer_2, criterion)
    test_accuracy = test_classification(model, test_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')


100%|██████████| 625/625 [00:06<00:00, 103.42it/s]


Epoch 1, Train Loss: 0.6547, Train Accuracy: 0.6292, Test Accuracy: 0.6143


100%|██████████| 625/625 [00:05<00:00, 113.81it/s]


Epoch 2, Train Loss: 0.6370, Train Accuracy: 0.6598, Test Accuracy: 0.6507


100%|██████████| 625/625 [00:06<00:00, 102.85it/s]


Epoch 3, Train Loss: 0.6361, Train Accuracy: 0.6620, Test Accuracy: 0.5960


100%|██████████| 625/625 [00:05<00:00, 119.59it/s]


Epoch 4, Train Loss: 0.6309, Train Accuracy: 0.6676, Test Accuracy: 0.6873


100%|██████████| 625/625 [00:05<00:00, 122.45it/s]


Epoch 5, Train Loss: 0.6304, Train Accuracy: 0.6682, Test Accuracy: 0.6447


100%|██████████| 625/625 [00:05<00:00, 107.31it/s]


Epoch 6, Train Loss: 0.6360, Train Accuracy: 0.6688, Test Accuracy: 0.6750


100%|██████████| 625/625 [00:05<00:00, 123.73it/s]


Epoch 7, Train Loss: 0.6305, Train Accuracy: 0.6658, Test Accuracy: 0.6917


100%|██████████| 625/625 [00:05<00:00, 119.19it/s]


Epoch 8, Train Loss: 0.6343, Train Accuracy: 0.6710, Test Accuracy: 0.6930


100%|██████████| 625/625 [00:06<00:00, 98.48it/s] 


Epoch 9, Train Loss: 0.6294, Train Accuracy: 0.6692, Test Accuracy: 0.6357


100%|██████████| 625/625 [00:05<00:00, 113.41it/s]


Epoch 10, Train Loss: 0.6240, Train Accuracy: 0.6686, Test Accuracy: 0.6013


100%|██████████| 625/625 [00:05<00:00, 114.18it/s]


Epoch 11, Train Loss: 0.6328, Train Accuracy: 0.6702, Test Accuracy: 0.6217


100%|██████████| 625/625 [00:05<00:00, 104.69it/s]


Epoch 12, Train Loss: 0.6187, Train Accuracy: 0.6754, Test Accuracy: 0.6733


100%|██████████| 625/625 [00:05<00:00, 117.21it/s]


Epoch 13, Train Loss: 0.6382, Train Accuracy: 0.6680, Test Accuracy: 0.6500


100%|██████████| 625/625 [00:05<00:00, 108.69it/s]


Epoch 14, Train Loss: 0.6324, Train Accuracy: 0.6718, Test Accuracy: 0.6480


100%|██████████| 625/625 [00:05<00:00, 107.79it/s]


Epoch 15, Train Loss: 0.6332, Train Accuracy: 0.6734, Test Accuracy: 0.6827


100%|██████████| 625/625 [00:05<00:00, 122.53it/s]


Epoch 16, Train Loss: 0.6249, Train Accuracy: 0.6762, Test Accuracy: 0.5250


100%|██████████| 625/625 [00:05<00:00, 117.88it/s]


Epoch 17, Train Loss: 0.6292, Train Accuracy: 0.6746, Test Accuracy: 0.6940


100%|██████████| 625/625 [00:05<00:00, 108.11it/s]


Epoch 18, Train Loss: 0.6268, Train Accuracy: 0.6764, Test Accuracy: 0.6627


100%|██████████| 625/625 [00:05<00:00, 123.18it/s]


Epoch 19, Train Loss: 0.6281, Train Accuracy: 0.6730, Test Accuracy: 0.7013


100%|██████████| 625/625 [00:05<00:00, 110.04it/s]


Epoch 20, Train Loss: 0.6274, Train Accuracy: 0.6752, Test Accuracy: 0.7027


# Conclusion
The model's accuracy is 70% which is not the best. There are a multitude of reasons for that.

- One big problem is graph-level representation. Although, I have used global pooling to get a graph-level representation that is not the best way.

- We only consider an extremely small subset of the actual data due to memory issues which may cause data imbalance which stops the model from learning properly.

- Another problem is the graph representation isn't being learned well. Many possible reasons can be for this such as the architecture may not be right, the parameter tuning needs to be done well, etc. Further Research into this is required.

- When constructing the contrastive learning architecture other Graph models may be used such as GAT, GraphSage, etc  to learn the representation. Each of these models will learn a different representation for the node which may be better or worse but may increase the complexity of the model which may be computationally inefficient for larger datasets and graphs or also decrease.







