In [1]:
import torch
print(torch.__version__)
print(torch.version.cuda)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import warnings
warnings.filterwarnings('ignore')

2.1.2+cu121
12.1


In [2]:
import torch
import matplotlib.pyplot as plt
import numpy as np

import torch_geometric
from torch_geometric.data import Data
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_dense_adj
import torch.nn.functional as F
import pandas as pd

import os
from sklearn import decomposition
from sklearn.manifold import TSNE
import pandas as pd

from torch_geometric.nn import GCNConv
from torch.nn import Linear
from torch_geometric.nn import GATConv
from sklearn.decomposition import PCA
import numpy as np

import random
import networkx as nx
from torch_geometric.utils import from_scipy_sparse_matrix, coalesce,remove_self_loops, to_networkx, from_networkx
import scipy.sparse as sp

In [3]:
class GCN(torch.nn.Module):
    def __init__(self, input_feature, nclasses):
        super().__init__()
        
        self.input_feature = input_feature
        self.nclasses = nclasses
        #define layers
        self.conv1 = GCNConv(in_channels=self.input_feature,out_channels=1024)
        self.conv2 = GCNConv(in_channels=1024, out_channels=512)
        self.conv3 = GCNConv(in_channels= 512, out_channels=512)
        self.classifier = Linear(512,self.nclasses)

    def forward(self,x, edge_idx):
        a = self.conv1(x,edge_idx)
        a= a.tanh()
        a = self.conv2(a,edge_idx)
        a = a.tanh()
        a = self.conv3(a,edge_idx)
        a= a.tanh()
        out = self.classifier(a)

        return out,a

class GAT(torch.nn.Module):
    def __init__(self, input_feature, nclasses):
        super().__init__()
        
        self.input_feature = input_feature
        self.nclasses = nclasses
        #define layers
        self.conv1 = GATConv(in_channels=self.input_feature,out_channels=1024)
        self.conv2 = GATConv(in_channels=1024, out_channels=512)
        self.conv3 = GATConv(in_channels= 512, out_channels=512)
        self.classifier = Linear(512,self.nclasses)

    def forward(self,x, edge_idx):
        a = self.conv1(x,edge_idx)
        a= a.tanh()
        a = self.conv2(a,edge_idx)
        a = a.tanh()
        a = self.conv3(a,edge_idx)
        a= a.tanh()
        out = self.classifier(a)

        return out,a

In [4]:
class training():
    def __init__(self, model, data, mask = None, augment_edge_idx= None):
        self.model = model
        self.dataset = data
        self.criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.00012)  # Define optimizer.
        if mask == None:
            self.train_old_mask = self.dataset.data.train_mask
        else:
            self.train_old_mask = self.dataset.data.train_mask
            self.dataset.data.train_mask = mask
        self.embed = 0
        
        self.old_edge_idx = self.dataset.data.edge_index
        if augment_edge_idx != None:
            self.dataset.data.edge_index = augment_edge_idx
            
        
        #print("---Total training points --- ", )
        
        self.best_accuracy=0


    def train(self, data):
        self.optimizer.zero_grad()  # Clear gradients.
        out, h = self.model.forward(data.x, data.edge_index)  # Perform a single forward pass.

        loss = self.criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.

        loss.backward()  # Derive gradients.
        self.optimizer.step()  # Update parameters based on gradients.
        return loss, h, out

    def train_model(self, epo=200):
        
        data = self.dataset[0]
        #print("Shape is', data.x.shape)
        loss_t = []

        train_loss = []
        val_loss = []
        test_loss = []

        test_accuracy = []
        train_accuracy = []
        val_accuracy = []

        ep=[]

        macro_f1_train = []
        macro_f1_val =[]
        macro_f1_test=[]

        print('Training Started...')
        for epoch in range(1,epo):
            loss, h, out = self.train(data)
            loss_t+=[loss]

            if epoch % 5 == 0:
                #visualize_embedding(h, color=data.y, epoch=epoch, loss=loss)
                ep.append(epoch)

                with torch.no_grad():
                    
                    train_loss.append(self.criterion(out[data.train_mask], data.y[data.train_mask]).item())
                    val_loss.append(self.criterion(out[data.val_mask], data.y[data.val_mask]).item())
                    test_loss.append(self.criterion(out[data.test_mask], data.y[data.test_mask]).item())

                    train_accuracy.append(((torch.argmax(out[data.train_mask], axis=1) == data.y[data.train_mask]).sum()/len(data.y[data.train_mask])).item()*100)
                    val_accuracy.append(((torch.argmax(out[data.val_mask], axis=1) == data.y[data.val_mask]).sum()/len(data.y[data.val_mask])).item()*100)
                    
                    test_acc = ((torch.argmax(out[data.test_mask], axis=1) == data.y[data.test_mask]).sum()/len(data.y[data.test_mask])).item()*100
                    test_accuracy.append(test_acc)
                    if test_acc> self.best_accuracy:
                        self.best_accuracy = test_acc
                        print("Saving Model at acc: ", self.best_accuracy)
                        torch.save(self.model.state_dict(), f'Best_model.pt')

                    print(f'Epoch : {epoch:.2f}, Training Accuracy: {train_accuracy[-1]:.2f}, Testing Accuracy: {test_accuracy[-1]:.2f}')


        print('Training Finished!')
        print("Best accuracy: ", self.best_accuracy )
        self.dataset.data.train_mask = self.train_old_mask
        self.dataset.data.edge_index = self.old_edge_idx
        with torch.no_grad():
            self.model.load_state_dict(torch.load('Best_model.pt'))

In [5]:
def mask_to_index(index, size):
    all_idx = np.arange(size)
    return all_idx[index]

def index_to_mask(index, size):
    mask = torch.zeros((size, ), dtype=torch.bool)
    mask[index] = 1
    return mask

In [6]:
class arguments():
    def __init__(self, seed):
        self.seed = seed

In [7]:
def random_select(class_labels, count):
    selected_idx = []
    selected_dict = {}
    for class_id, idx in class_labels.items():
        selected = np.random.permutation(idx)
        print("class_id: ", class_id, " selected: ", selected[:count])
        selected_dict[class_id] = selected[:count]
        selected_idx.extend(selected[:count])

    return selected_idx, selected_dict
    
    

In [8]:
def get_augmented_edges(selected_dict, data):
    # Extract the number of classes in the dataset
    num_classes = data.y.max().item() + 1

    # Initialize an empty set to store unique edges
    unique_edges_augmented = set()
    unique_edges_existing = set()

    # Iterate through each class in selected_idx
    for class_id, nodes in selected_dict.items():
        # Generate all possible combinations of nodes within the same class
        combinations = [(i, j) for i in nodes for j in nodes if i != j]

        # Add unique combinations to the set
        unique_edges_augmented.update(combinations)
        

#     print(unique_edges_augmented)
    new_edge_2 = torch.tensor(list(unique_edges_augmented)).t().contiguous()
    # Find unique edges in the existing data.edge_index
    unique_edges_existing = {(i.item(), j.item()) for i, j in zip(*data.edge_index)}

    # Find unique edges in augmented_edge_index
    unique_edges_augmented -= unique_edges_existing

    augmented_edge_index = torch.tensor(list(unique_edges_augmented)).t().contiguous()
    new_edge_index = torch.cat([data.edge_index, augmented_edge_index], dim=1).contiguous()
    
    return new_edge_index
    

In [9]:
# set seed
args = arguments(1)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# set data
dataname = "cora"
data = Planetoid(root='data', name= dataname)
idx_train = mask_to_index(data.train_mask, len(data.train_mask))

# idx_train
label_train = data.y[data.train_mask]

class_labels = {}
for i in range(data.num_classes):
    class_labels[i] = idx_train[label_train==i]
    

count = 5 # numbers to be selected per class


selected_idx, selected_dict = random_select(class_labels, count)

# find new mask and new edge list based on augmentation 
new_mask = index_to_mask(selected_idx, len(data.train_mask) )
new_edge_index = get_augmented_edges(selected_dict, data)


class_id:  0  selected:  [11 94 52 65 10]
class_id:  1  selected:  [135 112 136  85 139]
class_id:  2  selected:  [ 34  16 110  72  71]
class_id:  3  selected:  [24 40 14 15 51]
class_id:  4  selected:  [82 86  2 84 78]
class_id:  5  selected:  [114  37 129 115 117]
class_id:  6  selected:  [ 92  41 116  56 123]


In [14]:
# run without out mask
dataname = "cora"
data = Planetoid(root='data', name= dataname)
model = GCN(data.num_features,data.num_classes)
exp = training(model, data)
exp.train_model()

Training Started...
Saving Model at acc:  52.49999761581421
Epoch : 5.00, Training Accuracy: 70.00, Testing Accuracy: 52.50
Saving Model at acc:  70.20000219345093
Epoch : 10.00, Training Accuracy: 86.43, Testing Accuracy: 70.20
Saving Model at acc:  74.90000128746033
Epoch : 15.00, Training Accuracy: 91.43, Testing Accuracy: 74.90
Saving Model at acc:  76.2000024318695
Epoch : 20.00, Training Accuracy: 92.14, Testing Accuracy: 76.20
Saving Model at acc:  77.99999713897705
Epoch : 25.00, Training Accuracy: 93.57, Testing Accuracy: 78.00
Saving Model at acc:  79.29999828338623
Epoch : 30.00, Training Accuracy: 94.29, Testing Accuracy: 79.30
Saving Model at acc:  80.69999814033508
Epoch : 35.00, Training Accuracy: 95.71, Testing Accuracy: 80.70
Saving Model at acc:  80.80000281333923
Epoch : 40.00, Training Accuracy: 97.86, Testing Accuracy: 80.80
Saving Model at acc:  81.19999766349792
Epoch : 45.00, Training Accuracy: 97.86, Testing Accuracy: 81.20
Epoch : 50.00, Training Accuracy: 98.

In [13]:
sum(new_mask)

tensor(35)

In [15]:
# run with out mask
dataname = "cora"
data = Planetoid(root='data', name= dataname)
model = GCN(data.num_features,data.num_classes)
exp1 = training(model, data, new_mask)
exp1.train_model()

Training Started...
Saving Model at acc:  41.200000047683716
Epoch : 5.00, Training Accuracy: 91.43, Testing Accuracy: 41.20
Saving Model at acc:  63.099998235702515
Epoch : 10.00, Training Accuracy: 97.14, Testing Accuracy: 63.10
Saving Model at acc:  68.80000233650208
Epoch : 15.00, Training Accuracy: 97.14, Testing Accuracy: 68.80
Saving Model at acc:  71.8999981880188
Epoch : 20.00, Training Accuracy: 100.00, Testing Accuracy: 71.90
Saving Model at acc:  75.0
Epoch : 25.00, Training Accuracy: 100.00, Testing Accuracy: 75.00
Saving Model at acc:  75.40000081062317
Epoch : 30.00, Training Accuracy: 100.00, Testing Accuracy: 75.40
Epoch : 35.00, Training Accuracy: 100.00, Testing Accuracy: 74.40
Epoch : 40.00, Training Accuracy: 100.00, Testing Accuracy: 74.00
Epoch : 45.00, Training Accuracy: 100.00, Testing Accuracy: 72.90
Epoch : 50.00, Training Accuracy: 100.00, Testing Accuracy: 72.10
Epoch : 55.00, Training Accuracy: 100.00, Testing Accuracy: 71.50
Epoch : 60.00, Training Accura

In [16]:
# run with out mask and augmentation
dataname = "cora"
data = Planetoid(root='data', name= dataname)

model = GCN(data.num_features,data.num_classes)
exp2 = training(model, data, new_mask, new_edge_index)
exp2.train_model()

Training Started...
Saving Model at acc:  50.300002098083496
Epoch : 5.00, Training Accuracy: 100.00, Testing Accuracy: 50.30
Saving Model at acc:  67.90000200271606
Epoch : 10.00, Training Accuracy: 100.00, Testing Accuracy: 67.90
Saving Model at acc:  69.70000267028809
Epoch : 15.00, Training Accuracy: 100.00, Testing Accuracy: 69.70
Saving Model at acc:  71.10000252723694
Epoch : 20.00, Training Accuracy: 100.00, Testing Accuracy: 71.10
Saving Model at acc:  71.70000076293945
Epoch : 25.00, Training Accuracy: 100.00, Testing Accuracy: 71.70
Saving Model at acc:  72.60000109672546
Epoch : 30.00, Training Accuracy: 100.00, Testing Accuracy: 72.60
Saving Model at acc:  72.79999852180481
Epoch : 35.00, Training Accuracy: 100.00, Testing Accuracy: 72.80
Epoch : 40.00, Training Accuracy: 100.00, Testing Accuracy: 72.20
Epoch : 45.00, Training Accuracy: 100.00, Testing Accuracy: 71.80
Epoch : 50.00, Training Accuracy: 100.00, Testing Accuracy: 72.00
Epoch : 55.00, Training Accuracy: 100.00