In [1]:
import torch
print(torch.__version__)
print(torch.version.cuda)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import warnings
warnings.filterwarnings('ignore')

2.1.2+cu121
12.1


In [2]:
import torch
import matplotlib.pyplot as plt
import numpy as np

import torch_geometric
from torch_geometric.data import Data
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_dense_adj
import torch.nn.functional as F
import pandas as pd

import os
from sklearn import decomposition
from sklearn.manifold import TSNE
import pandas as pd

from torch_geometric.nn import GCNConv
from torch.nn import Linear
from torch_geometric.nn import GATConv
from sklearn.decomposition import PCA
import numpy as np

import random
import networkx as nx
from torch_geometric.utils import from_scipy_sparse_matrix, coalesce,remove_self_loops, to_networkx, from_networkx
import scipy.sparse as sp

In [3]:
class GCN(torch.nn.Module):
    def __init__(self, input_feature, nclasses):
        super().__init__()
        
        self.input_feature = input_feature
        self.nclasses = nclasses
        #define layers
        self.conv1 = GCNConv(in_channels=self.input_feature,out_channels=1024)
        self.conv2 = GCNConv(in_channels=1024, out_channels=512)
        self.conv3 = GCNConv(in_channels= 512, out_channels=512)
        self.classifier = Linear(512,self.nclasses)

    def forward(self,x, edge_idx,edge_weight):
        a = self.conv1(x,edge_idx, edge_weight)
        a= a.tanh()
        a = self.conv2(a,edge_idx, edge_weight)
        a = a.tanh()
        a = self.conv3(a,edge_idx, edge_weight)
        a= a.tanh()
        out = self.classifier(a)

        return out,a

class GAT(torch.nn.Module):
    def __init__(self, input_feature, nclasses):
        super().__init__()
        
        self.input_feature = input_feature
        self.nclasses = nclasses
        #define layers
        self.conv1 = GATConv(in_channels=self.input_feature,out_channels=1024)
        self.conv2 = GATConv(in_channels=1024, out_channels=512)
        self.conv3 = GATConv(in_channels= 512, out_channels=512)
        self.classifier = Linear(512,self.nclasses)

    def forward(self,x, edge_idx,edge_weight):
        a = self.conv1(x,edge_idx, edge_weight)
        a= a.tanh()
        a = self.conv2(a,edge_idx, edge_weight)
        a = a.tanh()
        a = self.conv3(a,edge_idx, edge_weight)
        a= a.tanh()
        out = self.classifier(a)

        return out,a

In [4]:
class benchReady():
    def __init__(self):
        self.datanames = ["cora", "citeseer"]
        self.modeltype = ["gcn","gat"]
        self.seedlist = [1,85,105]
        self.dataset = {}
        self.benchmodels = {}
        self.boundaries = {}
        
        # run all methods to prep data
        self.get_dataset()
        self.run_bench_models()
        self.find_boundaries()
    
    def get_dataset(self):
        
        for dataname in self.datanames:
            self.dataset[dataname] = Planetoid(root='data', name= dataname)   
    
    def run_bench_models(self):
        
        for dataname in self.datanames:
            dataset = self.dataset[dataname]
            for mtype in self.modeltype:
                mtype_lower = mtype.lower()
                
                for seed in self.seedlist:
                    if mtype_lower=="gcn":
                        model = GCN(dataset.num_features,dataset.num_classes)
                    elif mtype_lower=="gat":
                        model = GAT(dataset.num_features,dataset.num_classes)
                    model.load_state_dict(torch.load(f'Bench_model_{dataname}_{mtype_lower}_{seed}.pt'))
                    self.benchmodels[(dataname,mtype_lower,seed)] = model
#                     print("Model weights", torch.load(f'Bench_model_{dataname}_{mtype_lower}_{seed}.pt'))
#                     print("model...ready",self.benchmodels)
    
    def find_boundaries(self):
        for dataname in self.datanames:
            dataset = self.dataset[dataname]
            for mtype in self.modeltype:
                mtype_lower = mtype.lower()
                for seed in self.seedlist:
                    model = self.benchmodels[(dataname,mtype_lower,seed)]
#                     print("model...ready",self.benchmodels[(dataname,mtype_lower,seed)])
                    model.eval().to(device)
                    data= dataset[0].to(device)
            #         print("-- at Find Entropy, datamask shape", data.train_mask.shape)
                    # need a forward pass to find score and embed
                    score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

                    # need to apply softmax to convert scores to probability
                    runsoftmax = torch.nn.Softmax(dim=1)
                    nprob = runsoftmax(score)
                    trainmask = data.train_mask
                    nprob_train = nprob[trainmask]

                    # Calculate entropy for each data point
                    entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
                    entropy_data

                    # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
                    # Sort the tensor
                    sorted_tensor, indices = torch.sort(entropy_data)
#                     print("benchmark: ", sorted_tensor)
                    # Calculate the range indices
                    total_elements = len(sorted_tensor)
                    low_range_end = total_elements // 3
                    mid_range_end = 2 * (total_elements // 3)

                    boundaries = [sorted_tensor[low_range_end].item(),sorted_tensor[mid_range_end].item()]
                    self.boundaries[(dataname,mtype_lower,seed)] = boundaries
                      

In [5]:
class exp():
    def __init__(self):
        
        self.binput = benchReady()
        
        self.model_list = self.binput.modeltype
        self.data_list = self.binput.datanames
        self.compression_list = [0.125,0.25,0.5]
        self.methodlist = ["one_step","gcond","sfgc","sgdd"]
        
        self.out = saveoutput(self.model_list, self.data_list, self.compression_list,self.methodlist)
    
    def prep_data(self,adj,feat):
        adj[adj<0.05]=0   # remove anything spurious

        # Convert to a NetworkX graph
        graph = nx.Graph(adj.cpu().numpy())

        # Get the edge list (edge index)
        edge_list = list(graph.edges)

        # Convert the edge list to a PyTorch tensor
        edge_index = torch.tensor(edge_list).t().contiguous().to(device)

        # Extract edge attributes (probabilities) from the adjacency matrix
        edge_attr = []
        for edge in edge_index.t().tolist():
            # Assuming the adjacency matrix is a NumPy array
            edge_attr.append(adj[edge[0], edge[1]])

        # Convert the edge_attr list to a PyTorch tensor
        edge_attr = torch.tensor(edge_attr, dtype=torch.float32)

        graph_data = Data(x=feat, edge_index=edge_index, edge_attr=edge_attr)
        
        return graph_data
    
    def run_experiment(self,pathlist):

        for key, value in pathlist.items() :
            adj_path,feat_path = value
            dataname,method,compression, abc = key
            print("==========================================")
            print(dataname, compression,method,abc)
            
            adj = torch.load(adj_path)
            feat = torch.load(feat_path).to(device)

            # print(graph_data.edge_attr)
            graph_data = self.prep_data(adj,feat)
            data= graph_data.to(device)

            for mtype in self.binput.modeltype:
                for seed in self.binput.seedlist:
                    # need a forward pass to find score and embed
                    model = self.binput.benchmodels[(dataname,mtype,seed)]
                    score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

                    # need to apply softmax to convert scores to probability
                    runsoftmax = torch.nn.Softmax(dim=1)
                    nprob = runsoftmax(score)
                    nprob_train = nprob

                    # Calculate entropy for each data point
                    entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
                    # print(entropy_data)

                    # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
                    # Sort the tensor
                    sorted_tensor, indices = torch.sort(entropy_data)
                    boundaries = self.binput.boundaries[(dataname,mtype,seed)]
#                     print(boundaries)
                    low_values = (sorted_tensor < boundaries[0])
                    mid_values = (sorted_tensor >= boundaries[0]) & (sorted_tensor < boundaries[1])
                    high_values = sorted_tensor >= boundaries[1]

                    # Count the number of True values
                    results = { "low": torch.sum(low_values).item(),
                    "mid" : torch.sum(mid_values).item(),
                    "high" : torch.sum(high_values).item(),
                    "mean" : torch.mean(sorted_tensor).item()      
                    }
                    self.out.update(results,[dataname,compression,mtype,method,seed])
        self.out.display_save()            

#                     print(results)

In [6]:
class saveoutput():
    def __init__(self, model_list, data_list, compression_list,methodlist):
        self.modeltype = model_list
        self.index = ["low","mid","high","mean"]
        self.data = data_list #["cora", "citeseer"]
        self.compression = compression_list # [0.25,0.5]
        self.methods = methodlist

        # create tables 3seeds - 1,15,85 
        self.results_1 = self.create_table()
        self.results_105 = self.create_table()
        self.results_85 = self.create_table()
        
        self.i =0

    def create_table(self):
        
        col_idx = pd.MultiIndex.from_product([self.modeltype,self.index],names = ["Modeltype","Entropy Buckets"])
        row_idx = pd.MultiIndex.from_product([self.methods, self.data,self.compression],names = ["Methods","Dataset","Compression ratio"])
        df = pd.DataFrame(columns=col_idx, index=row_idx)
        df = df.fillna(0)
        return df

    def update(self, results, details):
        '''results is dictionary'''
        dataset, compression, modeltype, method, seed = details
        

        if seed==1:
            for k,v in results.items():
                index = (method, dataset, compression)
                columns = (modeltype, k)
                self.results_1.loc[index,columns] += v

        if seed==105:
             for k,v in results.items():
                index = (method, dataset, compression)
                columns = (modeltype, k)
                self.results_85.loc[index,columns] += v

        if seed == 85:
            for k,v in results.items():
                index = (method, dataset, compression)
                columns = (modeltype, k)
                self.results_105.loc[index,columns] += v
            
        self.i+=1
        if self.i%5==0:
            self.display_save()


    def display_save(self):
        # Create a new dataframe 'average' for jacard nodes
        self.average_results = self.create_table()
        for col in self.average_results.columns:
            self.average_results[col] = (self.results_1[col] + self.results_105[col] + self.results_85[col]) / 3

        # print("Jacard Nodes Similarity")
        # print(average_jac_nodes)
        self.average_results.to_csv("Avg_exp3_partB_results.csv")

        # save all seeds files
        output_folder = 'Entropy_rawFiles'
        os.makedirs(output_folder, exist_ok=True)
        self.results_1.to_csv("Entropy_rawFiles/entropyresults_1.csv")
        self.results_105.to_csv("Entropy_rawFiles/entropyresults_105.csv")
        self.results_85.to_csv("Entropy_rawFiles/entropyresults_85.csv")

In [7]:
a = exp()

In [8]:
# a.out.results_105
# a.binput.boundaries

In [9]:
#dataname, compression,method,seed(not to be used)
# pathlist = {("cora",0.5,"sgdd"):["distil_data/sgdd_adj_cora_0.5_1.pt","distil_data/sgdd_feat_cora_0.5_1.pt"],
#             ("cora",0.5,"gcond"):["distil_data/adj_cora_0.5_1_0.pt","distil_data/feat_cora_0.5_1_0.pt" ]
#            }

In [10]:
pathlist = {('citeseer', 'gcond', 0.125, 15): ['distil_data/gcond/adj_citeseer_0.125_15_0.pt', 'distil_data/gcond/feat_citeseer_0.125_15_0.pt'], ('citeseer', 'gcond', 0.125, 1): ['distil_data/gcond/adj_citeseer_0.125_1_0.pt', 'distil_data/gcond/feat_citeseer_0.125_1_0.pt'], ('citeseer', 'gcond', 0.125, 85): ['distil_data/gcond/adj_citeseer_0.125_85_0.pt', 'distil_data/gcond/feat_citeseer_0.125_85_0.pt'], ('citeseer', 'gcond', 0.25, 1): ['distil_data/gcond/adj_citeseer_0.25_1_0.pt', 'distil_data/gcond/feat_citeseer_0.25_1_0.pt'], ('citeseer', 'gcond', 0.25, 15): ['distil_data/gcond/adj_citeseer_0.25_15_0.pt', 'distil_data/gcond/feat_citeseer_0.25_15_0.pt'], ('citeseer', 'gcond', 0.25, 85): ['distil_data/gcond/adj_citeseer_0.25_85_0.pt', 'distil_data/gcond/feat_citeseer_0.25_85_0.pt'], ('citeseer', 'gcond', 0.5, 15): ['distil_data/gcond/adj_citeseer_0.5_15_0.pt', 'distil_data/gcond/feat_citeseer_0.5_15_0.pt'], ('citeseer', 'gcond', 0.5, 85): ['distil_data/gcond/adj_citeseer_0.5_85_0.pt', 'distil_data/gcond/feat_citeseer_0.5_85_0.pt'], ('citeseer', 'gcond', 0.5, 1): ['distil_data/gcond/adj_citeseer_0.5_1_0.pt', 'distil_data/gcond/feat_citeseer_0.5_1_0.pt'], ('cora', 'gcond', 0.125, 15): ['distil_data/gcond/adj_cora_0.125_15_0.pt', 'distil_data/gcond/feat_cora_0.125_15_0.pt'], ('cora', 'gcond', 0.125, 1): ['distil_data/gcond/adj_cora_0.125_1_0.pt', 'distil_data/gcond/feat_cora_0.125_1_0.pt'], ('cora', 'gcond', 0.125, 85): ['distil_data/gcond/adj_cora_0.125_85_0.pt', 'distil_data/gcond/feat_cora_0.125_85_0.pt'], ('cora', 'gcond', 0.25, 15): ['distil_data/gcond/adj_cora_0.25_15_0.pt', 'distil_data/gcond/feat_cora_0.25_15_0.pt'], ('cora', 'gcond', 0.25, 85): ['distil_data/gcond/adj_cora_0.25_85_0.pt', 'distil_data/gcond/feat_cora_0.25_85_0.pt'], ('cora', 'gcond', 0.25, 1): ['distil_data/gcond/adj_cora_0.25_1_0.pt', 'distil_data/gcond/feat_cora_0.25_1_0.pt'], ('cora', 'gcond', 0.5, 85): ['distil_data/gcond/adj_cora_0.5_85_0.pt', 'distil_data/gcond/feat_cora_0.5_85_0.pt'], ('cora', 'gcond', 0.5, 15): ['distil_data/gcond/adj_cora_0.5_15_0.pt', 'distil_data/gcond/feat_cora_0.5_15_0.pt'], ('cora', 'gcond', 0.5, 1): ['distil_data/gcond/adj_cora_0.5_1_0.pt', 'distil_data/gcond/feat_cora_0.5_1_0.pt'],
           ('citeseer', 'one_step', 0.125, 15): ['distil_data/one_step/adj_citeseer_0.125_15_1.pt', 'distil_data/one_step/feat_citeseer_0.125_15_1.pt'], ('citeseer', 'one_step', 0.125, 85): ['distil_data/one_step/adj_citeseer_0.125_85_1.pt', 'distil_data/one_step/feat_citeseer_0.125_85_1.pt'], ('citeseer', 'one_step', 0.125, 1): ['distil_data/one_step/adj_citeseer_0.125_1_1.pt', 'distil_data/one_step/feat_citeseer_0.125_1_1.pt'], ('citeseer', 'one_step', 0.25, 85): ['distil_data/one_step/adj_citeseer_0.25_85_1.pt', 'distil_data/one_step/feat_citeseer_0.25_85_1.pt'], ('citeseer', 'one_step', 0.25, 15): ['distil_data/one_step/adj_citeseer_0.25_15_1.pt', 'distil_data/one_step/feat_citeseer_0.25_15_1.pt'], ('citeseer', 'one_step', 0.25, 1): ['distil_data/one_step/adj_citeseer_0.25_1_1.pt', 'distil_data/one_step/feat_citeseer_0.25_1_1.pt'], ('citeseer', 'one_step', 0.5, 15): ['distil_data/one_step/adj_citeseer_0.5_15_1.pt', 'distil_data/one_step/feat_citeseer_0.5_15_1.pt'], ('citeseer', 'one_step', 0.5, 85): ['distil_data/one_step/adj_citeseer_0.5_85_1.pt', 'distil_data/one_step/feat_citeseer_0.5_85_1.pt'], ('citeseer', 'one_step', 0.5, 1): ['distil_data/one_step/adj_citeseer_0.5_1_1.pt', 'distil_data/one_step/feat_citeseer_0.5_1_1.pt'], ('cora', 'one_step', 0.125, 85): ['distil_data/one_step/adj_cora_0.125_85_1.pt', 'distil_data/one_step/feat_cora_0.125_85_1.pt'], ('cora', 'one_step', 0.125, 15): ['distil_data/one_step/adj_cora_0.125_15_1.pt', 'distil_data/one_step/feat_cora_0.125_15_1.pt'], ('cora', 'one_step', 0.125, 1): ['distil_data/one_step/adj_cora_0.125_1_1.pt', 'distil_data/one_step/feat_cora_0.125_1_1.pt'], ('cora', 'one_step', 0.25, 85): ['distil_data/one_step/adj_cora_0.25_85_1.pt', 'distil_data/one_step/feat_cora_0.25_85_1.pt'], ('cora', 'one_step', 0.25, 15): ['distil_data/one_step/adj_cora_0.25_15_1.pt', 'distil_data/one_step/feat_cora_0.25_15_1.pt'], ('cora', 'one_step', 0.25, 1): ['distil_data/one_step/adj_cora_0.25_1_1.pt', 'distil_data/one_step/feat_cora_0.25_1_1.pt'], ('cora', 'one_step', 0.5, 15): ['distil_data/one_step/adj_cora_0.5_15_1.pt', 'distil_data/one_step/feat_cora_0.5_15_1.pt'], ('cora', 'one_step', 0.5, 85): ['distil_data/one_step/adj_cora_0.5_85_1.pt', 'distil_data/one_step/feat_cora_0.5_85_1.pt'], ('cora', 'one_step', 0.5, 1): ['distil_data/one_step/adj_cora_0.5_1_1.pt', 'distil_data/one_step/feat_cora_0.5_1_1.pt'],
          ('citeseer', 'sfgc', 0.125, 37): ['distil_data/sfgc/adj_citeseer_0.125_best_ntk_score_37.pt', 'distil_data/sfgc/feat_citeseer_0.125_best_ntk_score_37.pt'], ('citeseer', 'sfgc', 0.125, 15): ['distil_data/sfgc/adj_citeseer_0.125_best_ntk_score_15.pt', 'distil_data/sfgc/feat_citeseer_0.125_best_ntk_score_15.pt'], ('citeseer', 'sfgc', 0.125, 31): ['distil_data/sfgc/adj_citeseer_0.125_best_ntk_score_31.pt', 'distil_data/sfgc/feat_citeseer_0.125_best_ntk_score_31.pt'], ('citeseer', 'sfgc', 0.25, 31): ['distil_data/sfgc/adj_citeseer_0.25_best_ntk_score_31.pt', 'distil_data/sfgc/feat_citeseer_0.25_best_ntk_score_31.pt'], ('citeseer', 'sfgc', 0.25, 15): ['distil_data/sfgc/adj_citeseer_0.25_best_ntk_score_15.pt', 'distil_data/sfgc/feat_citeseer_0.25_best_ntk_score_15.pt'], ('citeseer', 'sfgc', 0.25, 37): ['distil_data/sfgc/adj_citeseer_0.25_best_ntk_score_37.pt', 'distil_data/sfgc/feat_citeseer_0.25_best_ntk_score_37.pt'], ('citeseer', 'sfgc', 0.5, 15): ['distil_data/sfgc/adj_citeseer_0.5_best_ntk_score_15.pt', 'distil_data/sfgc/feat_citeseer_0.5_best_ntk_score_15.pt'], ('citeseer', 'sfgc', 0.5, 31): ['distil_data/sfgc/adj_citeseer_0.5_best_ntk_score_31.pt', 'distil_data/sfgc/feat_citeseer_0.5_best_ntk_score_31.pt'], ('citeseer', 'sfgc', 0.5, 37): ['distil_data/sfgc/adj_citeseer_0.5_best_ntk_score_37.pt', 'distil_data/sfgc/feat_citeseer_0.5_best_ntk_score_37.pt'], ('cora', 'sfgc', 0.125, 15): ['distil_data/sfgc/adj_cora_0.125_best_ntk_score_15.pt', 'distil_data/sfgc/feat_cora_0.125_best_ntk_score_15.pt'], ('cora', 'sfgc', 0.125, 37): ['distil_data/sfgc/adj_cora_0.125_best_ntk_score_37.pt', 'distil_data/sfgc/feat_cora_0.125_best_ntk_score_37.pt'], ('cora', 'sfgc', 0.125, 31): ['distil_data/sfgc/adj_cora_0.125_best_ntk_score_31.pt', 'distil_data/sfgc/feat_cora_0.125_best_ntk_score_31.pt'], ('cora', 'sfgc', 0.25, 15): ['distil_data/sfgc/adj_cora_0.25_best_ntk_score_15.pt', 'distil_data/sfgc/feat_cora_0.25_best_ntk_score_15.pt'], ('cora', 'sfgc', 0.25, 37): ['distil_data/sfgc/adj_cora_0.25_best_ntk_score_37.pt', 'distil_data/sfgc/feat_cora_0.25_best_ntk_score_37.pt'], ('cora', 'sfgc', 0.25, 31): ['distil_data/sfgc/adj_cora_0.25_best_ntk_score_31.pt', 'distil_data/sfgc/feat_cora_0.25_best_ntk_score_31.pt'], ('cora', 'sfgc', 0.5, 15): ['distil_data/sfgc/adj_cora_0.5_best_ntk_score_15.pt', 'distil_data/sfgc/feat_cora_0.5_best_ntk_score_15.pt'], ('cora', 'sfgc', 0.5, 37): ['distil_data/sfgc/adj_cora_0.5_best_ntk_score_37.pt', 'distil_data/sfgc/feat_cora_0.5_best_ntk_score_37.pt'], ('cora', 'sfgc', 0.5, 31): ['distil_data/sfgc/adj_cora_0.5_best_ntk_score_31.pt', 'distil_data/sfgc/feat_cora_0.5_best_ntk_score_31.pt'],
            ('citeseer', 'sgdd', 0.125, 1000): ['distil_data/sgdd/adj_citeseer_0.125_1000.pt', 'distil_data/sgdd/feat_citeseer_0.125_1000.pt'], ('citeseer', 'sgdd', 0.125, 1): ['distil_data/sgdd/adj_citeseer_0.125_1.pt', 'distil_data/sgdd/feat_citeseer_0.125_1.pt'], ('citeseer', 'sgdd', 0.125, 120): ['distil_data/sgdd/adj_citeseer_0.125_120.pt', 'distil_data/sgdd/feat_citeseer_0.125_120.pt'], ('citeseer', 'sgdd', 0.125, 85): ['distil_data/sgdd/adj_citeseer_0.125_85.pt', 'distil_data/sgdd/feat_citeseer_0.125_85.pt'], ('citeseer', 'sgdd', 0.125, 15): ['distil_data/sgdd/adj_citeseer_0.125_15.pt', 'distil_data/sgdd/feat_citeseer_0.125_15.pt'], ('citeseer', 'sgdd', 0.25, 1): ['distil_data/sgdd/adj_citeseer_0.25_1.pt', 'distil_data/sgdd/feat_citeseer_0.25_1.pt'], ('citeseer', 'sgdd', 0.25, 120): ['distil_data/sgdd/adj_citeseer_0.25_120.pt', 'distil_data/sgdd/feat_citeseer_0.25_120.pt'], ('citeseer', 'sgdd', 0.25, 15): ['distil_data/sgdd/adj_citeseer_0.25_15.pt', 'distil_data/sgdd/feat_citeseer_0.25_15.pt'], ('citeseer', 'sgdd', 0.25, 1000): ['distil_data/sgdd/adj_citeseer_0.25_1000.pt', 'distil_data/sgdd/feat_citeseer_0.25_1000.pt'], ('citeseer', 'sgdd', 0.25, 85): ['distil_data/sgdd/adj_citeseer_0.25_85.pt', 'distil_data/sgdd/feat_citeseer_0.25_85.pt'], ('citeseer', 'sgdd', 0.5, 1000): ['distil_data/sgdd/adj_citeseer_0.5_1000.pt', 'distil_data/sgdd/feat_citeseer_0.5_1000.pt'], ('citeseer', 'sgdd', 0.5, 1): ['distil_data/sgdd/adj_citeseer_0.5_1.pt', 'distil_data/sgdd/feat_citeseer_0.5_1.pt'], ('citeseer', 'sgdd', 0.5, 120): ['distil_data/sgdd/adj_citeseer_0.5_120.pt', 'distil_data/sgdd/feat_citeseer_0.5_120.pt'], ('citeseer', 'sgdd', 0.5, 15): ['distil_data/sgdd/adj_citeseer_0.5_15.pt', 'distil_data/sgdd/feat_citeseer_0.5_15.pt'], ('citeseer', 'sgdd', 0.5, 85): ['distil_data/sgdd/adj_citeseer_0.5_85.pt', 'distil_data/sgdd/feat_citeseer_0.5_85.pt'], ('cora', 'sgdd', 0.125, 1): ['distil_data/sgdd/adj_cora_0.125_1.pt', 'distil_data/sgdd/feat_cora_0.125_1.pt'], ('cora', 'sgdd', 0.125, 1000): ['distil_data/sgdd/adj_cora_0.125_1000.pt', 'distil_data/sgdd/feat_cora_0.125_1000.pt'], ('cora', 'sgdd', 0.125, 120): ['distil_data/sgdd/adj_cora_0.125_120.pt', 'distil_data/sgdd/feat_cora_0.125_120.pt'], ('cora', 'sgdd', 0.125, 15): ['distil_data/sgdd/adj_cora_0.125_15.pt', 'distil_data/sgdd/feat_cora_0.125_15.pt'], ('cora', 'sgdd', 0.125, 85): ['distil_data/sgdd/adj_cora_0.125_85.pt', 'distil_data/sgdd/feat_cora_0.125_85.pt'], ('cora', 'sgdd', 0.25, 1): ['distil_data/sgdd/adj_cora_0.25_1.pt', 'distil_data/sgdd/feat_cora_0.25_1.pt'], ('cora', 'sgdd', 0.25, 1000): ['distil_data/sgdd/adj_cora_0.25_1000.pt', 'distil_data/sgdd/feat_cora_0.25_1000.pt'], ('cora', 'sgdd', 0.25, 120): ['distil_data/sgdd/adj_cora_0.25_120.pt', 'distil_data/sgdd/feat_cora_0.25_120.pt'], ('cora', 'sgdd', 0.25, 15): ['distil_data/sgdd/adj_cora_0.25_15.pt', 'distil_data/sgdd/feat_cora_0.25_15.pt'], ('cora', 'sgdd', 0.25, 85): ['distil_data/sgdd/adj_cora_0.25_85.pt', 'distil_data/sgdd/feat_cora_0.25_85.pt'], ('cora', 'sgdd', 0.5, 1): ['distil_data/sgdd/adj_cora_0.5_1.pt', 'distil_data/sgdd/feat_cora_0.5_1.pt'], ('cora', 'sgdd', 0.5, 1000): ['distil_data/sgdd/adj_cora_0.5_1000.pt', 'distil_data/sgdd/feat_cora_0.5_1000.pt'], ('cora', 'sgdd', 0.5, 120): ['distil_data/sgdd/adj_cora_0.5_120.pt', 'distil_data/sgdd/feat_cora_0.5_120.pt'], ('cora', 'sgdd', 0.5, 15): ['distil_data/sgdd/adj_cora_0.5_15.pt', 'distil_data/sgdd/feat_cora_0.5_15.pt'], ('cora', 'sgdd', 0.5, 85): ['distil_data/sgdd/adj_cora_0.5_85.pt', 'distil_data/sgdd/feat_cora_0.5_85.pt'],
           }

In [11]:
a.run_experiment(pathlist)

citeseer 0.125 gcond 15
citeseer 0.125 gcond 1
citeseer 0.125 gcond 85
citeseer 0.25 gcond 1
citeseer 0.25 gcond 15
citeseer 0.25 gcond 85
citeseer 0.5 gcond 15
citeseer 0.5 gcond 85
citeseer 0.5 gcond 1
cora 0.125 gcond 15
cora 0.125 gcond 1
cora 0.125 gcond 85
cora 0.25 gcond 15
cora 0.25 gcond 85
cora 0.25 gcond 1
cora 0.5 gcond 85
cora 0.5 gcond 15
cora 0.5 gcond 1
citeseer 0.125 one_step 15
citeseer 0.125 one_step 85
citeseer 0.125 one_step 1
citeseer 0.25 one_step 85
citeseer 0.25 one_step 15
citeseer 0.25 one_step 1
citeseer 0.5 one_step 15
citeseer 0.5 one_step 85
citeseer 0.5 one_step 1
cora 0.125 one_step 85
cora 0.125 one_step 15
cora 0.125 one_step 1
cora 0.25 one_step 85
cora 0.25 one_step 15
cora 0.25 one_step 1
cora 0.5 one_step 15
cora 0.5 one_step 85
cora 0.5 one_step 1
citeseer 0.125 sfgc 37
citeseer 0.125 sfgc 15
citeseer 0.125 sfgc 31
citeseer 0.25 sfgc 31
citeseer 0.25 sfgc 15
citeseer 0.25 sfgc 37
citeseer 0.5 sfgc 15
citeseer 0.5 sfgc 31
citeseer 0.5 sfgc 37
cora

In [12]:
#             adj_path =   "distil_data/sgdd_adj_cora_0.5_1.pt" #"distil_data/adj_cora_0.5_1_0.pt"
#             feat_path =  "distil_data/sgdd_feat_cora_0.5_1.pt" #"distil_data/feat_cora_0.5_1_0.pt" 

In [13]:
a.out.results_85

Unnamed: 0_level_0,Unnamed: 1_level_0,Modeltype,gcn,gcn,gcn,gcn,gat,gat,gat,gat
Unnamed: 0_level_1,Unnamed: 1_level_1,Entropy Buckets,low,mid,high,mean,low,mid,high,mean
Methods,Dataset,Compression ratio,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
one_step,cora,0.125,9,14,28,3.442353,1,3,47,6.343971
one_step,cora,0.25,60,33,12,1.384152,1,7,97,5.559464
one_step,cora,0.5,89,65,56,1.918847,1,0,209,5.994501
one_step,citeseer,0.125,7,29,9,5.635073,0,3,42,7.611097
one_step,citeseer,0.25,83,7,0,3.431211,0,11,79,7.554389
one_step,citeseer,0.5,162,18,0,2.773241,0,5,175,7.599912
gcond,cora,0.125,0,0,51,6.716129,0,0,51,7.165892
gcond,cora,0.25,0,1,104,6.619936,0,0,105,7.643294
gcond,cora,0.5,0,8,202,6.174155,0,0,210,7.742887
gcond,citeseer,0.125,0,0,45,7.613433,0,0,45,7.724788


In [14]:
a.out.average_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Modeltype,gcn,gcn,gcn,gcn,gat,gat,gat,gat
Unnamed: 0_level_1,Unnamed: 1_level_1,Entropy Buckets,low,mid,high,mean,low,mid,high,mean
Methods,Dataset,Compression ratio,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
one_step,cora,0.125,9.333333,12.333333,29.333333,3.80658,0.666667,2.666667,47.666667,6.742618
one_step,cora,0.25,61.666667,30.333333,13.0,1.753889,0.333333,5.333333,99.333333,6.093915
one_step,cora,0.5,92.0,75.0,43.0,2.129175,1.0,0.666667,208.333333,6.519339
one_step,citeseer,0.125,8.333333,28.0,8.666667,4.736338,1.333333,4.333333,39.333333,7.175711
one_step,citeseer,0.25,83.333333,6.666667,0.0,2.411532,1.666667,23.666667,64.666667,6.899788
one_step,citeseer,0.5,163.333333,16.666667,0.0,1.934453,0.666667,12.0,167.333333,7.137544
gcond,cora,0.125,0.0,0.0,51.0,6.812994,0.0,0.0,51.0,7.471947
gcond,cora,0.25,0.0,0.666667,104.333333,6.758482,0.0,0.0,105.0,7.838437
gcond,cora,0.5,0.0,8.0,202.0,6.310214,0.0,0.0,210.0,7.915266
gcond,citeseer,0.125,0.0,0.0,45.0,7.519919,0.0,0.0,45.0,7.661268


In [15]:
# dataname,mtype_lower,seed = "cora","gcn",1
# dataset = Planetoid(root='data', name= dataname)
# if mtype_lower=="gcn":
#     model = GCN(dataset.num_features,dataset.num_classes)
# elif mtype_lower=="gat":
#     model = GAT(dataset.num_features,dataset.num_classes)
# model.load_state_dict(torch.load(f'Bench_model_{dataname}_{mtype_lower}_{seed}.pt'))

# # B) find entropy of all training points of cora dataset. (sort them)
# # make model fixed and extract dat
# model.eval().to(device)
# data= dataset[0].to(device)
# print("-- at Find Entropy, datamask shape", data.train_mask.shape)
# # need a forward pass to find score and embed
# score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

# # need to apply softmax to convert scores to probability
# runsoftmax = torch.nn.Softmax(dim=1)
# nprob = runsoftmax(score)
# trainmask = data.train_mask
# nprob_train = nprob[trainmask]

# # Calculate entropy for each data point
# entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
# entropy_data

# # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
# # Sort the tensor
# sorted_tensor, indices = torch.sort(entropy_data)
# # print("benchmark: ", sorted_tensor)
# # Calculate the range indices
# total_elements = len(sorted_tensor)
# low_range_end = total_elements // 3
# mid_range_end = 2 * (total_elements // 3)

# boundaries = [sorted_tensor[low_range_end].item(),sorted_tensor[mid_range_end].item()]

# low = [sorted_tensor[0],sorted_tensor[low_range_end]]
# mid = [sorted_tensor[low_range_end], sorted_tensor[mid_range_end]]
# high= [sorted_tensor[-1],sorted_tensor[mid_range_end]]

# # print("range buckets are: ",low , mid, high, boundaries )

In [16]:
# G = nx.Graph()
# G.add_nodes_from([1,2])
# G.add_edges_from([(1,2),(2,1)])
# G
# dataname,mtype_lower,seed = "cora","gcn",1
# dataset = Planetoid(root='data', name= dataname)
# if mtype_lower=="gcn":
#     model = GCN(dataset.num_features,dataset.num_classes)
# elif mtype_lower=="gat":
#     model = GAT(dataset.num_features,dataset.num_classes)
# model.load_state_dict(torch.load(f'Bench_model_{dataname}_{mtype_lower}_{seed}.pt'))

# B) find entropy of all training points of cora dataset. (sort them)
# make model fixed and extract dat

# model.eval().to(device)
# data= dataset[0].to(device)
# print("-- at Find Entropy, datamask shape", data.train_mask.shape)
# # need a forward pass to find score and embed
# score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

# # need to apply softmax to convert scores to probability
# runsoftmax = torch.nn.Softmax(dim=1)
# nprob = runsoftmax(score)
# trainmask = data.train_mask
# nprob_train = nprob[trainmask]

# # Calculate entropy for each data point
# entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
# entropy_data

# # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
# # Sort the tensor
# sorted_tensor, indices = torch.sort(entropy_data)
# # print("benchmark: ", sorted_tensor)
# # Calculate the range indices
# total_elements = len(sorted_tensor)
# low_range_end = total_elements // 3
# mid_range_end = 2 * (total_elements // 3)

# boundaries = [sorted_tensor[low_range_end].item(),sorted_tensor[mid_range_end].item()]

# low = [sorted_tensor[0],sorted_tensor[low_range_end]]
# mid = [sorted_tensor[low_range_end], sorted_tensor[mid_range_end]]
# high= [sorted_tensor[-1],sorted_tensor[mid_range_end]]

# print("range buckets are: ",low , mid, high, boundaries )

# dataname,mtype_lower,seed = "cora","gcn",1
# dataset = Planetoid(root='data', name= dataname)
# if mtype_lower=="gcn":
#     model = GCN(dataset.num_features,dataset.num_classes)
# elif mtype_lower=="gat":
#     model = GAT(dataset.num_features,dataset.num_classes)
# model.load_state_dict(torch.load(f'Bench_model_{dataname}_{mtype_lower}_{seed}.pt'))

# # B) find entropy of all training points of cora dataset. (sort them)
# # make model fixed and extract dat
# model.eval().to(device)
# data= dataset[0].to(device)
# print("-- at Find Entropy, datamask shape", data.train_mask.shape)
# # need a forward pass to find score and embed
# score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

# # need to apply softmax to convert scores to probability
# runsoftmax = torch.nn.Softmax(dim=1)
# nprob = runsoftmax(score)
# trainmask = data.train_mask
# nprob_train = nprob[trainmask]

# # Calculate entropy for each data point
# entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
# entropy_data

# # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
# # Sort the tensor
# sorted_tensor, indices = torch.sort(entropy_data)
# # print("benchmark: ", sorted_tensor)
# # Calculate the range indices
# total_elements = len(sorted_tensor)
# low_range_end = total_elements // 3
# mid_range_end = 2 * (total_elements // 3)

# boundaries = [sorted_tensor[low_range_end].item(),sorted_tensor[mid_range_end].item()]

# low = [sorted_tensor[0],sorted_tensor[low_range_end]]
# mid = [sorted_tensor[low_range_end], sorted_tensor[mid_range_end]]
# high= [sorted_tensor[-1],sorted_tensor[mid_range_end]]

# print("range buckets are: ", boundaries )


# # load a distilled dataset - cora/gcond/0.5/seed
# adj_path =    "distil_data/adj_cora_0.5_1_0.pt" #"distil_data/sgdd_adj_cora_0.5_1.pt"
# feat_path =   "distil_data/feat_cora_0.5_1_0.pt" #"distil_data/sgdd_feat_cora_0.5_1.pt"
# adj = torch.load(adj_path)
# feat = torch.load(feat_path).to(device)
# adj[adj<0.5]=0   # remove anything spurious

# # Convert to a NetworkX graph
# graph = nx.Graph(adj.cpu().numpy())

# # Get the edge list (edge index)
# edge_list = list(graph.edges)

# # Convert the edge list to a PyTorch tensor
# edge_index = torch.tensor(edge_list).t().contiguous().to(device)

# # Extract edge attributes (probabilities) from the adjacency matrix
# edge_attr = []
# for edge in edge_index.t().tolist():
#     # Assuming the adjacency matrix is a NumPy array
#     edge_attr.append(adj[edge[0], edge[1]])

# # Convert the edge_attr list to a PyTorch tensor
# edge_attr = torch.tensor(edge_attr, dtype=torch.float32)

# graph_data = Data(x=feat, edge_index=edge_index, edge_attr=edge_attr)
# # print(graph_data.edge_attr)

# data= graph_data.to(device)


# # need a forward pass to find score and embed
# score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

# # need to apply softmax to convert scores to probability
# runsoftmax = torch.nn.Softmax(dim=1)
# nprob = runsoftmax(score)
# nprob_train = nprob

# # Calculate entropy for each data point
# entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
# # print(entropy_data)

# # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
# # Sort the tensor
# sorted_tensor, indices = torch.sort(entropy_data)
# print(sorted_tensor)
# # Print - what % of points lie in each bucket.
# # Boolean indexing to filter values between 3 and 4
# low_values = (sorted_tensor < boundaries[0])
# mid_values = (sorted_tensor >= boundaries[0]) & (sorted_tensor < boundaries[1])
# high_values = (sorted_tensor >= boundaries[1])

# # Count the number of True values
# low_count = torch.sum(low_values).item()
# mid_count = torch.sum(mid_values).item()
# high_count = torch.sum(high_values).item()

# print(low_count, mid_count, high_count)

In [17]:
# # I) Create entropy buckets
# # A) load a cora- Bench model of GCN
# dataname,mtype_lower,seed = "cora","gcn",1
# dataset = Planetoid(root='data', name= dataname)
# if mtype_lower=="gcn":
#     model = GCN(dataset.num_features,dataset.num_classes)
# elif mtype_lower=="gat":
#     model = GAT(dataset.num_features,dataset.num_classes)
# model.load_state_dict(torch.load(f'Bench_model_{dataname}_{mtype_lower}_{seed}.pt'))

# # B) find entropy of all training points of cora dataset. (sort them)
# # make model fixed and extract dat
# model.eval().to(device)
# data= dataset[0].to(device)
# print("-- at Find Entropy, datamask shape", data.train_mask.shape)
# # need a forward pass to find score and embed
# score, embed = model.forward(data.x, data.edge_index, data.edge_attr)

# # need to apply softmax to convert scores to probability
# runsoftmax = torch.nn.Softmax(dim=1)
# nprob = runsoftmax(score)
# trainmask = data.train_mask
# nprob_train = nprob[trainmask]

# # Calculate entropy for each data point
# entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)


# # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
# # Sort the tensor
# sorted_tensor, indices = torch.sort(entropy_data)
# print("benchmark: ", sorted_tensor)
# # Calculate the range indices
# total_elements = len(sorted_tensor)
# low_range_end = total_elements // 3
# mid_range_end = 2 * (total_elements // 3)

# boundaries = [sorted_tensor[low_range_end].item(),sorted_tensor[mid_range_end].item()]

# low = [sorted_tensor[0],sorted_tensor[low_range_end]]
# mid = [sorted_tensor[low_range_end], sorted_tensor[mid_range_end]]
# high= [sorted_tensor[-1],sorted_tensor[mid_range_end]]

# # print("range buckets are: ",low , mid, high, boundaries )

# # II) what % lies where
# # load a distilled dataset - cora/gcond/0.5/seed
# adj_path =  "distil_data/adj_cora_0.5_1_0.pt" #"distil_data/sgdd_adj_cora_0.5_1.pt"
# feat_path =  "distil_data/feat_cora_0.5_1_0.pt"  #"distil_data/sgdd_feat_cora_0.5_1.pt"
# adj = torch.load(adj_path)
# feat = torch.load(feat_path).to(device)
# adj[adj<0.05]=0   # remove anything spurious

# # Convert to a NetworkX graph
# graph = nx.Graph(adj.cpu().numpy())

# # Get the edge list (edge index)
# edge_list = list(graph.edges)

# # Convert the edge list to a PyTorch tensor
# edge_index = torch.tensor(edge_list).t().contiguous().to(device)

# # Extract edge attributes (probabilities) from the adjacency matrix
# edge_attr = []
# for edge in edge_index.t().tolist():
#     # Assuming the adjacency matrix is a NumPy array
#     edge_attr.append(adj[edge[0], edge[1]])

# # Convert the edge_attr list to a PyTorch tensor
# edge_attr = torch.tensor(edge_attr, dtype=torch.float32)

# graph_data = Data(x=feat, edge_index=edge_index, edge_attr=edge_attr)
# print(graph_data)


# # find entropy of each node using the bench model
# # dataset = graph_data
# data= graph_data.to(device)


# # need a forward pass to find score and embed
# score, embed = model.forward(data.x, data.edge_index,data.edge_attr)

# # need to apply softmax to convert scores to probability
# runsoftmax = torch.nn.Softmax(dim=1)
# nprob = runsoftmax(score)
# nprob_train = nprob

# # Calculate entropy for each data point
# entropy_data = -torch.sum(nprob_train * torch.log2(nprob_train + 1e-10), dim=1)
# # print(entropy_data)

# # C) create three buckets - low, mid, high entropy. (based on equal number of points in each)
# # Sort the tensor
# sorted_tensor, indices = torch.sort(entropy_data)
# print(sorted_tensor)
# # Print - what % of points lie in each bucket.
# # Boolean indexing to filter values between 3 and 4
# low_values = (sorted_tensor < boundaries[0])
# mid_values = (sorted_tensor >= boundaries[0]) & (sorted_tensor < boundaries[1])
# high_values = (sorted_tensor >= boundaries[1])

# # Count the number of True values
# low_count = torch.sum(low_values).item()
# mid_count = torch.sum(mid_values).item()
# high_count = torch.sum(high_values).item()

# print(boundaries)
# print(low_count, mid_count, high_count)

In [18]:
# G = nx.Graph()
# G.add_nodes_from([1,2])
# G.add_edges_from([(1,2),(2,1)])
# G

In [19]:
# torch.load("distil_data/sgdd_label_cora_0.5_1.pt")

In [20]:
# /home/evl/jkakka4/data/exp3/