In [1]:
DATASET = 'NCI109'
if 'first_run' not in globals():
    !pip install torch torch-geometric
    first_run = False
else:
    pass


Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [2]:
import hashlib
import numpy as np
import torch
from torch_geometric.transforms import BaseTransform
from torch_geometric.utils import to_scipy_sparse_matrix
from torch_geometric.data import Data
from torch_geometric.io import fs, read_txt_array
import torch_geometric.transforms as T
# from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data, InMemoryDataset
import math
from typing import Callable, List, Optional, Tuple, Dict
from torch import Tensor
import os
import requests
from torch_geometric.utils import coalesce, cumsum, one_hot, remove_self_loops

In [3]:
# class MyTransform(BaseTransform):
#     def __init__(self, set_ids = False, indices = None, set_initial_features=False, embedding=None, cos_sim=None, z_score=None):
#         super(MyTransform, self).__init__()
#         self.current_id = 0  # Initialize a counter for unique IDs
#         self.set_initial_features = set_initial_features
#         self.embedding = embedding  # Assume this is a function if not None
#         self.cos_sim = cos_sim      # Assume this is a dictionary or list if not None
#         self.z_score = z_score      # Assume this is a dictionary or list if not None
#         self.indices = indices
#         if set_ids:
#             self.assign_ids()

#     def assign_ids(self):
#         # Create an ID tensor for all data points
#         id_tensor = torch.arange(self.indices)
#         for i, data in enumerate(self):
#             data.id = id_tensor[i]

#     def __call__(self, data: Data) -> Data:
#         # Assign a unique ID and increment the counter
#         if self.set_initial_features:
#             data.id = self.current_id
#             self.current_id += 1
#             data.num_edges = data.edge_index.size(1)  # Number of edges
#             data.adj_mat = self.to_numpy_adj(data)  # Convert edge index to adjacency matrix

#         if self.embedding:
#             data.embedding = self.embedding[data.id] # Assume embedding function takes Data and modifies it

#         if self.cos_sim is not None:
#             data.cos_sim = self.cos_sim[data.id]  # Fetch cosine similarity based on ID

#         if self.z_score is not None:
#             data.z_score = self.z_score[data.id]  # Fetch z-score based on ID

#         return data

#     def to_numpy_adj(self, data):
#         # Creates an adjacency matrix from edge_index
#         adj_mat = np.zeros((data.num_nodes, data.num_nodes))
#         edge_index = data.edge_index.numpy()
#         adj_mat[edge_index[0], edge_index[1]] = 1
#         return adj_mat

In [4]:
class IDAddingDataset:
    def __init__(self, dataset, attr_func):
        self.dataset = dataset
        self.attr_func = attr_func
        self.indices = np.arange(0, len(dataset.indices()))
    
    def __getitem__(self, idx):
        data = self.dataset[idx].clone()  # Clone the data to avoid modifying the original dataset
        data.id = self.indices[idx]
        data.num_edges = self.dataset[idx].edge_index.size(1)
        data.adj_mat = to_scipy_sparse_matrix(self.dataset[idx].edge_index, num_nodes=self.dataset[idx].num_nodes).toarray().astype(int)
        return data
    
    def __len__(self):
        return len(self.dataset)


In [5]:
class EmbeddingAddingDataset:
    def __init__(self, dataset, embedding_dict):
        self.dataset = dataset
        self.embedding_dict = embedding_dict
    
    def __getitem__(self, idx):
        data = self.dataset[idx].clone()  # Clone the data to avoid modifying the original dataset
        data.embedding = self.embedding_dict[self.dataset[idx].id]
        return data
    
    def __len__(self):
        return len(self.dataset)

In [6]:
class FinalAddingDataset:
    def __init__(self, Embedding_dataset, cos_sims, all_z_scores):
        self.dataset = Embedding_dataset
        self.cos_sims = cos_sims
        self.all_z_scores = all_z_scores
        
    
    def __getitem__(self, idx):
        data = self.dataset[idx].clone()  # Clone the data to avoid modifying the original dataset
        data.cos_sim = self.cos_sims[self.dataset[idx].id]
        data.z_score = self.all_z_scores[self.dataset[idx].id]
        return data
    
    def __len__(self):
        return len(self.dataset)

In [7]:
def stable_hash(value):
    return hashlib.md5(str(value).encode()).hexdigest()
def get_WL_embedding(data, n_iter):
    graph = data.adj_mat
    graph_hash_dict = {}
    labels = [np.sum(graph[x]) for x in range(len(graph))]  # Initialize labels based on node degrees
    for _ in range(n_iter):
        neighbor_labels = [sorted([labels[j] for j in range(len(graph)) if graph[i, j] == 1]) for i in range(len(graph))]
        hashes = np.array([stable_hash((labels[i], tuple(neighbor_labels[i]))) for i in range(len(graph))])
        
        for unique_hash in set(hashes):
            graph_hash_dict[unique_hash] = np.sum(hashes == unique_hash)

        labels = hashes.tolist()

    return graph_hash_dict

In [8]:
def get_mean_vectors(dataset):
    all_hashes = set()
    all_classes = set()
    for graph in dataset:
        all_hashes.update(graph.embedding.keys())
        all_classes.add(graph.y.item())
    all_classes.add('all')

    vector_sum_dict = {_cls: {h: 0 for h in all_hashes} for _cls in all_classes}
    category_sum_dict = {_cls: 0 for _cls in all_classes}
    mean_vector_dict = {_cls: {h: 0 for h in all_hashes} for _cls in all_classes}
    
    
    for graph in dataset:
        for _hash, count in graph.embedding.items():
            vector_sum_dict[graph.y.item()][_hash] += count
            category_sum_dict[graph.y.item()] += 1
            vector_sum_dict['all'][_hash] += count
            category_sum_dict['all'] += 1
    for _cls, hash_counts in vector_sum_dict.items():
        for _hash, count in hash_counts.items():
            mean_vector_dict[_cls][_hash] = count/category_sum_dict[_cls]
    return all_hashes, all_classes, mean_vector_dict

def get_cos_sim(dataset, all_hashes, all_classes, mean_vectors):
    cos_sims, norm_dict = {}, {}
    graph_norm = math.sqrt(sum(count**2 for count in graph.embedding.values()))
    for _class in all_classes:
        norm_dict[_class] = math.sqrt(sum(count**2 for count in mean_vectors[_class].values()))
    all_numerator = sum(count * mean_vectors['all'][_hash] for _hash, count in graph.embedding.items())
    all_denom = norm_dict['all'] * graph_norm
    cos_sims['all'] = all_numerator / all_denom

    class_numerator = sum(count * mean_vectors[graph.y.item()][_hash] for _hash, count in graph.embedding.items())
    class_denom = norm_dict[graph.y.item()] * graph_norm
    cos_sims['class'] = class_numerator / class_denom
    
    return cos_sims

def calc_z_scores(graph, cos_sims, class_cos_sims):
    # Ensure data types are numpy arrays for statistical computation
    all_cos_sims = np.array(class_cos_sims['all'])
    cat_cos_sims = np.array(class_cos_sims[graph.y.item()])
    
    # Calculate means and standard deviations for 'all' and specific 'class'
    all_mean = np.mean(all_cos_sims)
    cat_mean = np.mean(cat_cos_sims)
    all_std_dev = np.std(all_cos_sims)
    class_std_dev = np.std(cat_cos_sims)
    
    # Compute z-scores using the standard formula, include flooring to nearest 0.5 if needed
    z_scores = {
        'all': (cos_sims[graph.id]['all']) / all_std_dev,
        'class': (cos_sims[graph.id]['class']) / class_std_dev
    }
    
    return z_scores

In [9]:
# from torch_geometric.io.tu import read_tu_data
# data, slices, sizes = read_tu_data('/kaggle/working/', 'ENZYMES')

In [10]:
# import os.path as osp
# from typing import Callable, List, Optional

# from torch_geometric.data import Data, InMemoryDataset
# from torch_geometric.io import fs#, read_tu_data

# def read_file(
#     folder: str,
#     prefix: str,
#     name: str,
#     dtype: Optional[torch.dtype] = None,
# ) -> Tensor:
#     path = osp.join(folder, f'{prefix}_{name}.txt')
#     return read_txt_array(path, sep=',', dtype=dtype)
# def cat(seq: List[Optional[Tensor]]) -> Optional[Tensor]:
#     values = [v for v in seq if v is not None]
#     values = [v for v in values if v.numel() > 0]
#     values = [v.unsqueeze(-1) if v.dim() == 1 else v for v in values]
#     return torch.cat(values, dim=-1) if len(values) > 0 else None
# def split(data: Data, batch: Tensor) -> Tuple[Data, Dict[str, Tensor]]:
#     node_slice = cumsum(torch.from_numpy(np.bincount(batch)))

#     assert data.edge_index is not None
#     row, _ = data.edge_index
#     edge_slice = cumsum(torch.from_numpy(np.bincount(batch[row])))

#     # Edge indices should start at zero for every graph.
#     data.edge_index -= node_slice[batch[row]].unsqueeze(0)

#     slices = {'edge_index': edge_slice}
#     if data.x is not None:
#         slices['x'] = node_slice
#     else:
#         # Imitate `collate` functionality:
#         data._num_nodes = torch.bincount(batch).tolist()
#         data.num_nodes = batch.numel()
#     if data.edge_attr is not None:
#         slices['edge_attr'] = edge_slice
#     if data.y is not None:
#         assert isinstance(data.y, Tensor)
#         if data.y.size(0) == batch.size(0):
#             slices['y'] = node_slice
#         else:
#             slices['y'] = torch.arange(0, int(batch[-1]) + 2, dtype=torch.long)

#     return data, slices
# def read_tu_data(
#     folder: str,
#     prefix: str,
# ) -> Tuple[Data, Dict[str, Tensor], Dict[str, int]]:
#     files = fs.glob(osp.join(folder, f'{prefix}_*.txt'))
#     names = [osp.basename(f)[len(prefix) + 1:-4] for f in files]

#     edge_index = read_file(folder, prefix, 'A', torch.long).t() - 1
#     batch = read_file(folder, prefix, 'graph_indicator', torch.long) - 1

#     node_attribute = torch.empty((batch.size(0), 0))
#     if 'node_attributes' in names:
#         node_attribute = read_file(folder, prefix, 'node_attributes')
#         if node_attribute.dim() == 1:
#             node_attribute = node_attribute.unsqueeze(-1)

#     node_label = torch.empty((batch.size(0), 0))
#     if 'node_labels' in names:
#         node_label = read_file(folder, prefix, 'node_labels', torch.long)
#         if node_label.dim() == 1:
#             node_label = node_label.unsqueeze(-1)
#         node_label = node_label - node_label.min(dim=0)[0]
#         node_labels = list(node_label.unbind(dim=-1))
#         node_labels = [one_hot(x) for x in node_labels]
#         if len(node_labels) == 1:
#             node_label = node_labels[0]
#         else:
#             node_label = torch.cat(node_labels, dim=-1)

#     edge_attribute = torch.empty((edge_index.size(1), 0))
#     if 'edge_attributes' in names:
#         edge_attribute = read_file(folder, prefix, 'edge_attributes')
#         if edge_attribute.dim() == 1:
#             edge_attribute = edge_attribute.unsqueeze(-1)

#     edge_label = torch.empty((edge_index.size(1), 0))
#     if 'edge_labels' in names:
#         edge_label = read_file(folder, prefix, 'edge_labels', torch.long)
#         if edge_label.dim() == 1:
#             edge_label = edge_label.unsqueeze(-1)
#         edge_label = edge_label - edge_label.min(dim=0)[0]
#         edge_labels = list(edge_label.unbind(dim=-1))
#         edge_labels = [one_hot(e) for e in edge_labels]
#         if len(edge_labels) == 1:
#             edge_label = edge_labels[0]
#         else:
#             edge_label = torch.cat(edge_labels, dim=-1)

#     x = cat([node_attribute, node_label])
#     edge_attr = cat([edge_attribute, edge_label])

#     y = None
#     if 'graph_attributes' in names:  # Regression problem.
#         y = read_file(folder, prefix, 'graph_attributes')
#     elif 'graph_labels' in names:  # Classification problem.
#         y = read_file(folder, prefix, 'graph_labels', torch.long)
#         _, y = y.unique(sorted=True, return_inverse=True)

#     num_nodes = int(edge_index.max()) + 1 if x is None else x.size(0)
#     edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
#     edge_index, edge_attr = coalesce(edge_index, edge_attr, num_nodes)
#     cos_sim = torch.tensor([-1.0])
#     z_score = torch.tensor([-1.0])

    
    
    
#     data = Data(
#         x=x,
#         edge_index=edge_index,
#         edge_attr=edge_attr,
#         y=y,
#         cos_sim=torch.tensor([-1.0]),  # Custom attribute
#         z_score=torch.tensor([-1.0])   # Custom attribute
#     )
#     data, slices = split(data, batch)

#     sizes = {
#         'num_node_attributes': node_attribute.size(-1),
#         'num_node_labels': node_label.size(-1),
#         'num_edge_attributes': edge_attribute.size(-1),
#         'num_edge_labels': edge_label.size(-1),
#     }

#     return data, slices, sizes

# class MyTUDataset(InMemoryDataset):
#     r"""A variety of graph kernel benchmark datasets, *.e.g.*,
#     :obj:`"IMDB-BINARY"`, :obj:`"REDDIT-BINARY"` or :obj:`"PROTEINS"`,
#     collected from the `TU Dortmund University
#     <https://chrsmrrs.github.io/datasets>`_.
#     In addition, this dataset wrapper provides `cleaned dataset versions
#     <https://github.com/nd7141/graph_datasets>`_ as motivated by the
#     `"Understanding Isomorphism Bias in Graph Data Sets"
#     <https://arxiv.org/abs/1910.12091>`_ paper, containing only non-isomorphic
#     graphs.

#     .. note::
#         Some datasets may not come with any node labels.
#         You can then either make use of the argument :obj:`use_node_attr`
#         to load additional continuous node attributes (if present) or provide
#         synthetic node features using transforms such as
#         :class:`torch_geometric.transforms.Constant` or
#         :class:`torch_geometric.transforms.OneHotDegree`.

#     Args:
#         root (str): Root directory where the dataset should be saved.
#         name (str): The `name
#             <https://chrsmrrs.github.io/datasets/docs/datasets/>`_ of the
#             dataset.
#         transform (callable, optional): A function/transform that takes in an
#             :obj:`torch_geometric.data.Data` object and returns a transformed
#             version. The data object will be transformed before every access.
#             (default: :obj:`None`)
#         pre_transform (callable, optional): A function/transform that takes in
#             an :obj:`torch_geometric.data.Data` object and returns a
#             transformed version. The data object will be transformed before
#             being saved to disk. (default: :obj:`None`)
#         pre_filter (callable, optional): A function that takes in an
#             :obj:`torch_geometric.data.Data` object and returns a boolean
#             value, indicating whether the data object should be included in the
#             final dataset. (default: :obj:`None`)
#         force_reload (bool, optional): Whether to re-process the dataset.
#             (default: :obj:`False`)
#         use_node_attr (bool, optional): If :obj:`True`, the dataset will
#             contain additional continuous node attributes (if present).
#             (default: :obj:`False`)
#         use_edge_attr (bool, optional): If :obj:`True`, the dataset will
#             contain additional continuous edge attributes (if present).
#             (default: :obj:`False`)
#         cleaned (bool, optional): If :obj:`True`, the dataset will
#             contain only non-isomorphic graphs. (default: :obj:`False`)

#     **STATS:**

#     .. list-table::
#         :widths: 20 10 10 10 10 10
#         :header-rows: 1

#         * - Name
#           - #graphs
#           - #nodes
#           - #edges
#           - #features
#           - #classes
#         * - MUTAG
#           - 188
#           - ~17.9
#           - ~39.6
#           - 7
#           - 2
#         * - ENZYMES
#           - 600
#           - ~32.6
#           - ~124.3
#           - 3
#           - 6
#         * - PROTEINS
#           - 1,113
#           - ~39.1
#           - ~145.6
#           - 3
#           - 2
#         * - COLLAB
#           - 5,000
#           - ~74.5
#           - ~4914.4
#           - 0
#           - 3
#         * - IMDB-BINARY
#           - 1,000
#           - ~19.8
#           - ~193.1
#           - 0
#           - 2
#         * - REDDIT-BINARY
#           - 2,000
#           - ~429.6
#           - ~995.5
#           - 0
#           - 2
#         * - ...
#           -
#           -
#           -
#           -
#           -
#     """

#     url = 'https://www.chrsmrrs.com/graphkerneldatasets'
#     cleaned_url = ('https://raw.githubusercontent.com/nd7141/'
#                    'graph_datasets/master/datasets')

#     def __init__(
#         self,
#         root: str,
#         name: str,
#         transform: Optional[Callable] = None,
#         pre_transform: Optional[Callable] = None,
#         pre_filter: Optional[Callable] = None,
#         force_reload: bool = False,
#         use_node_attr: bool = False,
#         use_edge_attr: bool = False,
#         cleaned: bool = False,
#     ) -> None:
#         self.name = name
#         self.cleaned = cleaned
#         super().__init__(root, transform, pre_transform, pre_filter,
#                          force_reload=force_reload)

#         out = fs.torch_load(self.processed_paths[0])
#         if not isinstance(out, tuple) or len(out) < 3:
#             raise RuntimeError(
#                 "The 'data' object was created by an older version of PyG. "
#                 "If this error occurred while loading an already existing "
#                 "dataset, remove the 'processed/' directory in the dataset's "
#                 "root folder and try again.")
#         assert len(out) == 3 or len(out) == 4

#         if len(out) == 3:  # Backward compatibility.
#             data, self.slices, self.sizes = out
#             data_cls = Data
#         else:
#             data, self.slices, self.sizes, data_cls = out

#         if not isinstance(data, dict):  # Backward compatibility.
#             self.data = data
#         else:
#             self.data = data_cls.from_dict(data)

#         assert isinstance(self._data, Data)
#         if self._data.x is not None and not use_node_attr:
#             num_node_attributes = self.num_node_attributes
#             self._data.x = self._data.x[:, num_node_attributes:]
#         if self._data.edge_attr is not None and not use_edge_attr:
#             num_edge_attrs = self.num_edge_attributes
#             self._data.edge_attr = self._data.edge_attr[:, num_edge_attrs:]

#     @property
#     def raw_dir(self) -> str:
#         name = f'raw{"_cleaned" if self.cleaned else ""}'
#         return osp.join(self.root, self.name, name)

#     @property
#     def processed_dir(self) -> str:
#         name = f'processed{"_cleaned" if self.cleaned else ""}'
#         return osp.join(self.root, self.name, name)

#     @property
#     def num_node_labels(self) -> int:
#         return self.sizes['num_node_labels']

#     @property
#     def num_node_attributes(self) -> int:
#         return self.sizes['num_node_attributes']

#     @property
#     def num_edge_labels(self) -> int:
#         return self.sizes['num_edge_labels']

#     @property
#     def num_edge_attributes(self) -> int:
#         return self.sizes['num_edge_attributes']

#     @property
#     def raw_file_names(self) -> List[str]:
#         names = ['A', 'graph_indicator']
#         return [f'{self.name}_{name}.txt' for name in names]

#     @property
#     def processed_file_names(self) -> str:
#         return 'data.pt'
        
        
#         #Original Function
# #     def download(self) -> None:
# #         url = self.cleaned_url if self.cleaned else self.url
# #         fs.cp(f'{url}/{self.name}.zip', self.raw_dir, extract=True)
# #         for filename in fs.ls(osp.join(self.raw_dir, self.name)):
# #             fs.mv(filename, osp.join(self.raw_dir, osp.basename(filename)))
# #         fs.rm(osp.join(self.raw_dir, self.name))
        
#     #Dummy Function for already downloaded files
#     def download(self) -> None:
#         files = {
#     'ENZYMES_A.txt': 'https://raw.githubusercontent.com/snap-stanford/GraphRNN/master/dataset/ENZYMES/ENZYMES_A.txt',
#     'ENZYMES_graph_indicator.txt': 'https://raw.githubusercontent.com/snap-stanford/GraphRNN/master/dataset/ENZYMES/ENZYMES_graph_indicator.txt',
#     'ENZYMES_graph_labels.txt': 'https://raw.githubusercontent.com/snap-stanford/GraphRNN/master/dataset/ENZYMES/ENZYMES_graph_labels.txt',
#     'ENZYMES_node_attributes.txt': 'https://raw.githubusercontent.com/snap-stanford/GraphRNN/master/dataset/ENZYMES/ENZYMES_node_attributes.txt',
#     'ENZYMES_node_labels.txt': 'https://raw.githubusercontent.com/snap-stanford/GraphRNN/master/dataset/ENZYMES/ENZYMES_node_labels.txt'
#     }
#         dest_dir = '/kaggle/working/ENZYMES/raw/'

#         # Ensure destination directory exists
#         if not os.path.exists(dest_dir):
#             os.makedirs(dest_dir)

#         # Function to download and save a file
#         def download_and_save(url, destination):
#             # Make the HTTP GET request to the file URL
#             if os.dir.exists()
#             response = requests.get(url)
#             if response.status_code == 200:
#                 # Write the file contents in binary mode
#                 filename = os.path.join(destination, url.split('/')[-1])
#                 with open(filename, 'wb') as file:
#                     file.write(response.content)
#                 print(f"File saved as {filename}")
#             else:
#                 print(f"Failed to download {url}")

#         # Download and save each file
#         for file_url in files.values():
#             download_and_save(file_url, dest_dir)

#     def process(self) -> None:
#         self.data, self.slices, sizes = read_tu_data(self.raw_dir, self.name)
#         print(self.data.cos_sim)
#         if self.pre_filter is not None or self.pre_transform is not None:
#             data_list = [self.get(idx) for idx in range(len(self))]

#             if self.pre_filter is not None:
#                 data_list = [d for d in data_list if self.pre_filter(d)]

#             if self.pre_transform is not None:
#                 data_list = [self.pre_transform(d) for d in data_list]

#             self.data, self.slices = self.collate(data_list)
#             self._data_list = None  # Reset cache.

#         assert isinstance(self._data, Data)
#         fs.torch_save(
#             (self._data.to_dict(), self.slices, sizes, self._data.__class__),
#             self.processed_paths[0],
#         )

#     def __repr__(self) -> str:
#         return f'{self.name}({len(self)})'

In [11]:
from torch_geometric.datasets import TUDataset
data = TUDataset(root=f'/working/{DATASET}', name=f'{DATASET}')

Downloading https://www.chrsmrrs.com/graphkerneldatasets/NCI109.zip
Processing...
Done!


In [12]:
# import aiohttp  # Import aiohttp to access ClientConnectorError
# from requests.exceptions import SSLError

# try:
#     from torch_geometric.datasets import TUDataset
#     data = TUDataset(root='/working/NCI1', name='NCI1')
# except aiohttp.ClientConnectorSSLError as e:  # Catch connection errors specifically
#     print(f"Connection error occurred: {e}")
#     # Specify the folder where your dataset files are located
#     folder = '/kaggle/working/'
#     # Specify the prefix used in your dataset files
#     prefix = 'NCI1'
#     # Call the function assuming MyTUDataset is properly defined and imported
#     data = MyTUDataset(folder, prefix)
# except requests.exceptions.SSLError as e:
#     print(f"Connection error occurred: {e}")
#     # Specify the folder where your dataset files are located
#     folder = '/kaggle/working/'
#     # Specify the prefix used in your dataset files
#     prefix = 'NCI1'
#     # Call the function assuming MyTUDataset is properly defined and imported
#     data = MyTUDataset(folder, prefix)
# except Exception as e:  # Catch any other exceptions
#     print(f"An unexpected error occurred: {e}")
#     folder = '/kaggle/working/'
#     # Specify the prefix used in your dataset files
#     prefix = 'NCI1'
#     # Call the function assuming MyTUDataset is properly defined and imported
#     data = MyTUDataset(folder, prefix)

In [13]:

ID_dataset = IDAddingDataset(data, None)


In [14]:
ID_dataset[2]

Data(edge_index=[2, 32], x=[16, 38], y=[1], id=2, num_edges=32, adj_mat=[16, 16])

In [15]:
embeddings_dict = {}
for graph in ID_dataset:
    embeddings_dict[graph.id] = get_WL_embedding(graph, 3)


In [16]:
Embedding_dataset = EmbeddingAddingDataset(ID_dataset, embeddings_dict)

In [17]:
all_hashes, all_classes, mean_vector = get_mean_vectors(Embedding_dataset)


In [18]:
all_classes

{0, 1, 'all'}

In [19]:
cos_sims = {}
class_cos_sims = {_class: [] for _class in all_classes}
for graph in Embedding_dataset:
    cos_sims[graph.id] = get_cos_sim(graph, all_hashes, all_classes, mean_vector)
    class_cos_sims[graph.y.item()].append(cos_sims[graph.id]['class'])
    class_cos_sims['all'].append(cos_sims[graph.id]['all'])


In [20]:
class_cos_sims[0][:30]

[0.5853465584530863,
 0.5068019238049677,
 0.7339678442660369,
 0.5571225493548922,
 0.8686873381624867,
 0.5766082289683534,
 0.46177875321140516,
 0.7429263342252902,
 0.7739490692842468,
 0.6581520969526358,
 0.43287565319196136,
 0.595201332120336,
 0.7927690261360428,
 0.5420758231828203,
 0.7377277810012453,
 0.8455716277332197,
 0.7809851319167144,
 0.7248867603180328,
 0.7578647234552629,
 0.6211631832126006,
 0.5813039156159345,
 0.5326632439413594,
 0.7367305230848631,
 0.7842632530639515,
 0.5192667923418167,
 0.706978092838242,
 0.3055597577975961,
 0.7967321632233072,
 0.6085941712884149,
 0.6143583041510753]

In [21]:
all_z_scores = {}

for graph in Embedding_dataset:
    all_z_scores[graph.id] = calc_z_scores(graph, cos_sims, class_cos_sims)

In [22]:
# Create the final transform and dataset
Final_dataset = FinalAddingDataset(Embedding_dataset, cos_sims, all_z_scores)

In [23]:
num_node_dict = {_cls: {graph.id: graph.num_nodes for graph in Final_dataset if graph.y.item() == _cls or _cls == 'all'} for _cls in all_classes}
num_node_mean_dict = {_cls: sum(num_node_dict[_cls].values())/len(num_node_dict[_cls].values()) for _cls in all_classes}
node_diff_dict = {_cls: {graph.id: graph.num_nodes - num_node_mean_dict[_cls] for graph in Final_dataset if graph.y.item() == _cls or _cls == 'all'} for _cls in all_classes}
pos_diff_dict = {_cls: {} for _cls in all_classes}
neg_diff_dict = {_cls: {} for _cls in all_classes}
for _cls, _dict in node_diff_dict.items():
    for idx, node_diff in _dict.items():
        if node_diff > 0:
            pos_diff_dict[_cls][idx] = node_diff
        else:
            pos_diff_dict[_cls][idx] = node_diff
pos_num_node_dict = {_cls: len(pos_diff_dict[_cls]) for _cls in all_classes}
neg_num_node_dict = {_cls: len(neg_diff_dict[_cls]) for _cls in all_classes}
percentile_dict = {graph.id: {'all': 0, 'class': 0} for graph in Final_dataset}
for _cls, _dict in pos_diff_dict.items():
    for i, (idx, val) in enumerate(sorted(_dict.items(), key = lambda x: x[1])):
        if _cls != 'all':
            percentile_dict[idx]['class'] = i/pos_num_node_dict[_cls]
        else:
            percentile_dict[idx]['all'] = i/pos_num_node_dict[_cls]
for _cls, _dict in neg_diff_dict.items():
    for i, (idx, val) in enumerate(sorted(_dict.items(), key = lambda x: x[1], reverse=True)):
        if _cls != 'all':
            percentile_dict[idx]['class'] = i/neg_num_node_dict[_cls]
        else:
            percentile_dict[idx]['all'] = i/neg_num_node_dict[_cls]
    

                       

In [24]:
num_node_percentile_dict = percentile_dict
for x in range(20):
    print(num_node_percentile_dict[x])

{'all': 0.14489944269445118, 'class': 0.216796875}
{'all': 0.23285679670462806, 'class': 0.3232421875}
{'all': 0.06421129149503271, 'class': 0.10302734375}
{'all': 0.32832566028592197, 'class': 0.43994140625}
{'all': 0.9825539132541798, 'class': 0.99560546875}
{'all': 0.025199903077295856, 'class': 0.03857421875}
{'all': 0.2781681608916889, 'class': 0.37841796875}
{'all': 0.23309910346498666, 'class': 0.32373046875}
{'all': 0.27841046765204747, 'class': 0.37890625}
{'all': 0.18269929731039497, 'class': 0.26025390625}
{'all': 0.003876908165737824, 'class': 0.00732421875}
{'all': 0.01453840562151684, 'class': 0.02294921875}
{'all': 0.8265083595832323, 'class': 0.9130859375}
{'all': 0.18294160407075358, 'class': 0.2607421875}
{'all': 0.4623212987642355, 'class': 0.5869140625}
{'all': 0.6127937969469348, 'class': 0.73828125}
{'all': 0.3285679670462806, 'class': 0.4404296875}
{'all': 0.46256360552459413, 'class': 0.58740234375}
{'all': 0.2786527744124061, 'class': 0.37939453125}
{'all': 0.1

In [25]:
cos_sim_dict = {_cls: {} for _cls in all_classes}
for graph in Final_dataset:
    cos_sim_dict[graph.y.item()][graph.id] = graph.cos_sim['class']
    cos_sim_dict['all'][graph.id] = graph.cos_sim['all']
            

cat_len_dict = {_cls: len(cos_sim_dict[_cls].values()) for _cls in all_classes}
cos_sim_percentile_dict = {graph.id: {'all': 0, 'class': 0} for graph in Final_dataset}


for _cls, _dict in cos_sim_dict.items():
    for i, (idx, val) in enumerate(sorted(_dict.items(), key = lambda x: x[1])):
        if _cls != 'all':
            cos_sim_percentile_dict[idx]['class'] = i/cat_len_dict[_cls]
        else:
            cos_sim_percentile_dict[idx]['all'] = i/cat_len_dict[_cls]


In [26]:
for x in range(10):
    print(cos_sim_percentile_dict[x])

{'all': 0.41289071965107826, 'class': 0.33203125}
{'all': 0.21904531136418706, 'class': 0.1845703125}
{'all': 0.7673855100557305, 'class': 0.79150390625}
{'all': 0.2214683789677732, 'class': 0.26904296875}
{'all': 0.9975769323964139, 'class': 0.99853515625}
{'all': 0.34892173491640416, 'class': 0.30859375}
{'all': 0.11339956384783136, 'class': 0.138671875}
{'all': 0.7404894596559244, 'class': 0.8251953125}
{'all': 0.9023503755754786, 'class': 0.919921875}
{'all': 0.5061788223891447, 'class': 0.52685546875}


In [27]:
percentiles = [x for x in range(35, 91, 5)]
cats = ['class', 'all']
metrics = ['random', 'graph_order', 'cos_sim']
train_indices_dict = {cat: {metric: {percentile: [] for percentile in percentiles} for metric in metrics} for cat in cats}
for percentile in percentiles:
    for idx, cat_pairs in cos_sim_percentile_dict.items():
        for cat, val in cat_pairs.items():
            if val < 0.01*percentile:
                train_indices_dict[cat]['cos_sim'][percentile].append(idx)
    for idx, cat_pairs in num_node_percentile_dict.items():
        for cat, val in cat_pairs.items():
            if val < 0.01* percentile:
                train_indices_dict[cat]['graph_order'][percentile].append(idx)
    for _cls, id_pairs in num_node_dict.items():
        for cat in cats:
            size = int(percentile * 0.01 * len(id_pairs))
            train_indices_dict[cat]['random'][percentile] = np.random.choice(list(id_pairs.keys()), size, replace=False)
            
        
        

In [28]:
import pickle

# Specify the path to the file where you want to save the dataset
file_path = f'/kaggle/working/{DATASET}_train_indices_dict.pkl'

# Saving the dataset
with open(file_path, 'wb') as file:
    pickle.dump(train_indices_dict, file)

print("Dict saved successfully.")

Dict saved successfully.


In [29]:
# import matplotlib.pyplot as plt
# # for x in range(6):
# all_z_scores = [graph.num_nodes for graph in Final_dataset]
# num_node_mean = sum(all_z_scores)/600
# all_z_scores = [x - num_node_mean for x in all_z_scores]
# percentiles = [50, 95, 99]  # Change these values based on your requirements (xx%)
# percentile_values = np.percentile(all_z_scores, percentiles)

# plt.figure(figsize=(10, 6))
# plt.hist(all_z_scores, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of Z-Scores with Percentiles')
# plt.xlabel('Z-Score')
# plt.ylabel('Frequency')

# # Add vertical lines for each percentile
# for perc, value in zip(percentiles, percentile_values):
#     plt.axvline(x=value, color='r', linestyle='--', label=f'{perc}th percentile: {value:.2f}')

# plt.legend()
# plt.grid(True)
# plt.show()

In [30]:
import pickle

# Specify the path to the file where you want to save the dataset
file_path = f'/kaggle/working/{DATASET}.pt'

# Saving the dataset
with open(file_path, 'wb') as file:
    torch.save(Final_dataset, file)

print("Dataset saved successfully.")

Dataset saved successfully.


In [31]:
import os

# Define the directory you want to list
directory_path = '/kaggle/working/'

# List all files and directories in the specified path
contents = os.listdir(directory_path)

print("Contents of '/kaggle/working/':")
for item in contents:
    print(item)

Contents of '/kaggle/working/':
NCI109.pt
NCI109_train_indices_dict.pkl
__notebook__.ipynb
