In [1]:

!pip install torch torch-geometric
first_run = False
import os
import os.path as osp
from typing import Callable, Dict, List, Optional

import torch
import os
import os.path as osp
from typing import Callable, Dict, List, Optional
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
    extract_tar,
    extract_zip,
)


class MalNetTiny(InMemoryDataset):
    r"""The MalNet Tiny dataset from the
    `"A Large-Scale Database for Graph Representation Learning"
    <https://openreview.net/pdf?id=1xDTDk3XPW>`_ paper.
    :class:`MalNetTiny` contains 5,000 malicious and benign software function
    call graphs across 5 different types. Each graph contains at most 5k nodes.

    Args:
        root (str): Root directory where the dataset should be saved.
        split (str, optional): If :obj:`"train"`, loads the training dataset.
            If :obj:`"val"`, loads the validation dataset.
            If :obj:`"trainval"`, loads the training and validation dataset.
            If :obj:`"test"`, loads the test dataset.
            If :obj:`None`, loads the entire dataset.
            (default: :obj:`None`)
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
        pre_filter (callable, optional): A function that takes in an
            :obj:`torch_geometric.data.Data` object and returns a boolean
            value, indicating whether the data object should be included in the
            final dataset. (default: :obj:`None`)
        force_reload (bool, optional): Whether to re-process the dataset.
            (default: :obj:`False`)
    """
    data_url = ('http://malnet.cc.gatech.edu/'
                'graph-data/malnet-graphs-tiny.tar.gz')
    split_url = 'http://malnet.cc.gatech.edu/split-info/split_info_tiny.zip'
    splits = ['train', 'val', 'test']

    def __init__(
        self,
        root: str,
        split: Optional[str] = None,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
        pre_filter: Optional[Callable] = None,
        force_reload: bool = False,
    ) -> None:
        if split not in {'train', 'val', 'trainval', 'test', None}:
            raise ValueError(f'Split "{split}" found, but expected either '
                             f'"train", "val", "trainval", "test" or None')
        super().__init__(root, transform, pre_transform, pre_filter,
                         force_reload=force_reload)
        self.load(self.processed_paths[0])

        if split is not None:
            split_slices = torch.load(self.processed_paths[1])
            if split == 'train':
                self._indices = range(split_slices[0], split_slices[1])
            elif split == 'val':
                self._indices = range(split_slices[1], split_slices[2])
            elif split == 'trainval':
                self._indices = range(split_slices[0], split_slices[2])
            elif split == 'test':
                self._indices = range(split_slices[2], split_slices[3])

    @property
    def raw_file_names(self) -> List[str]:
        return ['malnet-graphs-tiny', osp.join('split_info_tiny', 'type')]

    @property
    def processed_file_names(self) -> List[str]:
        return ['data.pt', 'split_slices.pt']

    def download(self) -> None:
        path = download_url(self.data_url, self.raw_dir)
        extract_tar(path, self.raw_dir)
        os.unlink(path)

        path = download_url(self.split_url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        os.unlink(path)

    def process(self) -> None:
        y_map: Dict[str, int] = {}
        data_list = []
        split_slices = [0]

        for split in ['train', 'val', 'test']:
            with open(osp.join(self.raw_paths[1], f'{split}.txt'), 'r') as f:
                filenames = f.read().split('\n')[:-1]
                split_slices.append(split_slices[-1] + len(filenames))

            for filename in filenames:
                path = osp.join(self.raw_paths[0], f'{filename}.edgelist')
                malware_type = filename.split('/')[0]
                y = y_map.setdefault(malware_type, len(y_map))

                with open(path, 'r') as f:
                    edges = f.read().split('\n')[5:-1]

                edge_indices = [[int(s) for s in e.split()] for e in edges]
                edge_index = torch.tensor(edge_indices).t().contiguous()
                num_nodes = int(edge_index.max()) + 1
                data = Data(edge_index=edge_index, y=y, num_nodes=num_nodes)
                data_list.append(data)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.save(data_list, self.processed_paths[0])
        torch.save(split_slices, self.processed_paths[1])

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [2]:
from torch_geometric.data import DataLoader

# Instantiate the dataset class for the train split
dataset_root = '/kaggle/working/malnetTiny'  # Set the path where you want to download the data

all_dataset = MalNetTiny(root=dataset_root, split=None)
# Load the dataset with the train split
train_dataset = MalNetTiny(root=dataset_root, split='train')

# Load the validation and test splits as well if needed
val_dataset = MalNetTiny(root=dataset_root, split='val')
test_dataset = MalNetTiny(root=dataset_root, split='test')

# Create data loaders for each split (if needed)
all_loader = DataLoader(all_dataset, batch_size=32, shuffle = False)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Iterate through the dataset and print some basic information about the batches
for data in train_loader:
    print(f"Batch - Number of graphs: {data.num_graphs}")
    print(f"Batch - Edge index shape: {data.edge_index.shape}")
    print(f"Batch - Labels: {data.y}")
    break  # Only show the first batch


Downloading http://malnet.cc.gatech.edu/graph-data/malnet-graphs-tiny.tar.gz
Extracting /kaggle/working/malnetTiny/raw/malnet-graphs-tiny.tar.gz
Downloading http://malnet.cc.gatech.edu/split-info/split_info_tiny.zip
Extracting /kaggle/working/malnetTiny/raw/split_info_tiny.zip
Processing...
Done!


Batch - Number of graphs: 32
Batch - Edge index shape: torch.Size([2, 103175])
Batch - Labels: tensor([2, 3, 3, 2, 0, 3, 3, 0, 2, 3, 0, 4, 0, 0, 0, 0, 4, 4, 1, 1, 2, 0, 0, 0,
        2, 2, 1, 4, 0, 3, 3, 3])




In [3]:
import hashlib
import numpy as np
import torch
from torch_geometric.transforms import BaseTransform
from torch_geometric.utils import to_scipy_sparse_matrix
from torch_geometric.data import Data
from torch_geometric.io import fs, read_txt_array
import torch_geometric.transforms as T
from torch_geometric.data import Data, InMemoryDataset
import math
from typing import Callable, List, Optional, Tuple, Dict
from torch import Tensor
import os
import gc
import requests
from torch_geometric.utils import coalesce, cumsum, one_hot, remove_self_loops
from scipy.sparse import coo_array
from tqdm import tqdm

In [4]:
class IDAddingDataset:
    def __init__(self, dataset, attr_func):
        self.dataset = dataset
        self.attr_func = attr_func
        self.indices = np.arange(0, len(dataset.indices()))
    
    def __getitem__(self, idx):
        data = self.dataset[idx].clone()  # Clone the data to avoid modifying the original dataset
        data.id = self.indices[idx]
        data.num_edges = self.dataset[idx].edge_index.size(1)
        data.adj_mat = to_scipy_sparse_matrix(self.dataset[idx].edge_index, num_nodes=self.dataset[idx].num_nodes).toarray().astype(int)
        return data
    
    def __len__(self):
        return len(self.dataset)


In [5]:
class EmbeddingAddingDataset:
    def __init__(self, dataset, embedding_dict):
        self.dataset = dataset
        self.embedding_dict = embedding_dict
    
    def __getitem__(self, idx):
        data = self.dataset[idx].clone()  # Clone the data to avoid modifying the original dataset
        data.embedding = self.embedding_dict[self.dataset[idx].id]
        return data
    
    def __len__(self):
        return len(self.dataset)

In [6]:
class FinalAddingDataset:
    def __init__(self, Embedding_dataset, cos_sims, all_z_scores):
        self.dataset = Embedding_dataset
        self.cos_sims = cos_sims
        self.all_z_scores = all_z_scores
        
    
    def __getitem__(self, idx):
        data = self.dataset[idx].clone()  # Clone the data to avoid modifying the original dataset
        data.cos_sim = self.cos_sims[self.dataset[idx].id]
        data.z_score = self.all_z_scores[self.dataset[idx].id]
        return data
    
    def __len__(self):
        return len(self.dataset)

In [7]:
import numpy as np
import xxhash

def stable_hash(value):
    return xxhash.xxh64(str(value)).intdigest()

def get_WL_embedding(data, n_iter):
    graph = data.adj_mat
    graph_order = graph.shape[0]  # Number of nodes
    graph_hash_dict = {}  # Mapping of unique labels and counts

    # Initialize labels based on node degrees
    labels = np.sum(graph, axis=1)
    hashed_labels = np.array([stable_hash(label) for label in labels])

    # Precompute neighbor indices
    indices = [np.nonzero(graph[i])[0] for i in range(graph_order)]

    for _ in range(n_iter):
        # Concatenate node's hash with its sorted neighbor hashes
        hashes = np.array([
            stable_hash((hashed_labels[i],) + tuple(np.sort(hashed_labels[indices[i]])))
            for i in range(graph_order)
        ])

        # Update the graph hash dictionary with counts
        unique_hashes, counts = np.unique(hashes, return_counts=True)

        for h, c in zip(unique_hashes, counts):
            graph_hash_dict[h] = graph_hash_dict.get(h, 0) + c

        # Update labels for the next iteration
        hashed_labels = hashes

    return graph_hash_dict


In [8]:
def get_mean_vectors(dataset, all_hashes, all_classes):
    empty_array = np.zeros(dataset[0].embedding.todense().shape)
    mean_vector_dict = {_cls: empty_array for _cls in all_classes}
    samples_per_class_dict = {_cls:0 for _cls in all_classes}
    
    for graph in tqdm(dataset, desc='4/8 Calculating Mean vectors'):
        
        mean_vector_dict[int(graph.y.item())]+=graph.embedding.toarray()
        samples_per_class_dict[int(graph.y.item())]+=1
    for _cls in all_classes:
        mean_vector_dict[_cls] /= samples_per_class_dict[_cls]
    return mean_vector_dict

def get_cos_sim(dataset, mean_vectors):
    cos_sims, norm_dict = {}, {}
    
    # Precompute the norm for each class mean vector
    for _class in range(5):
        norm_dict[_class] = np.linalg.norm(mean_vectors[_class])  # L2 norm of the mean vector
    
    # Compute cosine similarity for each graph
    for graph in tqdm(dataset, desc='5/8 Calculating cosine similarities'):
        graph_embedding = graph.embedding.todense()[0]  # Get the graph's embedding as a dense vector
        graph_norm = np.linalg.norm(graph_embedding)  # L2 norm of the graph's embedding
        
        # Dot product between the graph's embedding and its class mean vector
        class_numerator = graph_embedding @ mean_vectors[graph.y.item()][0]
        
        # Calculate cosine similarity
        class_denom = norm_dict[graph.y.item()] * graph_norm
        cos_sims[graph.id] = class_numerator / class_denom if class_denom != 0 else 0.0
    
    return cos_sims


def calc_z_scores(dataset, cos_sims, class_cos_sims):
    # Ensure data types are numpy arrays for statistical computation
    z_scores = {}
    SDs = {_cls: np.std(class_cos_sims[_cls]) for _cls in class_cos_sims.keys()}
    for graph in tqdm(dataset, desc='7/8 Calculating z-scores'):
        cos_sim = cos_sims[graph.id]
        z_scores[graph.id]= cos_sim/SDs[graph.y.item()]
    return z_scores

In [9]:
def Process_dataset(dataset, prefix):
    ID_dataset = IDAddingDataset(dataset, None)
    embeddings_dict = {}
    all_hashes=set()
    all_classes = set()
    from tqdm import tqdm
    for graph in tqdm(ID_dataset, desc='1/8 Adding Embeddings'):
        embedding = get_WL_embedding(graph, 3)
        embeddings_dict[graph.id] = embedding
        all_hashes.update(embedding.keys())
        all_classes.update(graph.y)
    gc.collect()
    all_classes = set(range(5))
    all_hashes_new = list(all_hashes)
    empty_vector = np.zeros(len(all_hashes_new))
    index_dict = {all_hashes_new[i]: i for i in range(len(all_hashes_new))}
    new_embedding_dict = {}
    sum_vector = empty_vector.copy()
    indices_to_delete = []

    # Create coo_arrays as before and calculate sum_vector
    for graph in tqdm(ID_dataset, desc='2/8 Standardizing Vectors'):
        this_vector = empty_vector.copy()
        for key, value in embeddings_dict[graph.id].items():
            this_vector[index_dict[key]] = value
            sum_vector[index_dict[key]] += value
        new_embedding_dict[graph.id] = coo_array(this_vector)

    gc.collect()

    # Identify indices to delete
    for i, val in enumerate(sum_vector):
        if val < 7:  # If this value is present in less than 0.2% of samples
            indices_to_delete.append(i)

    # Function to filter out the indices from coo_array
    def filter_coo_array(sparse_matrix, indices_to_delete):
        vector = sparse_matrix.todense()[0]
        vector = np.delete(vector, indices_to_delete)
        return coo_array(vector)


    # Recreate new_embedding_dict with filtered indices
    filtered_embedding_dict = {}
    for graph_id, sparse_matrix in tqdm(new_embedding_dict.items(), desc='3/8 Filtering out rare hashes.'):
        filtered_embedding_dict[graph_id] = filter_coo_array(sparse_matrix, indices_to_delete)

    gc.collect()
    Embedding_dataset = EmbeddingAddingDataset(ID_dataset, filtered_embedding_dict)
    mean_vector = get_mean_vectors(Embedding_dataset, all_hashes, all_classes)
    
    cos_sims = get_cos_sim(Embedding_dataset, mean_vector)
    class_cos_sims = {_class: [] for _class in all_classes}
    for graph in tqdm(Embedding_dataset, desc='6/8 grouping cosine sims by category'):
        class_cos_sims[graph.y.item()].append(cos_sims[graph.id])
        
    z_scores = calc_z_scores(Embedding_dataset, cos_sims, class_cos_sims)

    class_z_scores = {i: [] for i in range(5)}
    Final_dataset = FinalAddingDataset(Embedding_dataset, cos_sims, z_scores)
    for graph in tqdm(Final_dataset, desc='8/8 grouping z-scores by category'):
        class_z_scores[graph.y.item()].append(graph.z_score)
    class_z_scores = {cat: np.array(scores) for cat, scores in class_z_scores.items()}
    
    if not os.path.exists(f'/kaggle/working/{prefix}'):
        print(f"Creating directory: /kaggle/working/{prefix}")
        os.mkdir(f'/kaggle/working/{prefix}')
    else:
        print(f"Directory already exists: /kaggle/working/{prefix}")


        # Saving the dataset
        with open(f'/kaggle/working/{prefix}/malnetProcessed.pt', 'wb') as file:
            torch.save(Final_dataset, file)

        with open(f'/kaggle/working/{prefix}/classCossims.pt', 'wb') as file:
            torch.save(class_cos_sims, file)

        with open(f'/kaggle/working/{prefix}/datasetCossims.pt', 'wb') as file:
            torch.save(cos_sims, file)

        with open(f'/kaggle/working/{prefix}/zScores.pt', 'wb') as file:
            torch.save(z_scores, file)

        with open(f'/kaggle/working/{prefix}/ClassZScores.pt', 'wb') as file:
            torch.save(class_z_scores, file)

        with open(f'/kaggle/working/{prefix}/meanVectors.pt', 'wb') as file:
            torch.save(mean_vector, file)

In [10]:
Process_dataset(all_dataset, 'all')

1/8 Adding Embeddings: 100%|██████████| 5000/5000 [06:12<00:00, 13.43it/s]
2/8 Standardizing Vectors: 100%|██████████| 5000/5000 [02:03<00:00, 40.35it/s]
3/8 Filtering out rare hashes.: 100%|██████████| 5000/5000 [01:39<00:00, 50.29it/s]
4/8 Calculating Mean vectors: 100%|██████████| 5000/5000 [03:29<00:00, 23.88it/s]
5/8 Calculating cosine similarities: 100%|██████████| 5000/5000 [04:44<00:00, 17.58it/s]
6/8 grouping cosine sims by category: 100%|██████████| 5000/5000 [03:27<00:00, 24.11it/s]
7/8 Calculating z-scores: 100%|██████████| 5000/5000 [03:27<00:00, 24.12it/s]
8/8 grouping z-scores by category: 100%|██████████| 5000/5000 [10:24<00:00,  8.01it/s]

Creating directory: /kaggle/working/all





In [11]:
Process_dataset(train_dataset, 'train')


1/8 Adding Embeddings: 100%|██████████| 3500/3500 [04:14<00:00, 13.73it/s]
2/8 Standardizing Vectors: 100%|██████████| 3500/3500 [01:23<00:00, 42.10it/s]
3/8 Filtering out rare hashes.: 100%|██████████| 3500/3500 [01:08<00:00, 50.88it/s]
4/8 Calculating Mean vectors: 100%|██████████| 3500/3500 [02:22<00:00, 24.49it/s]
5/8 Calculating cosine similarities: 100%|██████████| 3500/3500 [03:21<00:00, 17.34it/s]
6/8 grouping cosine sims by category: 100%|██████████| 3500/3500 [02:23<00:00, 24.42it/s]
7/8 Calculating z-scores: 100%|██████████| 3500/3500 [02:24<00:00, 24.27it/s]
8/8 grouping z-scores by category: 100%|██████████| 3500/3500 [07:22<00:00,  7.91it/s]


Creating directory: /kaggle/working/train


In [12]:
Process_dataset(test_dataset, 'test')


1/8 Adding Embeddings: 100%|██████████| 1000/1000 [01:18<00:00, 12.77it/s]
2/8 Standardizing Vectors: 100%|██████████| 1000/1000 [00:24<00:00, 41.17it/s]
3/8 Filtering out rare hashes.: 100%|██████████| 1000/1000 [00:09<00:00, 101.29it/s]
4/8 Calculating Mean vectors: 100%|██████████| 1000/1000 [00:42<00:00, 23.36it/s]
5/8 Calculating cosine similarities: 100%|██████████| 1000/1000 [00:57<00:00, 17.43it/s]
6/8 grouping cosine sims by category: 100%|██████████| 1000/1000 [00:42<00:00, 23.43it/s]
7/8 Calculating z-scores: 100%|██████████| 1000/1000 [00:42<00:00, 23.42it/s]
8/8 grouping z-scores by category: 100%|██████████| 1000/1000 [02:25<00:00,  6.88it/s]

Creating directory: /kaggle/working/test





In [13]:
Process_dataset(val_dataset, 'val')

1/8 Adding Embeddings: 100%|██████████| 500/500 [00:38<00:00, 13.00it/s]
2/8 Standardizing Vectors: 100%|██████████| 500/500 [00:11<00:00, 41.90it/s]
3/8 Filtering out rare hashes.: 100%|██████████| 500/500 [00:02<00:00, 177.36it/s]
4/8 Calculating Mean vectors: 100%|██████████| 500/500 [00:21<00:00, 23.74it/s]
5/8 Calculating cosine similarities: 100%|██████████| 500/500 [00:20<00:00, 24.21it/s]
6/8 grouping cosine sims by category: 100%|██████████| 500/500 [00:20<00:00, 24.26it/s]
7/8 Calculating z-scores: 100%|██████████| 500/500 [00:20<00:00, 24.03it/s]
8/8 grouping z-scores by category: 100%|██████████| 500/500 [01:03<00:00,  7.89it/s]

Creating directory: /kaggle/working/val





In [14]:
# print([x for x in os.scandir('/kaggle/working/')])

In [15]:

# ID_dataset = IDAddingDataset(train_dataset, None)


In [16]:
# import gc
# del data
# gc.collect()

In [17]:
# # import networkx as nx
# G = nx.from_numpy_array(ID_dataset[1].adj_mat)

In [18]:
# !pip install graphistry
# # import graphistry

In [19]:
# graphistry.register(api=3, username='lukemiller1987', password='yhwggP56C9tm!WF', protocol='https', server='hub.graphistry.com')

In [20]:
# graphistry.bind(source='src', destination='dst', node='nodeid').plot(G)

In [21]:
# embeddings_dict = {}
# all_hashes=set()
# all_classes = set()
# from tqdm import tqdm
# for graph in tqdm(ID_dataset, desc='1/ Calculating Embeddings'):
#     embedding = get_WL_embedding(graph, 3)
#     embeddings_dict[graph.id] = embedding
#     all_hashes.update(embedding.keys())
#     all_classes.update(graph.y)
# gc.collect()


In [22]:
# all_classes = set(range(5))

In [23]:
# from scipy.sparse import coo_array
# import numpy as np
# from tqdm import tqdm

# all_hashes_new = list(all_hashes)
# empty_vector = np.zeros(len(all_hashes_new))
# index_dict = {all_hashes_new[i]: i for i in range(len(all_hashes_new))}
# new_embedding_dict = {}
# sum_vector = empty_vector.copy()
# indices_to_delete = []

# # Create coo_arrays as before and calculate sum_vector
# for graph in tqdm(ID_dataset):
#     this_vector = empty_vector.copy()
#     for key, value in embeddings_dict[graph.id].items():
#         this_vector[index_dict[key]] = value
#         sum_vector[index_dict[key]] += value
#     new_embedding_dict[graph.id] = coo_array(this_vector)

# gc.collect()

# # Identify indices to delete
# for i, val in enumerate(sum_vector):
#     if val < 7:  # If this value is present in less than 0.2% of samples
#         indices_to_delete.append(i)

# # Function to filter out the indices from coo_array
# def filter_coo_array(sparse_matrix, indices_to_delete):
#     vector = sparse_matrix.todense()[0]
#     vector = np.delete(vector, indices_to_delete)
#     return coo_array(vector)


# # Recreate new_embedding_dict with filtered indices
# filtered_embedding_dict = {}
# for graph_id, sparse_matrix in tqdm(new_embedding_dict.items()):
#     filtered_embedding_dict[graph_id] = filter_coo_array(sparse_matrix, indices_to_delete)

# gc.collect()


In [24]:
# Embedding_dataset = EmbeddingAddingDataset(ID_dataset, filtered_embedding_dict)

In [25]:
# mean_vector = get_mean_vectors(Embedding_dataset, all_hashes, all_classes)


In [26]:
# cos_sims = get_cos_sim(Embedding_dataset, mean_vector)
# class_cos_sims = {_class: [] for _class in all_classes}
# for graph in tqdm(Embedding_dataset):
# #     class_cos_sims[graph.y.item()].append(cos_sims[graph.id])


In [27]:
# z_scores = calc_z_scores(Embedding_dataset, cos_sims, class_cos_sims)


Left off here.

In [28]:
# Create the final transform and dataset
# Final_dataset = FinalAddingDataset(Embedding_dataset, cos_sims, z_scores)

In [29]:
# class_z_scores = {i: [] for i in range(5)}
# for graph in tqdm(Final_dataset):
#     class_z_scores[graph.y.item()].append(graph.z_score)
# class_z_scores = {cat: np.array(scores) for cat, scores in class_z_scores.items()}

In [30]:
# import pickle

# # Saving the dataset
# with open('/kaggle/working/malnetProcessed.pt', 'wb') as file:
#     torch.save(Final_dataset, file)
    
# with open('/kaggle/working/classCossims.pt', 'wb') as file:
#     torch.save(class_cos_sims, file)
    
# with open('/kaggle/working/datasetCossims.pt', 'wb') as file:
#     torch.save(cos_sims, file)
    
# with open('/kaggle/working/zScores.pt', 'wb') as file:
#     torch.save(z_scores, file)
    
# with open('/kaggle/working/ClassZScores.pt', 'wb') as file:
#     torch.save(class_z_scores, file)
    
# with open('/kaggle/working/meanVectors.pt', 'wb') as file:
#     torch.save(mean_vector, file)

# print("Dataset saved successfully.")

In [31]:
# import sys
# sys.exit()

In [32]:
# type(Final_dataset[0].embedding.todense())

In [33]:
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd

# # Assuming Final_dataset is a list or dictionary containing your data
# # Extract embeddings and labels for visualization

# # Step 1: Extract embeddings and labels
# embeddings = {x: np.zeros(Final_dataset[0].embedding.todense()[0].shape) for x in range(5)}
# counts = {x: 0 for x in range(5)}
# for graph in tqdm(Final_dataset):
#     embeddings[graph.y.item()] += graph.embedding.todense()[0]
#     counts[graph.y.item()] += 1
# df_category_means = pd.DataFrame(embeddings).T

# # Step 3: Plot heatmap of the prevalence of each dimension across categories


In [34]:
# counts

In [35]:
# for cat, vec in embeddings.items():
#     print(f'Category {cat}, mean {np.mean(vec)}, max {np.max(vec)}')

In [36]:
# for k, v in embeddings.items():
#     embeddings[k] = v/counts[k]

In [37]:
# for cat, vec in embeddings.items():
#     print(f'Category {cat}, mean {np.mean(vec)}, max {np.max(vec)}')

In [38]:
# embeddings[0].shape

In [39]:
# total_mean = np.zeros(embeddings[0].shape)
# for cat, vec in embeddings.items():
#     total_mean += vec
# total_mean /= 5
    

In [40]:
# difference_embeddings= {}
# for cat in range(5):
#     difference_embeddings[cat] = embeddings[cat]-total_mean
# df_diff_means = pd.DataFrame(embeddings).T

In [41]:
# from matplotlib.colors import LogNorm

# plt.figure(figsize=(20, 5))
# # LogNorm is used for logarithmic color scaling
# sns.heatmap(df_diff_means, cmap='hsv', norm=LogNorm(vmin=1e-6, vmax=500), annot=False, cbar=True, cbar_kws={"label": "Mean prevalence by dimension"})
# plt.title("Mean Embedding Vectors by Category", fontsize=30)
# plt.xlabel("Embedding Dimension", fontsize=16)
# plt.ylabel("Classification Category", fontsize=16)
# plt.yticks(rotation=0)
# plt.show()

In [42]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.preprocessing import StandardScaler, RobustScaler


# # Scaling options:
# # Option 1: Z-Score Normalization (StandardScaler)
# scaler = StandardScaler()

# # Option 2: Robust Scaling (based on IQR to handle outliers)
# # scaler = RobustScaler()

# # Normalize the embeddings by category (per row) using the chosen scaler
# df_normalized = pd.DataFrame(scaler.fit_transform(df_category_means.T).T, columns=df_category_means.columns)

# # Plotting each category's spectrum in separate plots, ignoring zero values
# for category in df_normalized.index:
#     plt.figure(figsize=(10, 6))
    
#     # Get the spectrum values and remove zero values
#     spectrum_values = df_normalized.loc[category]
#     not_small_indices = spectrum_values > 0.3
#     not_big_indices = spectrum_values < 50
#     just_right_indices = not_small_indices & not_big_indices
#     spectrum_values_non_zero = spectrum_values[just_right_indices]
#     dimensions_non_zero = df_normalized.columns[just_right_indices]
    
#     # Plot the spectrum for this category
#     plt.plot(dimensions_non_zero, spectrum_values_non_zero, label=f'Category {category}', linewidth=2)
    
#     # Add labels and title
#     plt.xlabel('Embedding Dimensions')
#     plt.ylabel('Scaled Value')
#     plt.title(f'Scaled Spectrum for Category {category}')
#     plt.grid(True)
    
#     # Show the plot
#     plt.tight_layout()
#     plt.show()


In [43]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.preprocessing import MinMaxScaler

# # Min-Max scaling for each category individually
# scalers = {}
# df_scaled = pd.DataFrame()

# # Apply Min-Max scaling for each category
# for category in df_category_means.index:
#     scaler = MinMaxScaler()
#     scaled_values = scaler.fit_transform(df_category_means.loc[category].values.reshape(-1, 1)).flatten()
#     df_scaled[category] = scaled_values

# df_scaled = df_scaled.T  # Transpose back so categories are rows

# # Plotting all categories on the same chart with different colors
# plt.figure(figsize=(12, 5))

# # Colors for each category
# colors = plt.cm.hsv(np.linspace(0, 0.8, len(df_scaled.index)))

# # Plot each category's spectrum as a line
# for idx, category in enumerate(df_scaled.index):
#     # Get the spectrum values and remove zero values
#     spectrum_values = df_scaled.loc[category]
#     non_zero_indices = spectrum_values > 0.001
#     too_big_indices = spectrum_values < 0.8
#     my_indices = non_zero_indices & too_big_indices
#     spectrum_values_non_zero = spectrum_values[my_indices]
#     dimensions_non_zero = df_scaled.columns[my_indices]
    
#     # Plot the spectrum for this category with a unique color
#     plt.plot(dimensions_non_zero, spectrum_values_non_zero, color=colors[idx], label=f'Category {category}', linewidth=1)

# # Add labels and title
# plt.xlabel('Embedding Dimensions')
# plt.ylabel('Min-Max Scaled Value')
# plt.title('Scaled Spectra of Mean Embeddings by Category')
# plt.grid(True)
# plt.yscale('log')

# # Show the legend
# plt.legend(title='Category')

# # Show the plot
# plt.tight_layout()
# plt.show()


In [44]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd

# # Step 1: Extract embeddings and labels
# # This will use the 'embeddings' and 'labels' you already have in your dataset
# X = np.array(embeddings.values())  # X has shape (600, 27816)
# y = np.array(embeddings.keys())  # Labels for each embedding

# # Step 2: Apply t-SNE to reduce dimensions
# tsne = TSNE(n_components=2, perplexity=30, random_state=42, n_iter=1000, metric='cosine')
# X_tsne = tsne.fit_transform(X)

# # Step 3: Create a DataFrame with the results
# df_tsne = pd.DataFrame(X_tsne, columns=['t-SNE 1', 't-SNE 2'])
# df_tsne['label'] = y

# # Step 4: Plot the t-SNE results using seaborn
# plt.figure(figsize=(12, 8))
# sns.scatterplot(x='t-SNE 1', y='t-SNE 2', hue='label', palette='tab10', data=df_tsne, s=60)
# plt.title("t-SNE Visualization of Embeddings", fontsize=16)
# plt.xlabel("t-SNE 1", fontsize=12)
# plt.ylabel("t-SNE 2", fontsize=12)
# plt.legend(title="Category", loc='best')
# plt.show()


In [45]:
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd

# # Assuming embeddings are already extracted in `X` (shape: 600 embeddings, 27816 dimensions)
# # We'll calculate the correlation matrix across all embedding dimensions

# # Step 1: Convert embeddings to DataFrame for easier manipulation
# df_embeddings = pd.DataFrame(X)  # X has shape (600, 27816), where each column is a dimension

# # Step 2: Calculate the correlation matrix
# corr_matrix = df_embeddings.corr()

# # Step 3: Plot the correlation matrix using seaborn heatmap
# plt.figure(figsize=(12, 10))
# sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, linewidths=0.5)
# plt.title("Correlation Matrix of Embedding Dimensions", fontsize=16)
# plt.xlabel("Embedding Dimension", fontsize=12)
# plt.ylabel("Embedding Dimension", fontsize=12)
# plt.show()


In [46]:
# cos_sims = {i: [] for i in range(5)}
# for graph in tqdm(Final_dataset):
#     cos_sims[graph.y.item()].append(graph.cos_sim)
    
# cos_sims = {i: np.array(cos_sims[i]) for i in cos_sims.keys()}



In [47]:
# import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm

# num_buckets = 40  # 100 bins for increments of 0.01
# bin_edges = np.linspace(0, 1, num_buckets + 1)  # Create bin edges from 0 to 1 in increments of 0.01

# # Create a color map for different categories
# colors = cm.viridis(np.linspace(0, 1, 5))  # Assuming 5 categories

# plt.figure(figsize=(12, 5))

# # Loop through each category and plot their fill-between curve
# for i in range(5):
#     # Get the cosine similarity values for the current category
#     cos_sim_values = np.array(cos_sims[i])
    
#     # Use np.histogram to calculate the counts in each bucket (bin)
#     counts, _ = np.histogram(cos_sim_values, bins=bin_edges)
    
#     # The midpoint of each bin for plotting on the x-axis
#     bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    
#     # Use plt.fill_between to create smooth curves for each category
#     plt.fill_between(bin_centers, counts, color=colors[i], alpha=0.8, label=f'Category {i}')
    
#     # Plot smooth line over the fill_between
#     plt.plot(bin_centers, counts, color=colors[i], linewidth=2, alpha=0.8)

# # Customize the appearance of the plot
# plt.xlabel('Cosine Similarity', fontsize=16, color='white')
# plt.ylabel('Count', fontsize=16, color='white')
# plt.title('Cosine Similarity Distribution for All Categories', fontsize=20, color='white')

# # Set a dark background and gridlines for a more visually engaging look
# plt.style.use('dark_background')
# plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.8)

# # Set the x-axis limits and style
# plt.xlim(0.13, 1)

# # Show the legend to differentiate the categories
# plt.legend(title='Category', fontsize=12)
# plt.yscale('log')
# # Show the plot
# plt.tight_layout()
# plt.show()


In [48]:
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd

# # Assuming `Final_dataset` contains distribution scores for each embedding

# # Step 1: Extract distribution scores and labels
# distribution_scores = []
# labels = []

# for data in Final_dataset:
#     score = data.z_score  # Assuming 'z_score' stores the distribution score for each embedding
#     label = int(data.y.item())  # Extract the class label
    
#     distribution_scores.append(score)
#     labels.append(label)

# # Convert to numpy arrays for easier manipulation
# distribution_scores = np.array(distribution_scores)
# labels = np.array(labels)

# # Step 2: Create a DataFrame for easier plotting
# df_scores = pd.DataFrame({
#     'distribution_score': distribution_scores,
#     'label': labels
# })

# # Step 3: Plot KDE for each category
# plt.figure(figsize=(10, 6))
# sns.kdeplot(data=df_scores, x='distribution_score', hue='label', fill=True, palette='tab10')
# plt.title("KDE Plot of Distribution Scores by Category", fontsize=16)
# plt.xlabel("Distribution Score", fontsize=12)
# plt.ylabel("Density", fontsize=12)
# plt.legend(title="Category")
# plt.show()


In [49]:
# # Step 3: Define a list of colormaps for each category
# colormaps = ['Reds', 'Blues', 'Greens', 'Purples', 'Oranges', 'YlOrRd']  # List of colormaps
# fig, ax = plt.subplots(figsize=(25, 10))

# # Plot each category's heatmap on the same figure, one after the other
# num_categories = len(unique_labels)
# step = df_category_means.shape[0] // num_categories  # Calculate the step for splitting the x-axis
# for i, label in enumerate(unique_labels):
#     # Define the range of columns for this category
#     category_range = range(i * step, (i + 1) * step)
#     # Select the subset of the DataFrame for this category
#     data_subset = df_category_means.iloc[:, [i]]
#     # Plot the heatmap for the current category using a different colormap
#     sns.heatmap(data_subset.T, cmap=colormaps[i % len(colormaps)], norm=LogNorm(vmin=1e-4, vmax=np.max(df_category_means.values)),
#                 annot=False, cbar=(i == num_categories - 1), ax=ax,
#                 yticklabels=False, xticklabels=False, cbar_kws={'orientation': 'vertical'})

# # Customize the overall plot
# ax.set_title("Heatmap with Different Colormaps for Each Category", fontsize=16)
# ax.set_xlabel("Embedding Dimension", fontsize=12)
# ax.set_ylabel("Classification Category", fontsize=12)
# plt.xticks(rotation=45)
# plt.show()

In [50]:
# Final_dataset[8]
# high_graph = nx.from_numpy_array(Final_dataset[8].adj_mat)
# nx.draw_kamada_kawai(high_graph)

In [51]:
# low_embedding = Final_dataset[2].embedding[0]
# high_embedding = Final_dataset[8].embedding[0]

In [52]:
# set(low_embedding.keys()) & set(high_embedding.keys())

In [53]:
# sorted(high_embedding.keys())

In [54]:
# len(high_embedding)

In [55]:
# len(low_embedding and high_embedding)

In [56]:
# len(low_embedding.symmetric_difference(high_embedding))

In [57]:
# common_embeddings = set(low_embedding.keys()) and set(high_embedding.keys())
# both_embeddings = set(low_embedding.keys()) or set(high_embedding.keys())

In [58]:
# set(low_embedding.keys()).symmetric_difference(set(high_embedding.keys()))

In [59]:
# both_embeddings

In [60]:
# set(both_embeddings - common_embeddings)

In [61]:
# num_node_dict = {_cls: {graph.id: graph.num_nodes for graph in Final_dataset if graph.y.item() == _cls or _cls == 'all'} for _cls in all_classes}
# num_node_mean_dict = {_cls: sum(num_node_dict[_cls].values())/len(num_node_dict[_cls].values()) for _cls in all_classes}
# node_diff_dict = {_cls: {graph.id: graph.num_nodes - num_node_mean_dict[_cls] for graph in Final_dataset if graph.y.item() == _cls or _cls == 'all'} for _cls in all_classes}
# pos_diff_dict = {_cls: {} for _cls in all_classes}
# neg_diff_dict = {_cls: {} for _cls in all_classes}
# for _cls, _dict in node_diff_dict.items():
#     for idx, node_diff in _dict.items():
#         if node_diff > 0:
#             pos_diff_dict[_cls][idx] = node_diff
#         else:
#             pos_diff_dict[_cls][idx] = node_diff
# pos_num_node_dict = {_cls: len(pos_diff_dict[_cls]) for _cls in all_classes}
# neg_num_node_dict = {_cls: len(neg_diff_dict[_cls]) for _cls in all_classes}
# percentile_dict = {graph.id: {'all': 0, 'class': 0} for graph in Final_dataset}
# for _cls, _dict in pos_diff_dict.items():
#     for i, (idx, val) in enumerate(sorted(_dict.items(), key = lambda x: x[1])):
#         if _cls != 'all':
#             percentile_dict[idx]['class'] = i/pos_num_node_dict[_cls]
#         else:
#             percentile_dict[idx]['all'] = i/pos_num_node_dict[_cls]
# for _cls, _dict in neg_diff_dict.items():
#     for i, (idx, val) in enumerate(sorted(_dict.items(), key = lambda x: x[1], reverse=True)):
#         if _cls != 'all':
#             percentile_dict[idx]['class'] = i/neg_num_node_dict[_cls]
#         else:
#             percentile_dict[idx]['all'] = i/neg_num_node_dict[_cls]
    

                       

In [62]:
# num_node_percentile_dict = percentile_dict
# for x in range(20):
#     print(num_node_percentile_dict[x])

In [63]:
# cos_sim_dict = {_cls: {} for _cls in all_classes}
# for graph in Final_dataset:
#     cos_sim_dict[graph.y.item()][graph.id] = graph.cos_sim['class']
#     cos_sim_dict['all'][graph.id] = graph.cos_sim['all']
            

# cat_len_dict = {_cls: len(cos_sim_dict[_cls].values()) for _cls in all_classes}
# cos_sim_percentile_dict = {graph.id: {'all': 0, 'class': 0} for graph in Final_dataset}


# for _cls, _dict in cos_sim_dict.items():
#     for i, (idx, val) in enumerate(sorted(_dict.items(), key = lambda x: x[1])):
#         if _cls != 'all':
#             cos_sim_percentile_dict[idx]['class'] = i/cat_len_dict[_cls]
#         else:
#             cos_sim_percentile_dict[idx]['all'] = i/cat_len_dict[_cls]


In [64]:
# for x in range(10):
#     print(cos_sim_percentile_dict[x])

In [65]:
# percentiles = [x for x in range(35, 91, 5)]
# cats = ['class', 'all']
# metrics = ['random', 'graph_order', 'cos_sim']
# train_indices_dict = {cat: {metric: {percentile: [] for percentile in percentiles} for metric in metrics} for cat in cats}
# for percentile in percentiles:
#     for idx, cat_pairs in cos_sim_percentile_dict.items():
#         for cat, val in cat_pairs.items():
#             if val < 0.01*percentile:
#                 train_indices_dict[cat]['cos_sim'][percentile].append(idx)
#     for idx, cat_pairs in num_node_percentile_dict.items():
#         for cat, val in cat_pairs.items():
#             if val < 0.01* percentile:
#                 train_indices_dict[cat]['graph_order'][percentile].append(idx)
#     for _cls, id_pairs in num_node_dict.items():
#         for cat in cats:
#             size = int(percentile * 0.01 * len(id_pairs))
#             train_indices_dict[cat]['random'][percentile] = np.random.choice(list(id_pairs.keys()), size, replace=False)
            
        
        

In [66]:
# import pickle

# # Specify the path to the file where you want to save the dataset
# file_path = f'/kaggle/working/{DATASET}_train_indices_dict.pkl'

# # Saving the dataset
# with open(file_path, 'wb') as file:
#     pickle.dump(train_indices_dict, file)

# print("Dict saved successfully.")

In [67]:
# import matplotlib.pyplot as plt
# # for x in range(6):
# all_z_scores = [graph.num_nodes for graph in Final_dataset]
# num_node_mean = sum(all_z_scores)/600
# all_z_scores = [x - num_node_mean for x in all_z_scores]
# percentiles = [50, 95, 99]  # Change these values based on your requirements (xx%)
# percentile_values = np.percentile(all_z_scores, percentiles)

# plt.figure(figsize=(10, 6))
# plt.hist(all_z_scores, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of Z-Scores with Percentiles')
# plt.xlabel('Z-Score')
# plt.ylabel('Frequency')

# # Add vertical lines for each percentile
# for perc, value in zip(percentiles, percentile_values):
#     plt.axvline(x=value, color='r', linestyle='--', label=f'{perc}th percentile: {value:.2f}')

# plt.legend()
# plt.grid(True)
# plt.show()

In [68]:
# import pickle

# # Specify the path to the file where you want to save the dataset
# file_path = f'/kaggle/working/{DATASET}.pt'

# # Saving the dataset
# with open(file_path, 'wb') as file:
#     torch.save(Final_dataset, file)

# print("Dataset saved successfully.")

In [69]:
# import os

# # Define the directory you want to list
# directory_path = '/kaggle/working/'

# # List all files and directories in the specified path
# contents = os.listdir(directory_path)

# print("Contents of '/kaggle/working/':")
# for item in contents:
#     print(item)