In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import pickle
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import h5py
from scipy.sparse import dok_matrix
import gc
from scipy.spatial.distance import cosine
from numpy.linalg import norm

from scipy.sparse import csr_matrix
import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ppa-dataset/__results__.html
/kaggle/input/ppa-dataset/graphidx2speciesid.csv
/kaggle/input/ppa-dataset/valid.csv
/kaggle/input/ppa-dataset/graph_data.hdf5
/kaggle/input/ppa-dataset/__notebook__.ipynb
/kaggle/input/ppa-dataset/__output__.json
/kaggle/input/ppa-dataset/train.csv
/kaggle/input/ppa-dataset/test.csv
/kaggle/input/ppa-dataset/custom.css


In [2]:
import h5py
from tqdm import tqdm
import numpy as np
import pickle

file_path = '/kaggle/input/ppa-dataset/graph_data.hdf5'
all_hashes = set()

# Single pass over the HDF5 file: gather every distinct value of the 'hash'
# field found in any graph's 'embedding' dataset.
with h5py.File(file_path, 'r') as f:
    for graph_name in tqdm(f.keys()):
        # Load the whole structured 'embedding' array, then select the
        # 'hash' field by name.
        graph_hashes = f[graph_name]['embedding'][:]['hash']
        # In-place set union keeps only hashes not seen before.
        all_hashes |= set(graph_hashes)

100%|██████████| 158100/158100 [12:40<00:00, 207.86it/s]


In [3]:
import gc
# Run a full garbage-collection pass now that the hash-gathering loop is done.
# gc.collect() returns the number of unreachable objects it found, which the
# notebook shows as this cell's output.
gc.collect()

0

In [4]:
# Freeze the set into a sorted array so the hash order is deterministic.
all_hashes_list = np.asarray(sorted(all_hashes))

# Persist the sorted hashes as the 'all_hashes' dataset of an HDF5 file.
with h5py.File('/kaggle/working/all_hashes.h5', mode='w') as hashes_file:
    hashes_file.create_dataset(name='all_hashes', data=all_hashes_list)

In [5]:
# Map every distinct hash to a column index.
#
# The indices follow the order of `all_hashes_list` (the sorted array that was
# written to all_hashes.h5), so column j of the feature matrix corresponds to
# entry j of that saved dataset. Enumerating the raw `all_hashes` set instead
# would assign indices in set-iteration order, which is arbitrary and does not
# match the sorted array on disk.
hash_to_idx = {hash_: idx for idx, hash_ in enumerate(all_hashes_list)}
assert len(hash_to_idx) == len(all_hashes), "hash -> index mapping lost or merged entries"

# Structured dtype for the (hash, index) pairs. The hash field is 32 bytes
# wide by default and is widened when the stored hashes are longer, so a hash
# is never silently truncated on write.
hash_width = 32
if all_hashes_list.dtype.kind == 'S':
    hash_width = max(hash_width, all_hashes_list.dtype.itemsize)
dt = np.dtype([('hash', f'S{hash_width}'), ('index', np.int32)])

# Fill the structured array column-wise; this avoids materialising a Python
# list with one tuple per hash.
data = np.empty(len(hash_to_idx), dtype=dt)
data['hash'] = all_hashes_list
data['index'] = np.arange(len(hash_to_idx), dtype=np.int32)
with h5py.File('/kaggle/working/hash_to_idx.h5', 'w') as f:
    # Create a dataset from the structured array
    f.create_dataset('hash_to_idx', data=data)
del data
gc.collect()

0

In [6]:
from scipy.sparse import dok_matrix

# Build the (graph x hash) count matrix.
#
# The matrix is assembled directly as CSR arrays (values, column indices, row
# pointer) rather than by assigning one element at a time into a dok_matrix,
# which pays a large per-element Python overhead. Semantics of the
# element-wise DOK fill are kept:
#   * if a hash occurs more than once in a graph, the last count wins;
#   * zero counts are not stored;
#   * values are float64 (the dok_matrix default dtype).
num_hashes = len(all_hashes)
with h5py.File(file_path, 'r') as f:
    num_graphs = len(f.keys())
    # row_ptr[i]:row_ptr[i + 1] delimits the stored entries of graph i.
    row_ptr = np.zeros(num_graphs + 1, dtype=np.int64)
    col_chunks, val_chunks = [], []
    # Wrap the iterable itself (not enumerate) and pass the total so tqdm can
    # show a percentage and an ETA.
    for i, graph_name in enumerate(tqdm(f.keys(), total=num_graphs)):
        embedding_hashes = f[graph_name]['embedding'][:]
        # column index -> count for this graph; later duplicates overwrite.
        row_counts = {}
        for hash_bytes, count in embedding_hashes:
            row_counts[hash_to_idx[hash_bytes]] = count
        cols = np.fromiter(row_counts.keys(), dtype=np.int64, count=len(row_counts))
        vals = np.fromiter(row_counts.values(), dtype=np.float64, count=len(row_counts))
        keep = vals != 0  # a sparse matrix should not carry explicit zeros
        col_chunks.append(cols[keep])
        val_chunks.append(vals[keep])
        row_ptr[i + 1] = row_ptr[i] + np.count_nonzero(keep)

col_idx = np.concatenate(col_chunks) if col_chunks else np.empty(0, dtype=np.int64)
counts = np.concatenate(val_chunks) if val_chunks else np.empty(0, dtype=np.float64)
matrix = csr_matrix((counts, col_idx, row_ptr), shape=(num_graphs, num_hashes))
# Column indices were appended in file order; sort them within each row.
matrix.sort_indices()

del col_chunks, val_chunks, col_idx, counts, row_ptr
del hash_to_idx
gc.collect()

158100it [45:23, 58.04it/s]


0

In [7]:
# Convert the assembled `matrix` to CSR and drop the original to free memory.
matrix_csr = csr_matrix(matrix)
del matrix
gc.collect()

# Store the CSR components (plus the shape) as separate HDF5 datasets so the
# matrix can be rebuilt later with csr_matrix((data, indices, indptr), shape).
with h5py.File('/kaggle/working/matrix_csr.h5', 'w') as f:
    for component in ('data', 'indices', 'indptr', 'shape'):
        f.create_dataset(component, data=getattr(matrix_csr, component))

In [8]:
# Per-column statistics over all rows of the CSR matrix:
#   mean     = E[x]
#   variance = E[x^2] - (E[x])^2
mean_vector = matrix_csr.mean(axis=0)
mean_of_squares = matrix_csr.power(2).mean(axis=0)
variance_vector = mean_of_squares - np.square(mean_vector)
del matrix_csr, mean_of_squares
gc.collect()

# Write both statistics to one HDF5 file, one dataset per vector.
with h5py.File('/kaggle/working/statistics.h5', 'w') as f:
    for dataset_name, values in (('mean_vector', mean_vector),
                                 ('variance_vector', variance_vector)):
        f.create_dataset(dataset_name, data=values)

from scipy.spatial.distance import cosine
from numpy.linalg import norm

# Cosine similarity between every graph's count vector (a row of the CSR
# matrix) and the mean count vector.

# Flatten to a plain 1-D array so the dot product below returns a 1-D result.
mean_vector = np.array(mean_vector).flatten()

# Reload the CSR components that were written to matrix_csr.h5.
with h5py.File('/kaggle/working/matrix_csr.h5', 'r') as f:
    data = f['data'][:]
    indices = f['indices'][:]
    indptr = f['indptr'][:]
    # The shape was stored as a dataset; turn it back into a tuple of ints.
    shape = tuple(int(dim) for dim in f['shape'][:])

matrix_csr = csr_matrix((data, indices, indptr), shape=shape)

mean_vector_norm = mean_vector / norm(mean_vector)

# Euclidean norm of each row. np.linalg.norm cannot be used here: it does not
# support SciPy sparse matrices (NumPy wraps them as a 0-d object array), so
# the row norms are computed from the element-wise squares instead.
row_sq_sums = np.asarray(matrix_csr.multiply(matrix_csr).sum(axis=1)).ravel()
graph_norms = np.sqrt(row_sq_sums)

# Dot product of each graph vector with the unit-length mean vector.
dot_products = matrix_csr.dot(mean_vector_norm)

# Cosine similarity per graph. It is undefined for all-zero rows; those are
# reported as NaN explicitly rather than via a divide-by-zero warning.
cos_sims = np.full(dot_products.shape, np.nan)
np.divide(dot_products, graph_norms, out=cos_sims, where=graph_norms > 0)