In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import h5py
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ppa-vectors/hash_to_idx.h5
/kaggle/input/ppa-vectors/__results__.html
/kaggle/input/ppa-vectors/all_hashes.h5
/kaggle/input/ppa-vectors/__notebook__.ipynb
/kaggle/input/ppa-vectors/__output__.json
/kaggle/input/ppa-vectors/matrix_csr.h5
/kaggle/input/ppa-vectors/statistics.h5
/kaggle/input/ppa-vectors/custom.css
/kaggle/input/ppa-dataset/__results__.html
/kaggle/input/ppa-dataset/graphidx2speciesid.csv
/kaggle/input/ppa-dataset/valid.csv
/kaggle/input/ppa-dataset/graph_data.hdf5
/kaggle/input/ppa-dataset/__notebook__.ipynb
/kaggle/input/ppa-dataset/__output__.json
/kaggle/input/ppa-dataset/train.csv
/kaggle/input/ppa-dataset/test.csv
/kaggle/input/ppa-dataset/custom.css


In [2]:
import gc
gc.collect()

0

In [3]:
import h5py
import pandas as pd
import numpy as np
from scipy.sparse import dok_matrix
from tqdm import tqdm
import gc

# Load the embedding hashes
with h5py.File('/kaggle/input/ppa-vectors/all_hashes.h5', 'r') as f:
    embedding_hashes = list(f['all_hashes'])



In [4]:
embedding_hashes

[b'000000098189efb7f7cb0ba670b659c5',
 b'00000030b28a20377712568a13d97c7b',
 b'0000008ec1aa4b054319c72ea4dac89a',
 b'000000960b99fa2d6ea22989e02a2e13',
 b'000000b46609eaa50f3dd6ee49fbcb68',
 b'0000014f5bc89c391426b7d2a3f8c69b',
 b'000001523beac9f7b55a5e431e7ed45e',
 b'000001c51e52de3d8029bf888a3cf3bf',
 b'000001c8bf39eb3e8dab8d2be5fc5d7a',
 b'000001dcf0ad5a39a622907477f2d71b',
 b'00000219ceb056041cc4449d57cdcae7',
 b'0000027f2e901c682032acfa55d2a8da',
 b'000003066fff0880288a299287beb0a5',
 b'00000307599d64d3eeefa75268b19d04',
 b'0000030bf706f07124ccfb692e876973',
 b'0000031d10f94adcbc4da8a9697b215c',
 b'0000038ccf26988188d6abdf626a7528',
 b'000003c0213f250be0d9b346c338154e',
 b'0000042975a81f840dd781b18f506043',
 b'000004b8da94aa8a6520a359309f723d',
 b'000004d21478e1dfab4f5e07a0584250',
 b'00000500c47c1e27b6f4752dfe5001f9',
 b'0000051a484105f05c51e0b1ac66fd25',
 b'00000585e8e50ada46322bae664d1dc0',
 b'0000059fd494f7fb6e84107d0d7ca624',
 b'000005c103f7b6ed4d757c2745d257f6',
 b'000005cfc

In [5]:
gc.collect()

0

In [6]:
# Create a mapping from embedding hash to column index
hash_to_index = {hash_val: idx for idx, hash_val in enumerate(embedding_hashes)}

# Load graph names
with h5py.File('/kaggle/input/ppa-dataset/graph_data.hdf5', 'r') as f:
    graph_names = list(f.keys())

# Initialize a sparse dictionary of keys (DOK) matrix
n_graphs = len(graph_names)
n_hashes = len(embedding_hashes)
sparse_matrix = dok_matrix((n_graphs, n_hashes), dtype=np.int32)

# Create a DataFrame to hold labels
df_labels = pd.DataFrame(index=graph_names, columns=['label'], dtype=np.int32)

gc.collect()

0

In [7]:
# Populate the sparse matrix and labels DataFrame
with h5py.File('/kaggle/input/ppa-dataset/graph_data.hdf5', 'r') as f:
    for idx, graph_name in tqdm(enumerate(graph_names), total=n_graphs):
        embedding = f[graph_name]['embedding']
        label = f[graph_name].attrs['graph_label']
        df_labels.loc[graph_name, 'label'] = label
        for emb in embedding:
            embedding_hash = emb[0]  # assuming emb[0] is the embedding hash
            count = emb[1]           # assuming emb[1] is the count
            column_index = hash_to_index[embedding_hash]
            sparse_matrix[idx, column_index] = count

gc.collect()


100%|██████████| 158100/158100 [3:13:48<00:00, 13.60it/s]


0

In [8]:
gc.collect()

0

In [9]:
# Convert DOK matrix to CSR for efficient arithmetic and matrix operations
sparse_matrix_csr = sparse_matrix.tocsr()
del sparse_matrix
gc.collect()


0

In [10]:
# Calculate column sums and filter columns
threshold = 20
column_sums = np.array(sparse_matrix_csr.sum(axis=0)).flatten()  # Sum columns
columns_to_keep = np.where(column_sums > threshold)[0]  # Find indices of columns to keep
del column_sums
reduced_matrix_csr = sparse_matrix_csr[:, columns_to_keep]  #
del sparse_matrix_csr, columns_to_keep
gc.collect()
sparse_matrix_csr = reduced_matrix_csr

In [11]:
# To combine the labels with the sparse matrix for statistical learning:
from scipy.sparse import hstack

labels = df_labels['label'].values.reshape(-1, 1)
labels_sparse = dok_matrix(labels)
del labels
gc.collect()


0

In [12]:
labels_sparse_csr = labels_sparse.tocsr()
del labels_sparse
gc.collect()



0

In [13]:
# Combine feature matrix and labels
full_matrix = hstack([sparse_matrix_csr, labels_sparse_csr])
del sparse_matrix_csr, labels_sparse_csr
gc.collect()


0

In [14]:
# Save to disk for later use if needed
from scipy.sparse import save_npz
save_npz('full_sparse_matrix.npz', full_matrix)

In [15]:
# X = full_matrix[:, :-1]  # Features
# y = full_matrix[:, -1].toarray().ravel()  # Labels
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score

# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)

# # Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
