In [2]:
import os
import numpy as np
import h5py
import gc
import psutil
import anndata
import pickle
from scipy import sparse
import tensorflow as tf
from datetime import datetime

In [1]:
import seqdata as sd

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [56]:
# a generator to read examples from h5 file
# create a tf dataset
class generator:
    """Callable that streams (one-hot sequence, binary label) example pairs.

    An instance is meant to be passed to ``tf.data.Dataset.from_generator``:
    calling it opens the HDF5 file and yields one example per row of the
    file's ``'X'`` dataset.

    Args:
        file: path to an HDF5 file containing a dataset named ``'X'``. Row
            ``i`` must hold ``seq_len`` column indices in ``[0, 4)``, one per
            sequence position.
        m: scipy CSR matrix, rows as seqs, cols are cells. The column indices
            stored for row ``i`` are the cells labelled 1 for example ``i``.
        seq_len: number of positions per sequence. Defaults to 1344, the
            value that was previously hard-coded, so existing callers are
            unaffected.
    """

    def __init__(self, file, m, seq_len=1344):
        self.file = file # h5 file for sequence
        self.m = m # csr matrix, rows as seqs, cols are cells
        self.n_cells = m.shape[1]
        self.seq_len = seq_len
        # Pre-built once: the data values and row coordinates are identical
        # for every sequence, only the column coordinates change.
        self.ones = np.ones(seq_len)
        self.rows = np.arange(seq_len)

    def __call__(self):
        """Yield ``(x_tf, y_tf)`` for every row of ``'X'``.

        Yields:
            x_tf: int8 array of shape ``(seq_len, 4)`` with a single 1 per row,
                placed at the column given by the stored index.
            y_tf: int8 array of shape ``(n_cells,)`` with 1 at every column
                that has a stored entry in row ``i`` of ``m``.
        """
        with h5py.File(self.file, 'r') as hf:
            X = hf['X']
            for i in range(X.shape[0]):
                x = X[i]
                # Scatter the per-position indices into a dense one-hot matrix.
                x_tf = sparse.coo_matrix((self.ones, (self.rows, x)),
                                         shape=(self.seq_len, 4),
                                         dtype='int8').toarray()
                # CSR row slice: column indices of the stored entries of row i.
                y = self.m.indices[self.m.indptr[i]:self.m.indptr[i+1]]
                y_tf = np.zeros(self.n_cells, dtype='int8')
                y_tf[y] = 1
                yield x_tf, y_tf

def print_memory():
    """Print the resident set size of the current process in GB (1e9 bytes)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_gb = rss_bytes / 1e9
    print('cpu memory used: %.1fGB.' % rss_gb)

In [4]:
# Directory holding the preprocessed inputs; every file below lives directly in it.
input_dir = '/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/pbmc-granulocyte-sorted-3k_10x-Multiome/processed'
split_file, train_file, val_file, test_file, ad_file = (
    os.path.join(input_dir, fname)
    for fname in (
        'splits.h5',
        'train_seqs.h5',
        'val_seqs.h5',
        'test_seqs.h5',
        'atac_ad.h5ad',
    )
)

# Load data

In [5]:
# Grab the sparse matrix from the anndata object
adata = anndata.read_h5ad(ad_file)
# Axis 0 of the AnnData object is taken as the cell axis.
n_cells = adata.shape[0]
# Transpose so that cells end up on the columns, then store as CSR because the
# generator slices indptr/indices one row at a time.
# NOTE(review): assumes adata.X is a scipy sparse matrix (it calls .tocoo()) — confirm.
m = adata.X.tocoo().transpose().tocsr()

In [6]:
print_memory()     # memory usage, measured before the AnnData object is released
# Only `m` and `n_cells` are used from here on, so drop the AnnData object.
del adata
# Last expression of the cell: the integer shown as output is gc.collect()'s return value.
gc.collect()

cpu memory used: 1.2GB.


125

In [7]:
# Read the train / validation id arrays fully into memory from the splits file.
# NOTE(review): these are later used as row indices into `m` — confirm they are
# positional indices in the same order as the rows of the sequence files.
with h5py.File(split_file, 'r') as hf:
    train_ids, val_ids = (hf[key][:] for key in ('train_ids', 'val_ids'))

In [8]:
# Split into train and val
# Row-select from the full matrix using the ids loaded from the splits file.
m_train = m[train_ids,:]
m_val = m[val_ids,:]
# NOTE: deleting `m` frees memory but makes this cell fail if re-run on its own;
# re-execute the cell that builds `m` first.
del m
gc.collect()
# Displayed as the cell output: (rows, columns) of each split.
m_train.shape, m_val.shape

((27677, 2711), (1537, 2711))

In [None]:
# Read the whole 'X' dataset of the validation file into memory.
# The context manager closes the file even if the read raises.
with h5py.File(val_file, 'r') as val_h5:
    X_val = val_h5['X'][:]

In [57]:
# Create the tf datasets
# Each element is (one-hot sequence, per-cell label vector), both int8,
# produced lazily by the generator reading the training HDF5 file.
train_ds = tf.data.Dataset.from_generator(
     generator(train_file, m_train),
     output_signature=(
          tf.TensorSpec(shape=(1344, 4), dtype=tf.int8),
          # Trailing comma: `(n_cells)` is just an int, `(n_cells,)` is the
          # 1-D shape tuple that is actually meant.
          tf.TensorSpec(shape=(n_cells,), dtype=tf.int8),
     )
)

In [68]:
# Validation dataset: same element layout as the training one, but batched
# with the full validation size so a single batch holds every example.
val_ds = tf.data.Dataset.from_generator(
     generator(val_file, m_val),
     output_signature=(
          tf.TensorSpec(shape=(1344, 4), dtype=tf.int8),
          # Trailing comma: `(n_cells)` is just an int, `(n_cells,)` is the
          # 1-D shape tuple that is actually meant.
          tf.TensorSpec(shape=(n_cells,), dtype=tf.int8),
     )
).batch(m_val.shape[0])

In [63]:
# Display the dataset's element shapes and dtypes.
# NOTE(review): the saved output of this cell shows unbatched shapes, so it appears
# to predate the cell that adds .batch(...) — re-run to refresh it.
val_ds

<FlatMapDataset shapes: ((1344, 4), (2711,)), types: (tf.int8, tf.int8)>

In [62]:
import tensorflow_datasets as tfds

ModuleNotFoundError: No module named 'tensorflow_datasets'

In [70]:
# Get the first batch from the validation dataset. It was batched with the full
# validation size, so this materialises every validation example at once.
# `x` and `y` stay bound after the loop and are reused by later cells.
for x, y in val_ds.take(1):
    print(x.shape, y.shape)

(1537, 1344, 4) (1537, 2711)


In [73]:
import xarray as xr

In [75]:
# Wrap the validation batch tensors as named, dimension-labelled xarray arrays.
ohe_seq_xr = xr.DataArray(
    data=x.numpy(),
    name='ohe_seq',
    dims=['_sequence', '_length', '_ohe'],
)
bin_counts_xr = xr.DataArray(
    data=y.numpy(),
    name='bin_counts',
    dims=['_sequence', '_target'],
)

In [76]:
# Bundle both arrays into one Dataset keyed by variable name.
sdata_val = xr.Dataset(
    data_vars={
        'ohe_seq': ohe_seq_xr,
        'bin_counts': bin_counts_xr,
    }
)

In [78]:
import seqpro as sp

In [100]:
# First 10 one-hot sequences as a plain NumPy array, cast to NumPy's default integer dtype.
test = ohe_seq_xr[:10].values.astype('int')

In [104]:
# Shape check for the label rows matching the 10 selected sequences.
bin_counts_xr[:10].values.shape

(10, 2711)

In [77]:
# Display the assembled validation Dataset.
sdata_val

In [107]:
from eugene.models.zoo import scBasset

In [112]:
# One model output per cell: m_val has cells on axis 1.
# NOTE(review): l1/l2 are presumably regularisation weights — confirm against eugene's scBasset signature.
model = scBasset(num_cells=m_val.shape[1], l1=0.01, l2=0.01)

In [113]:
import torch

In [118]:
# Convert to float32 and swap axes 1 and 2, so the 4-wide one-hot axis comes
# before the sequence-length axis: (10, 1344, 4) -> (10, 4, 1344).
x_tensor = torch.tensor(test, dtype=torch.float32).transpose(1,2)

In [120]:
# Smoke-test the forward pass: expect one row per input sequence and one column per cell.
model(x_tensor).shape

torch.Size([10, 2711])