# Prepare EUGENe SeqData for the `Buenrostro_2018` dataset
Adam Klie (last updated: *09/20/2023*)
***
This notebook shows how to take scBasset-processed data and convert it into EUGENe-ready SeqData objects.

# Set-up

In [None]:
# Load necessary packages
import os
import numpy as np
import h5py
import gc
import xarray as xr
import psutil
import anndata
import pickle
from scipy import sparse
import tensorflow as tf
from datetime import datetime

In [56]:
# a generator to read examples from h5 file to create a tf dataset
class generator:
    """Callable that streams (one-hot sequence, binary label) pairs from disk.

    An instance is meant to be handed to ``tf.data.Dataset.from_generator``:
    calling it opens the HDF5 file and yields one example per row of its
    ``'X'`` dataset.

    Parameters
    ----------
    file : str
        Path to an HDF5 file with a dataset named ``'X'``. Each row of ``'X'``
        is used as the column index (0-3) of the single 1 in every position of
        the one-hot matrix, so rows must have length ``seq_len``.
        (Presumably these are integer-encoded nucleotides; confirm against the
        scBasset preprocessing that wrote the file.)
    m : scipy.sparse.csr_matrix
        CSR matrix, rows as seqs, cols are cells. Row ``i`` must correspond to
        row ``i`` of ``'X'``. Every *stored* entry of a row is turned into a 1
        in the label vector, so explicitly stored zeros would also become 1.
    seq_len : int, optional
        Number of positions per sequence. Defaults to 1344, the value that was
        previously hard-coded.

    Yields
    ------
    x_tf : numpy.ndarray, shape (seq_len, 4), dtype int8
        One-hot encoded sequence.
    y_tf : numpy.ndarray, shape (n_cells,), dtype int8
        1 for every cell that has a stored entry for this sequence, else 0.
    """

    def __init__(self, file, m, seq_len=1344):
        self.file = file # h5 file for sequence
        self.m = m # csr matrix, rows as seqs, cols are cells
        self.n_cells = m.shape[1]
        self.seq_len = seq_len
        # Pre-built once and reused for every example: the data values and
        # the row coordinates of the one-hot COO matrix never change.
        self.ones = np.ones(seq_len)
        self.rows = np.arange(seq_len)

    def __call__(self):
        with h5py.File(self.file, 'r') as hf:
            X = hf['X']
            for i in range(X.shape[0]):
                # One row of column indices -> (seq_len, 4) one-hot matrix.
                x = X[i]
                x_tf = sparse.coo_matrix((self.ones, (self.rows, x)),
                                         shape=(self.seq_len, 4),
                                         dtype='int8').toarray()
                # CSR row slice: column ids (cells) stored for sequence i.
                y = self.m.indices[self.m.indptr[i]:self.m.indptr[i+1]]
                y_tf = np.zeros(self.n_cells, dtype='int8')
                y_tf[y] = 1
                yield x_tf, y_tf

def print_memory():
    """Print the resident set size (RSS) of the current process in GB (1e9 bytes)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_gb = rss_bytes / 1e9
    print('cpu memory used: %.1fGB.' % rss_gb)

In [4]:
# Set-up the paths to data (TODO: change to your own paths)
# Every input file lives directly inside `input_dir`.
input_dir = '/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/Buenrostro_2018/processed'
split_file, train_file, val_file, test_file, ad_file = (
    os.path.join(input_dir, file_name)
    for file_name in (
        'splits.h5',       # train/val/test row ids
        'train_seqs.h5',   # encoded training sequences
        'val_seqs.h5',     # encoded validation sequences
        'test_seqs.h5',    # encoded test sequences
        'atac_ad.h5ad',    # AnnData holding the accessibility matrix
    )
)

# Load data

In [5]:
# Read the AnnData and keep only what later cells need:
# the number of cells and the accessibility matrix.
adata = anndata.read_h5ad(ad_file)
# AnnData rows are cells, so the first axis length is the cell count.
n_cells = adata.shape[0]
# Transpose so rows index sequences and columns index cells, stored as CSR
# because the generator slices it row by row via `indptr`/`indices`.
m = (
    adata.X
    .tocoo()
    .transpose()
    .tocsr()
)

In [6]:
# Check memory usage
# The report is printed while `adata` is still alive, so the number includes it.
# The AnnData is then dropped: the cells visible below use `m` and `n_cells`,
# not `adata`. `gc.collect()` is the cell's last expression, so the notebook
# displays its return value.
print_memory()
del adata
gc.collect()

cpu memory used: 1.2GB.


125

In [7]:
# Load the row ids of each partition from the splits file.
# `[:]` copies each dataset into memory, so the arrays stay valid after the file closes.
with h5py.File(split_file, 'r') as splits_h5:
    train_ids, val_ids, test_ids = (
        splits_h5[key][:] for key in ('train_ids', 'val_ids', 'test_ids')
    )

In [8]:
# Row-slice the sequence-by-cell matrix into the train, val and test partitions
m_train, m_val, m_test = (
    m[split_ids, :] for split_ids in (train_ids, val_ids, test_ids)
)
# The full matrix is not needed once the three slices exist; release it.
del m
gc.collect()
m_train.shape, m_val.shape, m_test.shape

((27677, 2711), (1537, 2711))

In [57]:
# Create the tf datasets
def _make_full_batch_dataset(seq_file, m_split):
    """Build a tf dataset that delivers a whole split as one single batch.

    Parameters
    ----------
    seq_file : str
        HDF5 file with the encoded sequences of the split.
    m_split : scipy.sparse.csr_matrix
        Sequence-by-cell matrix of the same split (rows aligned with `seq_file`).

    Returns
    -------
    tf.data.Dataset
        Yields (x, y) int8 batches; the batch size is the number of rows of
        `m_split`, so the first batch contains every example of the split.
    """
    return tf.data.Dataset.from_generator(
        generator(seq_file, m_split),
        output_signature=(
            tf.TensorSpec(shape=(1344, 4), dtype=tf.int8),
            # 1-tuple shape; the label length is taken from the matrix itself,
            # which is the same value the generator uses for its label vector.
            tf.TensorSpec(shape=(m_split.shape[1],), dtype=tf.int8),
        ),
    ).batch(m_split.shape[0])


train_ds = _make_full_batch_dataset(train_file, m_train)
val_ds = _make_full_batch_dataset(val_file, m_val)
test_ds = _make_full_batch_dataset(test_file, m_test)

In [70]:
# Create the train, val and test seqdata
def _dataset_to_seqdata(ds):
    """Materialize the first batch of `ds` as an xarray Dataset.

    Parameters
    ----------
    ds : tf.data.Dataset
        Dataset of (x, y) batches. Only the first batch is read, so `ds` should
        be batched such that one batch holds the whole split.

    Returns
    -------
    xarray.Dataset
        'ohe_seq' with dims (_sequence, _length, _ohe) and
        'bin_counts' with dims (_sequence, _target).
    """
    # `ds.take(1)` is itself a Dataset holding ONE element (the (x, y) tuple).
    # Unpacking it directly (`x, y = ds.take(1)`) iterates the dataset and fails
    # with "not enough values to unpack"; pull the element out first.
    x, y = next(iter(ds.take(1)))
    ohe_seq_xr = xr.DataArray(
        x.numpy(),
        dims=['_sequence', '_length', '_ohe'],
        name='ohe_seq'
    )
    bin_counts_xr = xr.DataArray(
        y.numpy(),
        dims=['_sequence', '_target'],
        name='bin_counts'
    )
    return xr.Dataset({'ohe_seq': ohe_seq_xr, 'bin_counts': bin_counts_xr})


sdata_train = _dataset_to_seqdata(train_ds)
sdata_val = _dataset_to_seqdata(val_ds)
sdata_test = _dataset_to_seqdata(test_ds)

(1537, 1344, 4) (1537, 2711)


In [None]:
# Write one zarr store per split next to the input files
train_zarr = os.path.join(input_dir, 'train_seqdata.zarr')
val_zarr = os.path.join(input_dir, 'val_seqdata.zarr')
test_zarr = os.path.join(input_dir, 'test_seqdata.zarr')

sdata_train.to_zarr(train_zarr)
sdata_val.to_zarr(val_zarr)
sdata_test.to_zarr(test_zarr)

# DONE!

---