# Build a SeqData object from a pycisTopic run `pbmc-granulocyte-sorted-3k_10x-Multiome`
Adam Klie (last updated: *09/20/2023*)
***
This notebook shows how to build a SeqData object from the pycisTopic run `pbmc-granulocyte-sorted-3k_10x-Multiome`. The SeqData object is then split by chromosome and saved to disk as Zarr stores.

# Set-up

In [7]:
# Load necessary packages
import os
import numpy as np
import seqdata as sd
import xarray as xr
from eugene import preprocess as pp

In [8]:
# Name of the pycisTopic run; it is also the prefix of the input/output file names used below.
dataset_name = "pbmc-granulocyte-sorted-3k_10x-Multiome"
# Directory holding the processed arrays for this run. The run name is taken
# from `dataset_name` instead of being repeated inside the literal, so pointing
# the notebook at another run only requires changing one variable.
input_dir = f'/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/{dataset_name}/processed'

In [9]:
# Load the three per-run arrays stored under `input_dir`; every file name is
# prefixed with `dataset_name`.
# NOTE(review): `allow_pickle=True` lets np.load unpickle arbitrary objects, so
# only point this at files you trust.
targets = np.load(os.path.join(input_dir, f"{dataset_name}_labels.npy"))
names = np.load(
    os.path.join(input_dir, f"{dataset_name}_regions.npy"),
    allow_pickle=True,
)
seqs = np.load(
    os.path.join(input_dir, f"{dataset_name}_seqs.npy"),
    allow_pickle=True,
)

In [10]:
# Assemble the loaded arrays into a single xarray Dataset.
# `seqs` and `names` share the "_sequence" dimension; `targets` is declared as
# 2-D with dimensions ("_sequence", "_topic").
sdata_vars = {}
sdata_vars["seqs"] = ("_sequence", seqs)
sdata_vars["chr:start:end"] = ("_sequence", names)
sdata_vars["topics"] = (["_sequence", "_topic"], targets)
sdata = xr.Dataset(sdata_vars)

In [11]:
# Keep only the sequences that contain no "N".
# The check is made on an upper-cased copy of each sequence so that a lower-case
# "n" is caught too: this filter runs before the sequences themselves are
# upper-cased, so a case-sensitive test would let such sequences through.
# dtype=bool keeps the mask boolean even when there are no sequences at all.
mask = np.array(
    ["N" not in seq.upper() for seq in sdata["seqs"].values],
    dtype=bool,
)
sdata = sdata.sel(_sequence=mask)

In [12]:
# Normalise case: replace the stored sequences with their upper-cased form.
seqs_upper = sdata["seqs"].str.upper()
sdata["seqs"] = seqs_upper

In [13]:
# One-hot encode the sequences held under "seqs".
# NOTE(review): the return value is not captured, so the encoding is presumably
# written into `sdata` under `ohe_key` — confirm against the eugene docs.
pp.ohe_seqs_sdata(
    sdata,
    seq_key="seqs",
    ohe_key="ohe_seqs",
)

In [14]:
# Add a per-sequence "chrom" variable: the text before the first ":" of each
# region name stored under "chr:start:end".
chroms = [region.partition(":")[0] for region in sdata["chr:start:end"].values]
sdata["chrom"] = xr.DataArray(chroms, dims="_sequence")

In [15]:
# Chromosome-based train/test split with chr2 as the test chromosome.
# The result is read back from sdata["train_test"] in the next cell.
pp.train_test_chrom_split(sdata, test_chroms=["chr2"], train_key="train_test")

In [16]:
# "train_test" is used as a boolean mask over the sequences: rows where it is
# True go to the training set, the remaining rows to the test set.
is_train = sdata["train_test"]
train_sdata = sdata.sel(_sequence=is_train)
test_sdata = sdata.sel(_sequence=~is_train)

In [17]:
# Second chromosome-based split, applied to the training subset only, with chr3
# as the held-out chromosome and "train_val" as the key.
pp.train_test_chrom_split(
    train_sdata,
    test_chroms=["chr3"],
    train_key="train_val",
)

In [18]:
# Write the training subset to "<dataset_name>.train.zarr" inside `input_dir`.
train_zarr_path = os.path.join(input_dir, f"{dataset_name}.train.zarr")
sd.to_zarr(train_sdata, train_zarr_path)


In [19]:
# Write the test subset to "<dataset_name>.test.zarr" inside `input_dir`.
test_zarr_path = os.path.join(input_dir, f"{dataset_name}.test.zarr")
sd.to_zarr(test_sdata, test_zarr_path)

# DONE!

---