# Build a SeqData object from a pycisTopic run

In [7]:
import os
import numpy as np
import seqdata as sd
import xarray as xr
from eugene import preprocess as pp

In [8]:
# Name of the dataset; it is both the directory name and the file-name prefix
# of the .npy inputs and .zarr outputs used below.
dataset_name = "multiome_cells_all_peaks"
# Directory for the input arrays and directory for the written zarr stores.
# Both currently resolve to the same path.
data_dir = f"/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/{dataset_name}"
output_dir = f"/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/{dataset_name}"

In [9]:
# Load the three input arrays for this dataset. They are inputs, so they are
# read from data_dir (previously defined but unused) rather than output_dir;
# the two only happen to point at the same place right now.
targets = np.load(os.path.join(data_dir, dataset_name + "_labels.npy"))
# SECURITY: allow_pickle=True unpickles arbitrary objects and can execute code
# on load; only use it on files produced by a trusted pipeline.
names = np.load(os.path.join(data_dir, dataset_name + "_regions.npy"), allow_pickle=True)
seqs = np.load(os.path.join(data_dir, dataset_name + "_seqs.npy"), allow_pickle=True)

In [10]:
# Assemble the loaded arrays into a single xarray Dataset: sequences and
# region names run along "_sequence"; the labels array is indexed by
# ("_sequence", "_topic").
sdata = xr.Dataset(
    data_vars={
        "seqs": ("_sequence", seqs),
        "chr:start:end": ("_sequence", names),
        "topics": (("_sequence", "_topic"), targets),
    }
)

In [11]:
# Keep only sequences without an ambiguous base. The check is case-insensitive
# because this filter runs before the sequences are upper-cased, so a lowercase
# "n" would otherwise survive and become an "N" afterwards.
# dtype=bool keeps the mask a valid boolean indexer even if there are no sequences.
mask = np.array(["N" not in seq.upper() for seq in sdata["seqs"].values], dtype=bool)
sdata = sdata.sel(_sequence=mask)

In [12]:
# Normalise case: replace the "seqs" variable with its upper-cased version.
upper_seqs = sdata["seqs"].str.upper()
sdata["seqs"] = upper_seqs

In [13]:
# One-hot encode the "seqs" variable, with "ohe_seqs" given as the output key.
# NOTE(review): the return value is discarded, so this relies on eugene
# updating sdata in place — confirm against the installed version.
pp.ohe_seqs_sdata(
    sdata,
    seq_key="seqs",
    ohe_key="ohe_seqs",
)

In [14]:
# Derive a per-sequence "chrom" variable: the text before the first ":" of
# each "chr:start:end" region name.
sdata["chrom"] = xr.DataArray(
    [region.partition(":")[0] for region in sdata["chr:start:end"].values],
    dims="_sequence",
)

In [15]:
# Chromosome-based train/test split with chr2 as the test chromosome; the
# result is requested under the "train_test" key.
# NOTE(review): presumably reads the "chrom" variable added above and writes a
# boolean "train_test" into sdata in place — confirm against eugene.
pp.train_test_chrom_split(sdata, train_key="train_test", test_chroms=["chr2"])

In [16]:
# Partition on the "train_test" flag: entries where it is True go to
# train_sdata, the complement goes to test_sdata.
is_train = sdata["train_test"]
train_sdata = sdata.sel(_sequence=is_train)
test_sdata = sdata.sel(_sequence=~is_train)

In [17]:
# Second chromosome-based split, applied to the training subset only, with
# chr3 as the held-out chromosome and "train_val" as the result key.
pp.train_test_chrom_split(
    train_sdata,
    train_key="train_val",
    test_chroms=["chr3"],
)

In [18]:
# Write the training subset to "<dataset_name>.train.zarr" under output_dir.
train_zarr_path = os.path.join(output_dir, dataset_name + ".train.zarr")
sd.to_zarr(train_sdata, train_zarr_path)


In [19]:
# Write the held-out test subset to "<dataset_name>.test.zarr" under output_dir.
test_zarr_path = os.path.join(output_dir, dataset_name + ".test.zarr")
sd.to_zarr(test_sdata, test_zarr_path)

## Script

In [57]:
%%bash
# Run the standalone build_seqdata.py script for one dataset inside the
# eugene_dev environment.
source activate eugene_dev
script=/cellar/users/aklie/projects/ML4GLand/collabs/er_stress_regulation/scripts/build_seqdata.py
dataset_name=beta_cells_all_peaks
data_dir=/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/$dataset_name
output_dir=/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/$dataset_name
# Hold the command as an array and expand it quoted, so each argument stays a
# single word even if a path ever contains spaces or glob characters.
cmd=(
    python "$script"
    --dataset_name "$dataset_name"
    --data_dir "$data_dir"
    --output_dir "$output_dir"
)
# Log the exact command before running it.
echo "${cmd[@]}"
"${cmd[@]}"

python /cellar/users/aklie/projects/ML4GLand/collabs/er_stress_regulation/scripts/build_seqdata.py --dataset_name beta_cells_all_peaks --data_dir /cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/beta_cells_all_peaks --output_dir /cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/beta_cells_all_peaks
Loading targets, names, and seqs from npy files
Preprocessing
SeqData object modified:
	seqs: ['NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccaaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaaccctaaaccctaaccctaaccctaaccctaaccctaaccccaaccccaaccccaaccccaaccccaaccccaaccctaacccctaaccctaaccctaaccct'
 'taaatccgaacctgaacccgaaccctaaccataacccaaacccgaacccaaaccctaacccctaacccctaac

One-hot encoding sequences: 100%|██████████| 134632/134632 [00:21<00:00, 6399.67it/s]


---

# Scratch