# Build a SeqData object from a pycisTopic run

In [20]:
import os
import numpy as np
import pandas as pd
import seqdata as sd

from eugene import preprocess as pp

In [40]:
# Dataset identifier; it is both the per-dataset directory name and the prefix
# of every .npy / .h5sd file referenced further down.
dataset_name = "multiome_cells_all_peaks"
# Input and output directories currently resolve to the same location.
data_dir = f"/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/{dataset_name}"
output_dir = f"/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/{dataset_name}"

In [41]:
# Load the label matrix, region names, and sequences saved as .npy files.
# These are inputs, so they are read from data_dir; output_dir is reserved for
# the files this notebook writes.
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load files from a trusted source.
targets = np.load(os.path.join(data_dir, f"{dataset_name}_labels.npy"))
names = np.load(os.path.join(data_dir, f"{dataset_name}_regions.npy"), allow_pickle=True)
seqs = np.load(os.path.join(data_dir, f"{dataset_name}_seqs.npy"), allow_pickle=True)

# Fail early if the three arrays do not describe the same set of regions.
assert len(targets) == len(names) == len(seqs), (
    f"length mismatch: targets={len(targets)}, names={len(names)}, seqs={len(seqs)}"
)

In [42]:
# One column per topic, labelled Topic1..TopicK where K is the width of `targets`;
# rows are indexed by region name.
topic_names = [f"Topic{k}" for k in range(1, targets.shape[1] + 1)]
topic_df = pd.DataFrame(targets, index=names, columns=topic_names)

In [43]:
# Bundle the sequences, their region names, and the per-region topic table
# into a single SeqData container.
sdata = sd.SeqData(seqs=seqs, names=names, seqs_annot=topic_df)

In [44]:
# Clean the sequence strings in place (the cell output reports `seqs` as modified).
# NOTE(review): presumably this normalizes case/whitespace — confirm against the
# eugene docs, since the "N" filter later in the notebook only matches an uppercase N.
pp.sanitize_seqs_sdata(sdata)

SeqData object modified:
	seqs: ['NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccaaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaaccctaaaccctaaccctaaccctaaccctaaccctaaccccaaccccaaccccaaccccaaccccaaccccaaccctaacccctaaccctaaccctaaccct'
 'taaatccgaacctgaacccgaaccctaaccataacccaaacccgaacccaaaccctaacccctaacccctaaccctaaccctaccctaacccaaccctaacccaaccctaactctagccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacaccctaaccctaaccctaaccctaaccctaaccctaaccctaacaaccctaaccctaaccctaacaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaaccctaaaccctaaccctaaccctaaccctaaccctaacccctaacccctaaccctaaccctaaccctaaccctaaccctaaccctcgcggtaccctcagccggcccgcccgcccgggtctgacctgaggagaactgtgctccgccttcagagtaccaccgaaatctgtgcagaggacaac

In [45]:
# Boolean mask that is True for sequences without any "N" character;
# only those sequences are kept.
mask = np.array(["N" not in seq for seq in sdata.seqs])
sdata = sdata[mask]

In [46]:
# Display how many sequences remain after dropping those containing "N".
sdata.n_obs

149347

In [47]:
# Adds `chr`, `start`, and `end` columns to seqs_annot (see the cell output).
# presumably parsed from the region names — TODO confirm against the eugene docs.
pp.add_ranges_sdata(sdata)

SeqData object modified:
    seqs_annot:
        + chr, end, start


In [48]:
# One-hot encode every sequence; the cell output reports `ohe_seqs` being populated.
pp.ohe_seqs_sdata(sdata)

HBox(children=(FloatProgress(value=0.0, description='One-hot encoding sequences', max=149347.0, style=Progress…


SeqData object modified:
	ohe_seqs: None -> 149347 ohe_seqs added


In [30]:
# Adds a `train_test` column to seqs_annot, used below as a boolean mask.
# NOTE(review): presumably sequences on chr2 are the held-out test set — confirm
# against the eugene docs.
pp.train_test_split_sdata(sdata, train_key="train_test", chr="chr2")

SeqData object modified:
    seqs_annot:
        + train_test


In [31]:
# Split on the `train_test` flag: flagged rows form the training set, the rest the test set.
# NOTE(review): the saved output for this cell comes from an earlier run (its counts
# exceed the n_obs shown above) — restart and run all before trusting the numbers.
is_train = sdata["train_test"]
train_sdata, test_sdata = sdata[is_train], sdata[~is_train]
train_sdata.n_obs, test_sdata.n_obs

(170664, 14888)

In [32]:
# Adds a `train_val` column to the training subset's seqs_annot (see the cell output).
# NOTE(review): presumably sequences on chr3 become the validation set — confirm
# against the eugene docs.
pp.train_test_split_sdata(train_sdata, train_key="train_val", chr="chr3")

SeqData object modified:
    seqs_annot:
        + train_val


In [33]:
# Persist each split as <dataset_name>_<split>.h5sd inside output_dir.
for split_label, split_sdata in (("train", train_sdata), ("test", test_sdata)):
    split_path = os.path.join(output_dir, dataset_name + "_" + split_label + ".h5sd")
    split_sdata.write_h5sd(split_path)

## Script

In [57]:
%%bash
# Run the standalone build_seqdata.py script for one dataset.

# Stop if the environment cannot be activated, so the script never runs under
# the wrong interpreter.
source activate eugene_dev || { echo "could not activate eugene_dev" >&2; exit 1; }
# Strict mode is enabled only after activation, because activation scripts may
# reference unset variables.
set -eu

script=/cellar/users/aklie/projects/ML4GLand/collabs/er_stress_regulation/scripts/build_seqdata.py
dataset_name=beta_cells_all_peaks
data_dir=/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/$dataset_name
output_dir=/cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/$dataset_name

# Hold the command in an array so every argument stays a single word even if a
# path ever contains spaces or glob characters.
cmd=(python "$script"
    --dataset_name "$dataset_name"
    --data_dir "$data_dir"
    --output_dir "$output_dir")
echo "${cmd[*]}"
"${cmd[@]}"

python /cellar/users/aklie/projects/ML4GLand/collabs/er_stress_regulation/scripts/build_seqdata.py --dataset_name beta_cells_all_peaks --data_dir /cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/beta_cells_all_peaks --output_dir /cellar/users/aklie/data/ml4gland/collabs/er_stress_regulation/beta_cells_all_peaks
Loading targets, names, and seqs from npy files
Preprocessing
SeqData object modified:
	seqs: ['NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccaaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaaccctaaaccctaaccctaaccctaaccctaaccctaaccccaaccccaaccccaaccccaaccccaaccccaaccctaacccctaaccctaaccctaaccct'
 'taaatccgaacctgaacccgaaccctaaccataacccaaacccgaacccaaaccctaacccctaacccctaac

One-hot encoding sequences: 100%|██████████| 134632/134632 [00:21<00:00, 6399.67it/s]


---

# Scratch