# 

# Build the training and testing SeqData objects from the already processed AI-ATAC dataset
After downloading the data from GitHub, we can prepare it in this notebook for training. This is the dataset with ~300K peaks (327,927 in the files used here), each of length 251bp. The normalized counts for each cell type were derived as described in the [Immgen paper](https://www.sciencedirect.com/science/article/pii/S0092867418316507?via%3Dihub):

"To compute signal intensity in each OCR, reads mapped to the plus strand were shifted by +4 bp and reads mapped to the minus strand by −5 bp. Second, edges of fragments corresponding to paired reads were tested for OCR overlapping using BEDTools2.25.0 [bedtools intersect (Quinlan and Hall, 2010)]. A fragment edge in an OCR was counted unless the other edge of the fragment mapped to the same OCR in order to avoid counting non-independent Tn5 insertion events. A pseudo count of 0.1 was added to edge counts in peaks, log2-transformed and normalized by quantile normalization."

I believe these quantile-normalized, log2-transformed peak heights are our starting point.

# Set-up

In [52]:
import os
import json
import glob
import numpy as np
import seqpro as sp
import seqdata as sd
import pandas as pd
import xarray as xr

from eugene import preprocess as pp

In [64]:
# Input and output locations used throughout this notebook
genome_path = "/cellar/users/aklie/data/ref/genomes/mm10/mm10.fa"

# Everything dataset-specific lives under one analysis directory
analysis_dir = "/cellar/users/aklie/data/datasets/AI-ATAC/analysis/10Nov23"
github_dir = f"{analysis_dir}/github"
peaks_path = f"{github_dir}/ImmGenATAC1219.peak_matched.bed"
peak_height_path = f"{github_dir}/mouse_peak_heights.csv"
outdir_path = f"{analysis_dir}/seqdata/"

# Build SeqData

In [65]:
# Pull the sequence under each BED peak out of the genome FASTA; the result
# is written to (and overwrites) the zarr store at `raw_zarr_path`
raw_zarr_path = os.path.join(outdir_path, "ai-atac.zarr")
sdata = sd.read_genome_fasta(
    "seq",
    fasta=genome_path,
    bed=peaks_path,
    out=raw_zarr_path,
    alphabet=sp.alphabets.DNA,
    fixed_length=True,
    batch_size=10000,
    overwrite=True,
)

100%|██████████| 327927/327927 [00:06<00:00, 51463.33it/s]


In [66]:
# Read the peak heights used in the original paper; the first CSV column
# becomes the row index
peak_heights = pd.read_csv(peak_height_path, index_col=0)

# Row labels -> ids, table body -> vals
ids = peak_heights.index.values
vals = peak_heights.values

# Show both shapes as a quick sanity check
vals.shape, ids.shape

((327927, 81), (327927,))

In [67]:
# Attach the peak heights, their row ids and the column (cell type) names
# to the Dataset as new variables.
# NOTE(review): assumes the CSV rows are in the same order as the BED
# regions behind `sdata` — nothing here checks the alignment; confirm.
new_vars = (
    ("peak_height", vals, ["_sequence", "_celltypes"]),
    ("ids", ids, ["_sequence"]),
    ("cell_types", peak_heights.columns.values, ["_celltypes"]),
)
for var_name, var_data, var_dims in new_vars:
    sdata[var_name] = xr.DataArray(var_data, dims=var_dims)

In [68]:
# Upper-case every base before one-hot encoding
upper_seqs = np.char.upper(sdata["seq"])
sdata["seq"] = xr.DataArray(upper_seqs, dims=["_sequence", "_length"])

In [69]:
# Count how many b"N" characters are present in "seq"
is_n = sdata["seq"] == b"N"
is_n.sum().values

array(0)

In [70]:
# Display the Dataset so its variables and dimensions can be inspected
sdata

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 2.50 MiB 320.24 kiB Shape (327927,) (40991,) Dask graph 8 chunks in 2 graph layers Data type object numpy.ndarray",327927  1,

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 2.50 MiB 320.24 kiB Shape (327927,) (40991,) Dask graph 8 chunks in 2 graph layers Data type int64 numpy.ndarray",327927  1,

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 2.50 MiB 320.24 kiB Shape (327927,) (40991,) Dask graph 8 chunks in 2 graph layers Data type int64 numpy.ndarray",327927  1,

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 2.50 MiB 320.24 kiB Shape (327927,) (40991,) Dask graph 8 chunks in 2 graph layers Data type object numpy.ndarray",327927  1,

Unnamed: 0,Array,Chunk
Bytes,2.50 MiB,320.24 kiB
Shape,"(327927,)","(40991,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


# One-hot encode

In [71]:
# One-hot encode the sequences using the DNA alphabet.
# NOTE(review): the return value is discarded, so this presumably stores the
# encoding on `sdata` in place — confirm against the eugene API.
pp.ohe_seqs_sdata(sdata, alphabet="DNA")

In [72]:
# Write the processed Dataset to its own zarr store in the output directory
processed_zarr_path = os.path.join(outdir_path, "ai-atac_processed.zarr")
sd.to_zarr(sdata, processed_zarr_path)



# Perform train-test splitting

In [73]:
# Collect the split definition files (*.json) in sorted order.
# NOTE(review): the splits live under an mm39 directory while the sequences
# were extracted from mm10 — presumably fine if the splits only list
# chromosome names; confirm.
splitdir_path = "/cellar/users/aklie/data/ref/genomes/mm39/splits"
split_pattern = os.path.join(splitdir_path, "*.json")
split_paths = glob.glob(split_pattern)
split_paths.sort()
split_paths, len(split_paths)

(['/cellar/users/aklie/data/ref/genomes/mm39/splits/fold_0.json',
  '/cellar/users/aklie/data/ref/genomes/mm39/splits/fold_1.json',
  '/cellar/users/aklie/data/ref/genomes/mm39/splits/fold_2.json',
  '/cellar/users/aklie/data/ref/genomes/mm39/splits/fold_3.json',
  '/cellar/users/aklie/data/ref/genomes/mm39/splits/fold_4.json'],
 5)

In [77]:
# For every fold: annotate train/test membership by chromosome, carve the
# training set further into train/valid, and save train and test to disk.
for i, split_path in enumerate(split_paths):
    fold_name = f"fold_{i}"

    # Load the split definition; `with` closes the file handle right away
    with open(split_path, "r") as split_file:
        split_dict = json.load(split_file)

    # Work on a copy so fold annotations do not pile up on `sdata`
    sdata_cp = sdata.copy()

    # Annotate the copy with a `fold_{i}` variable for the current fold.
    # NOTE(review): relies on train_test_chrom_split writing `train_var`
    # onto the Dataset it is given (in place) — confirm against eugene.
    pp.train_test_chrom_split(
        sdata=sdata_cp,
        test_chroms=split_dict["test"],
        train_var=fold_name,
    )

    # Select from the annotated copy, not from `sdata`: the fold variable was
    # added to `sdata_cp` only, so indexing `sdata` would raise a KeyError
    # (or silently reuse a stale annotation left by an earlier run).
    in_train = sdata_cp[fold_name]
    # Elementwise comparisons on the array, hence `==` rather than `is`
    sdata_train = sdata_cp.sel(_sequence=in_train == True)  # noqa: E712
    sdata_test = sdata_cp.sel(_sequence=in_train == False)  # noqa: E712

    # Annotate the training portion with a train/valid flag
    pp.train_test_chrom_split(
        sdata=sdata_train,
        test_chroms=split_dict["valid"],
        train_var=f"{fold_name}_train",
    )

    # Create the fold directory; exist_ok makes re-runs safe
    fold_dir = os.path.join(outdir_path, fold_name)
    os.makedirs(fold_dir, exist_ok=True)

    # Save this fold's train (including the valid flag) and test sets
    sd.to_zarr(sdata_train, os.path.join(fold_dir, "ai-atac_train.zarr"))
    sd.to_zarr(sdata_test, os.path.join(fold_dir, "ai-atac_test.zarr"))



# DONE!

---