# Process data for training CATLAS binary ATAC-seq model

## Set up W&B

In [1]:
import os
import urllib.request

import anndata
import bioframe as bf
import numpy as np
import pandas as pd
import wandb
from grelu.data.preprocess import filter_blacklist, filter_chromosomes
from grelu.sequence.utils import resize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Authenticate against the hosted W&B API; the recorded run displays True
# because a login was already active.
wandb.login(host="https://api.wandb.ai")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mavantikalal[0m ([33mgrelu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Name of the W&B project that the run below is created in.
project_name = "human-atac-catlas"

In [4]:
# Pass this notebook's relative and absolute path to W&B through Settings.
notebook_settings = wandb.Settings(
    program_relpath='1_data.ipynb',
    program_abspath='/code/github/gReLU-applications/catlas/1_data.ipynb',
)

# Start the preprocessing run; `run` is used later to fetch and log artifacts.
run = wandb.init(
    entity='grelu',
    project=project_name,
    job_type='preprocessing',
    name='prep',
    settings=notebook_settings,
)

## Load CATLAS snATAC-seq matrix

In [5]:
# Fetch the CATLAS cCRE-by-cell-type matrix into the working directory if it
# is not already there. The guard makes the cell safe to re-run and lets the
# notebook run top to bottom from a fresh checkout.
MATRIX_URL = 'http://catlas.org/catlas_downloads/humantissues/cCRE_by_cell_type/matrix.tsv.gz'
MATRIX_PATH = 'matrix.tsv.gz'
if not os.path.exists(MATRIX_PATH):
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that passes the existence check.
    partial_path = MATRIX_PATH + '.part'
    urllib.request.urlretrieve(MATRIX_URL, partial_path)
    os.replace(partial_path, MATRIX_PATH)

In [6]:
# Read the matrix and transpose it so that rows are cell types and columns are
# cCREs (the recorded shape is 222 x 1154611).
ad = anndata.read_mtx('matrix.tsv.gz').T

# obs: one row per cell type, with the cell-type name also used as the index.
celltypes = pd.read_table(
    'http://catlas.org/catlas_downloads/humantissues/cCRE_by_cell_type/celltypes.txt.gz',
    header=None,
    names=['cell type'],
)
ad.obs = celltypes
ad.obs_names = ad.obs['cell type']

# var: one row per cCRE. The file's own header row is replaced by the names
# below, an interval width is added, and the index is converted to strings.
cre_columns = ['chrom', 'start', 'end', 'cre_class', 'in_fetal', 'in_adult', 'cre_module']
var = pd.read_table('http://catlas.org/catlas_downloads/humantissues/cCRE_hg38.tsv.gz')
var.columns = cre_columns
var['width'] = (var['end'] - var['start']).astype(int)
var.index = var.index.astype(str)
ad.var = var

print(ad.shape)



(222, 1154611)


In [7]:
# Print the first rows of the cCRE table (var), then the cell-type table (obs).
for annotation in (ad.var, ad.obs):
    print(annotation.head())

  chrom   start     end          cre_class in_fetal in_adult  cre_module  \
0  chr1    9955   10355  Promoter Proximal      yes      yes         146   
1  chr1   29163   29563           Promoter      yes      yes          37   
2  chr1   79215   79615             Distal       no      yes          75   
3  chr1  102755  103155             Distal       no      yes          51   
4  chr1  115530  115930             Distal      yes       no          36   

   width  
0    400  
1    400  
2    400  
3    400  
4    400  
                                   cell type
cell type                                   
Follicular                        Follicular
Fibro General                  Fibro General
Acinar                                Acinar
T Lymphocyte 1 (CD8+)  T Lymphocyte 1 (CD8+)
T lymphocyte 2 (CD4+)  T lymphocyte 2 (CD4+)


## Filter peaks

In [8]:
# Restrict the cCREs to autosomes. In the recorded run this keeps 1,121,319 of
# the 1,154,611 intervals.
ad = filter_chromosomes(ad, 'autosomes')

Keeping 1121319 intervals


In [9]:
# Apply grelu's blacklist filter for hg38. In the recorded run the count stays
# at 1,121,319 intervals, i.e. nothing is removed by this step.
# NOTE(review): presumably drops intervals overlapping the hg38 blacklist
# regions - confirm against the grelu documentation.
ad = filter_blacklist(ad, genome='hg38')

Keeping 1121319 intervals


In [10]:
# Drop cell types (rows) whose mean over all cCREs is too low. The filter acts
# on axis 0, so the number of peaks is unchanged; in the recorded run the
# number of cell types goes from 222 to 204.
# NOTE(review): assumes ad.X holds binary accessibility calls, so the row mean
# is the fraction of accessible cCREs in that cell type - confirm.
MIN_ACCESSIBLE_FRACTION = .03

print(ad.shape)
# The row mean can come back as a 2-D (n_obs, 1) matrix when X is sparse;
# flatten it so the mask is a plain 1-D boolean array.
keep_celltypes = np.asarray(ad.X.mean(axis=1)).ravel() > MIN_ACCESSIBLE_FRACTION
# Copy so that `ad` is a standalone object instead of a view of the unfiltered
# data (later cells assign to ad.var).
ad = ad[keep_celltypes, :].copy()
print(ad.shape)

(222, 1121319)
(204, 1121319)


## Resize peaks

In [11]:
# Resize every cCRE interval to seq_len bp. In the preview, the 400-bp
# intervals are trimmed to their central 200 bp.
# Note: the `width` column is not recomputed here, so it still reports the
# pre-resize width (400) rather than seq_len.
seq_len = 200
resized_var = resize(ad.var, seq_len)
ad.var = resized_var
ad.var.head(3)

Unnamed: 0,chrom,start,end,cre_class,in_fetal,in_adult,cre_module,width
0,chr1,10055,10255,Promoter Proximal,yes,yes,146,400
1,chr1,29263,29463,Promoter,yes,yes,37,400
2,chr1,79315,79515,Distal,no,yes,75,400


## Load Enformer splits

In [12]:
# Download the Enformer interval table from W&B and read it. Each row is a
# genomic interval (chrom, start, end) with a `split` label.
# Note: ':latest' resolves to whichever artifact version is newest at run time.
artifact = run.use_artifact('enformer/human_intervals:latest')
# Named `artifact_dir` rather than `dir` so the builtin dir() is not shadowed
# for the rest of the kernel session.
artifact_dir = artifact.download()
enformer_intervals = pd.read_table(os.path.join(artifact_dir, "data.tsv"))
enformer_intervals.head(3)

[34m[1mwandb[0m:   1 of 1 files downloaded.  


Unnamed: 0,chrom,start,end,split
0,chr18,895618,1092226,train
1,chr4,113598179,113794787,train
2,chr11,18394952,18591560,train


## Split peaks based on their overlap with Enformer intervals

In [13]:
# Replace the var index with a fresh 0..n-1 index; the old labels are discarded.
reindexed_var = ad.var.reset_index(drop=True)
ad.var = reindexed_var

In [14]:
# Record each cCRE's row position in a column; the overlap step groups on it.
ad.var['cre_idx'] = range(ad.n_vars)

In [15]:
# Left-join every cCRE against the Enformer intervals. Each cCRE appears once
# per overlapping interval, or once with missing values if nothing overlaps.
# The Enformer `split` column shows up as `split_` in the joined table.
overlap_pairs = bf.overlap(ad.var, enformer_intervals, how='left')
# cCREs without any overlapping interval get the literal label 'None'.
# Bracket assignment is used because attribute-style assignment only works
# when the column already exists and otherwise silently sets an attribute.
overlap_pairs['split_'] = overlap_pairs['split_'].fillna('None')

# Collapse to one label per cCRE: the sorted unique split names concatenated,
# e.g. 'testtrain' for a cCRE that overlaps both a test and a train interval.
overlaps = overlap_pairs.groupby('cre_idx')['split_'].apply(
    lambda labels: ''.join(np.unique(labels))
)
overlaps.value_counts()

split_
train         966953
test           72507
valid          71703
None            9670
testtrain        222
trainvalid       169
testvalid         95
Name: count, dtype: int64

In [16]:
# Sanity check: the per-cCRE labels are in the same order as the rows of ad.var.
same_order = np.all(overlaps.index == ad.var['cre_idx'])
assert same_order

In [17]:
# Turn the concatenated overlap labels into a single split per cCRE:
#   - anything touching a train interval, or no interval at all -> 'train'
#   - touches valid (possibly also test) but not train          -> 'valid'
#   - touches only test                                         -> 'test'
touches_train = np.array(['train' in label for label in overlaps], dtype=bool)
touches_valid = np.array(['valid' in label for label in overlaps], dtype=bool)
touches_test = np.array(['test' in label for label in overlaps], dtype=bool)

new_splits = np.full(len(overlaps), 'train')
new_splits[touches_valid & ~touches_train] = 'valid'
new_splits[touches_test & ~touches_train & ~touches_valid] = 'test'
pd.Series(new_splits).value_counts()

train    977014
test      72507
valid     71798
Name: count, dtype: int64

In [18]:
# Store the raw Enformer overlap label and the final split for each cCRE.
# Both are assigned by position. `overlaps` is indexed by integer cre_idx, so
# assigning the Series itself would rely on label alignment with ad.var's
# index and would silently give all-NaN if that index held strings. The order
# check in the earlier assert cell is what justifies positional assignment.
ad.var['enformer_split'] = overlaps.to_numpy()
ad.var['split'] = new_splits

## Save

In [19]:
# Write the filtered, resized and split-annotated AnnData to the working
# directory; the next cell uploads this file as a W&B artifact.
ad.write_h5ad('preprocessed.h5ad')

In [20]:
# Package the saved file as a W&B artifact named 'dataset' (stored inside the
# artifact as 'data.h5ad') and attach it to this run.
artifact = wandb.Artifact(name='dataset', type='dataset')
artifact.add_file('preprocessed.h5ad', name='data.h5ad')
run.log_artifact(artifact)

<Artifact dataset>

In [21]:
# Close the W&B run started at the top of the notebook.
run.finish()