In [1]:
import importlib
import os

import anndata
import grelu.data.preprocess
import grelu.sequence.utils
import numpy as np
import pandas as pd
import wandb

%matplotlib inline

## Set experiment parameters

In [2]:
# Name used both for the W&B project and for the local output directory.
project_name='human-atac-catlas'
# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(project_name, exist_ok=True)

In [3]:
# Log in to Weights & Biases at the given host; the call's return value is shown as the cell output.
wandb.login(host="https://api.wandb.ai")

[34m[1mwandb[0m: Currently logged in as: [33mavantikalal[0m ([33mgrelu[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Start a W&B run for this preprocessing step; `run` is used later to log
# the output artifact and to close the run.
run = wandb.init(
    project=project_name,
    entity='grelu',
    name='prep',
    job_type='preprocessing',
)

## Load data

In [5]:
!wget http://catlas.org/catlas_downloads/humantissues/cCRE_by_cell_type/matrix.tsv.gz
ad = anndata.read_mtx('matrix.tsv.gz').T

# Prepare ad.obs
ad.obs = pd.read_table('http://catlas.org/catlas_downloads/humantissues/cCRE_by_cell_type/celltypes.txt.gz', header=None, names=['cell type'])
ad.obs_names = ad.obs['cell type']

# Prepare ad.var
var = pd.read_table('http://catlas.org/catlas_downloads/humantissues/cCRE_hg38.tsv.gz')
var.columns = ['chrom', 'start', 'end', 'cre_class', 'in_fetal', 'in_adult', 'cre_module']
var["width"] = (var.end - var.start).astype(int)
var.index = var.index.astype(str)
ad.var = var

print(ad.shape)

--2024-05-27 17:25:47--  http://catlas.org/catlas_downloads/humantissues/cCRE_by_cell_type/matrix.tsv.gz
Resolving catlas.org (catlas.org)... 132.239.162.129
Connecting to catlas.org (catlas.org)|132.239.162.129|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38772708 (37M) [application/x-gzip]
Saving to: ‘matrix.tsv.gz’


2024-05-27 17:25:47 (72.2 MB/s) - ‘matrix.tsv.gz’ saved [38772708/38772708]

(222, 1154611)


In [6]:
# Preview the first rows of the per-cCRE annotation table.
ad.var.head()

Unnamed: 0,chrom,start,end,cre_class,in_fetal,in_adult,cre_module,width
0,chr1,9955,10355,Promoter Proximal,yes,yes,146,400
1,chr1,29163,29563,Promoter,yes,yes,37,400
2,chr1,79215,79615,Distal,no,yes,75,400
3,chr1,102755,103155,Distal,no,yes,51,400
4,chr1,115530,115930,Distal,yes,no,36,400


In [7]:
# Preview the first rows of the per-cell-type table.
ad.obs.head()

Unnamed: 0_level_0,cell type
cell type,Unnamed: 1_level_1
Follicular,Follicular
Fibro General,Fibro General
Acinar,Acinar
T Lymphocyte 1 (CD8+),T Lymphocyte 1 (CD8+)
T lymphocyte 2 (CD4+),T lymphocyte 2 (CD4+)


## Filter peaks and cell types

In [8]:
# Restrict the dataset to intervals on autosomes; grelu prints how many
# intervals are kept.
ad = grelu.data.preprocess.filter_chromosomes(ad, 'autosomes')

Keeping 1121319 intervals


In [9]:
# Apply grelu's blacklist filter for the hg38 genome; the function prints how many intervals are kept.
ad = grelu.data.preprocess.filter_blacklist(ad, genome='hg38')

  from .autonotebook import tqdm as notebook_tqdm


Keeping 1121319 intervals


In [10]:
# Keep only the rows (cell types) whose mean value across all peaks exceeds
# this cutoff. Shapes are printed before and after to show how many rows remain.
min_row_mean = .03
print(ad.shape)
# If ad.X is a scipy sparse matrix, .mean(axis=1) returns an (n_obs, 1)
# np.matrix; flatten it so the boolean mask is an ordinary 1-D array.
row_means = np.asarray(ad.X.mean(axis=1)).ravel()
# .copy() materializes the subset so that later assignments (such as to
# ad.var) act on a real object instead of a view.
ad = ad[row_means > min_row_mean, :].copy()
print(ad.shape)

(222, 1121319)
(204, 1121319)


In [11]:
# Target length for every peak interval.
seq_len = 200

# Resize each interval in ad.var to seq_len. In the recorded output the
# 400 bp intervals become 200 bp intervals around the same midpoint.
ad.var = grelu.sequence.utils.resize(ad.var, seq_len)
# NOTE(review): the `width` column was computed before resizing and is not
# updated here, so it still shows the original width (400 in the recorded
# output) - confirm whether it should be recomputed or renamed.
ad.var.head(3)

Unnamed: 0,chrom,start,end,cre_class,in_fetal,in_adult,cre_module,width
0,chr1,10055,10255,Promoter Proximal,yes,yes,146,400
1,chr1,29263,29463,Promoter,yes,yes,37,400
2,chr1,79315,79515,Distal,no,yes,75,400


## Save

In [12]:
# Write the processed AnnData object into the project directory as preprocessed.h5ad.
ad.write_h5ad(os.path.join(project_name, 'preprocessed.h5ad'))

In [13]:
# Attach the saved .h5ad file to a W&B artifact named 'dataset' and log it
# on the current run.
h5ad_path = os.path.join(project_name, 'preprocessed.h5ad')
artifact = wandb.Artifact(name='dataset', type='dataset')
artifact.add_file(h5ad_path)
run.log_artifact(artifact)

<Artifact dataset>

In [14]:
# Close the W&B run started earlier in this notebook.
run.finish()