In [9]:
import muon
import scanpy as sc
import zarr
import seqdata as sd
import xarray as xr
from anndata.experimental.pytorch import AnnLoader
import torch
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import tempfile
import anndata
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Optional, Union


def _dna_to_code(nt: str) -> int:
    if nt == "A":
        return 0
    elif nt == "C":
        return 1
    elif nt == "G":
        return 2
    elif nt == "T":
        return 3
    else:
        # scBasset does this
        return np.random.randint(0, 3)
    

def add_dna_sequence(
    adata: anndata.AnnData,
    seq_len: int = 1344,
    genome_name: str = "hg38",
    genome_dir: Optional[Union[str, Path]] = None,
    genome_provider: Optional[str] = None,
    install_genome: bool = True,
    chr_var_key: str = "chr",
    start_var_key: str = "start",
    end_var_key: str = "end",
    sequence_varm_key: str = "dna_sequence",
    code_varm_key: str = "dna_code",
) -> None:
    """Add DNA sequence to AnnData object.

    Uses genomepy under the hood to download the genome. For every feature a
    window of ``seq_len`` bases centered on the midpoint of
    ``[start, end]`` is fetched, upper-cased and stored one character per
    column; a second table holds the same window encoded with
    ``_dna_to_code``.

    Parameters
    ----------
    adata
        AnnData object with chromatin accessibility data
    seq_len
        Length of DNA sequence to extract around peak center.
        Defaults to value used in scBasset.
    genome_name
        Name of genome to use, installed with genomepy
    genome_dir
        Directory to install genome to, if not already installed.
        If None, a temporary directory is created and used.
    genome_provider
        Provider of genome, passed to genomepy
    install_genome
        Install the genome with genomepy. If False, `genome_provider` is not used,
        and a genome is loaded with `genomepy.Genome(genome_name, genomes_dir=genome_dir)`
    chr_var_key
        Key in `.var` for chromosome
    start_var_key
        Key in `.var` for start position
    end_var_key
        Key in `.var` for end position
    sequence_varm_key
        Key in `.varm` for added DNA sequence
    code_varm_key
        Key in `.varm` for added DNA sequence, encoded as integers

    Returns
    -------
    None

    Adds fields to `.varm`:
        sequence_varm_key: DNA sequence
        code_varm_key: DNA sequence, encoded as integers
    """
    # Imported lazily so genomepy is only required when this function is called.
    import genomepy

    if genome_dir is None:
        # The TemporaryDirectory object stays referenced by this local until
        # the function returns, so the directory exists while sequences are read.
        tempdir = tempfile.TemporaryDirectory()
        genome_dir = tempdir.name

    if install_genome:
        g = genomepy.install_genome(genome_name, genome_provider, genomes_dir=genome_dir)
    else:
        g = genomepy.Genome(genome_name, genomes_dir=genome_dir)

    chroms = adata.var[chr_var_key].unique()
    df = adata.var[[chr_var_key, start_var_key, end_var_key]]
    seq_dfs = []

    for chrom in chroms:
        chrom_df = df[df[chr_var_key] == chrom]
        # Center a fixed-length window on the midpoint of each region.
        block_mid = (chrom_df[start_var_key] + chrom_df[end_var_key]) // 2
        block_starts = block_mid - (seq_len // 2)
        block_ends = block_starts + seq_len
        seqs = []

        # The end passed to `get_seq` is `block_ends - 1`; presumably `get_seq`
        # treats the end coordinate as inclusive, giving `seq_len` bases — confirm
        # against the genomepy/pyfaidx documentation.
        for start, end in zip(block_starts, block_ends - 1):
            seq = str(g.get_seq(chrom, start, end)).upper()
            seqs.append(list(seq))

        assert len(seqs) == len(chrom_df)
        seq_dfs.append(pd.DataFrame(seqs, index=chrom_df.index))

    # Restore the original feature order after the per-chromosome processing.
    sequence_df = pd.concat(seq_dfs, axis=0).loc[adata.var_names]
    adata.varm[sequence_varm_key] = sequence_df

    # `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
    # `DataFrame.map`; fall back to `applymap` on older pandas versions.
    if hasattr(pd.DataFrame, "map"):
        code_df = sequence_df.map(_dna_to_code)
    else:
        code_df = sequence_df.applymap(_dna_to_code)
    adata.varm[code_varm_key] = code_df

In [2]:
# 10x Genomics 10k PBMC Multiome (Chromium X) filtered feature-barcode matrix.
url = "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/10k_PBMC_Multiome_nextgem_Chromium_X/10k_PBMC_Multiome_nextgem_Chromium_X_filtered_feature_bc_matrix.h5"
# Read the matrix from the local path; `backup_url` is presumably used to download
# the file when it is not already present — confirm against the muon/scanpy docs.
# NOTE(review): the local file name ends in `.h5mu` although it is read with the 10x HDF5 reader.
mdata = muon.read_10x_h5("data/multiome10k.h5mu", backup_url=url)

  0%|          | 0.00/159M [00:00<?, ?B/s]

  utils.warn_names_duplicates("var")


Added `interval` annotation for features from data/multiome10k.h5mu




In [3]:
# Work on the "atac" modality of the multiome object from here on.
adata = mdata.mod["atac"]

In [5]:
# Report the matrix size before and after dropping rarely-observed regions.
print(adata.shape)
# Threshold for keeping a region: 5% of the total number of cells.
min_cells = int(0.05 * adata.n_obs)
# `filter_genes` filters the regions of `adata` in place.
sc.pp.filter_genes(adata, min_cells=min_cells)
print(adata.shape)

(10970, 111743)
(10970, 37054)


In [6]:
# Inspect the per-region annotation table after filtering.
adata.var

Unnamed: 0,gene_ids,feature_types,genome,interval,n_cells
chr1:629395-630394,chr1:629395-630394,Peaks,GRCh38,chr1:629395-630394,1422
chr1:633578-634591,chr1:633578-634591,Peaks,GRCh38,chr1:633578-634591,4536
chr1:778283-779200,chr1:778283-779200,Peaks,GRCh38,chr1:778283-779200,5981
chr1:816873-817775,chr1:816873-817775,Peaks,GRCh38,chr1:816873-817775,564
chr1:827067-827949,chr1:827067-827949,Peaks,GRCh38,chr1:827067-827949,3150
...,...,...,...,...,...
GL000219.1:44739-45583,GL000219.1:44739-45583,Peaks,GRCh38,GL000219.1:44739-45583,781
GL000219.1:45726-46446,GL000219.1:45726-46446,Peaks,GRCh38,GL000219.1:45726-46446,639
GL000219.1:99267-100169,GL000219.1:99267-100169,Peaks,GRCh38,GL000219.1:99267-100169,6830
KI270726.1:41483-42332,KI270726.1:41483-42332,Peaks,GRCh38,KI270726.1:41483-42332,605


In [7]:
# Region IDs look like "chr1:629395-630394": peel off the chromosome name first,
# then break the remainder into its start and end coordinates.
split_interval = adata.var["gene_ids"].str.split(":", expand=True)
split_start_end = split_interval[1].str.split("-", expand=True)

# Store the parsed pieces as new `.var` columns; coordinates become integers.
adata.var["chr"] = split_interval[0]
adata.var["start"], adata.var["end"] = (
    split_start_end[col].astype(int) for col in (0, 1)
)
adata.var

Unnamed: 0,gene_ids,feature_types,genome,interval,n_cells,chr,start,end
chr1:629395-630394,chr1:629395-630394,Peaks,GRCh38,chr1:629395-630394,1422,chr1,629395,630394
chr1:633578-634591,chr1:633578-634591,Peaks,GRCh38,chr1:633578-634591,4536,chr1,633578,634591
chr1:778283-779200,chr1:778283-779200,Peaks,GRCh38,chr1:778283-779200,5981,chr1,778283,779200
chr1:816873-817775,chr1:816873-817775,Peaks,GRCh38,chr1:816873-817775,564,chr1,816873,817775
chr1:827067-827949,chr1:827067-827949,Peaks,GRCh38,chr1:827067-827949,3150,chr1,827067,827949
...,...,...,...,...,...,...,...,...
GL000219.1:44739-45583,GL000219.1:44739-45583,Peaks,GRCh38,GL000219.1:44739-45583,781,GL000219.1,44739,45583
GL000219.1:45726-46446,GL000219.1:45726-46446,Peaks,GRCh38,GL000219.1:45726-46446,639,GL000219.1,45726,46446
GL000219.1:99267-100169,GL000219.1:99267-100169,Peaks,GRCh38,GL000219.1:99267-100169,6830,GL000219.1,99267,100169
KI270726.1:41483-42332,KI270726.1:41483-42332,Peaks,GRCh38,KI270726.1:41483-42332,605,KI270726.1,41483,42332


In [8]:
# Filter out non-chromosomal regions: keep only features whose chromosome name
# starts with "chr" (names such as GL000219.1 or KI270726.1 are dropped).
mask = adata.var["chr"].str.startswith("chr")
# Rebind `adata` to a copy of the column subset selected by the mask.
adata = adata[:, mask].copy()

In [11]:
# Attach the DNA sequence around each region's center, and its integer encoding,
# to `adata.varm`. The genome named "GRCh38" is installed into / loaded from the
# local "data" directory (`install_genome` keeps its default of True).
add_dna_sequence(
    adata,
    genome_name="GRCh38",
    genome_dir="data",
    chr_var_key="chr",
    start_var_key="start",
    end_var_key="end",
)
adata

[32m08:47:48[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading assembly summaries from GENCODE
[32m08:48:18[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading assembly summaries from UCSC
[32m08:48:19[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from GENCODE. Target URL: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz...


Download:   0%|          | 0.00/938M [00:00<?, ?B/s]

[32m08:48:29[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m08:48:43[0m [1m|[0m [34mINFO[0m [1m|[0m name: hg38
[32m08:48:43[0m [1m|[0m [34mINFO[0m [1m|[0m local name: GRCh38
[32m08:48:43[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /cellar/users/aklie/data/GRCh38/GRCh38.fa


Filtering Fasta: 0.00 lines [00:00, ? lines/s]

AnnData object with n_obs × n_vars = 10970 × 37042
    var: 'gene_ids', 'feature_types', 'genome', 'interval', 'n_cells', 'chr', 'start', 'end'
    varm: 'dna_sequence', 'dna_code'

In [15]:
# Per-region nucleotide characters, one column per position in the extracted window.
adata.varm["dna_sequence"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1334,1335,1336,1337,1338,1339,1340,1341,1342,1343
chr1:629395-630394,C,A,C,T,C,T,C,C,C,C,...,C,T,A,T,A,T,C,T,A,A
chr1:633578-634591,G,A,A,A,T,A,G,G,G,C,...,T,A,A,A,T,C,C,C,C,T
chr1:778283-779200,C,G,C,C,C,G,G,C,T,A,...,G,A,C,A,G,G,A,G,T,T
chr1:816873-817775,A,A,T,T,C,A,T,A,T,G,...,T,T,A,G,C,G,G,C,T,G
chr1:827067-827949,C,T,C,T,C,C,T,G,C,C,...,C,G,T,T,A,T,T,A,A,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY:19077075-19078016,A,C,G,A,C,C,T,C,C,C,...,C,A,T,A,G,T,T,C,T,A
chrY:19567013-19567787,G,G,A,G,T,C,T,G,G,G,...,T,C,T,C,T,T,C,G,T,T
chrY:19744368-19745303,T,A,T,T,T,T,T,G,T,C,...,A,T,G,T,G,G,A,A,A,T
chrY:20575244-20576162,T,T,T,A,C,T,G,T,C,T,...,G,A,G,T,G,T,A,A,C,A


In [17]:
# The same windows encoded as integers (A=0, C=1, G=2, T=3).
adata.varm["dna_code"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1334,1335,1336,1337,1338,1339,1340,1341,1342,1343
chr1:629395-630394,1,0,1,3,1,3,1,1,1,1,...,1,3,0,3,0,3,1,3,0,0
chr1:633578-634591,2,0,0,0,3,0,2,2,2,1,...,3,0,0,0,3,1,1,1,1,3
chr1:778283-779200,1,2,1,1,1,2,2,1,3,0,...,2,0,1,0,2,2,0,2,3,3
chr1:816873-817775,0,0,3,3,1,0,3,0,3,2,...,3,3,0,2,1,2,2,1,3,2
chr1:827067-827949,1,3,1,3,1,1,3,2,1,1,...,1,2,3,3,0,3,3,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY:19077075-19078016,0,1,2,0,1,1,3,1,1,1,...,1,0,3,0,2,3,3,1,3,0
chrY:19567013-19567787,2,2,0,2,3,1,3,2,2,2,...,3,1,3,1,3,3,1,2,3,3
chrY:19744368-19745303,3,0,3,3,3,3,3,2,3,1,...,0,3,2,3,2,2,0,0,0,3
chrY:20575244-20576162,3,3,3,0,1,3,2,3,1,3,...,2,0,2,3,2,3,0,0,1,0


In [13]:
# Transpose so that regions become observations and cells become variables.
bdata = adata.transpose()
# Binarise accessibility: 1.0 where the count is positive, 0.0 otherwise.
# The comparison already allocates a new matrix, so `X` needs no explicit copy.
bdata.layers["binary"] = (bdata.X > 0).astype(float)

In [63]:
# NOTE(review): `adata` is rebound here to scanpy's pbmc3k dataset, replacing the
# ATAC object used in the cells above.
adata = sc.datasets.pbmc3k()
adata

  0%|          | 0.00/5.58M [00:00<?, ?B/s]

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'

In [71]:
# Per-cell total counts. When `X` is a SciPy sparse matrix, summing over axis 1
# yields an (n_obs, 1) `np.matrix`, so flatten to 1-D before storing it in `.obs`.
adata.obs['size_factors'] = np.asarray(adata.X.sum(axis=1)).ravel()

In [72]:
# True when PyTorch reports that CUDA is available.
use_cuda = torch.cuda.is_available()

In [74]:
# Loader over `adata` with shuffled batches of 128; `use_cuda` is forwarded to AnnLoader.
dataloader = AnnLoader(adata, batch_size=128, shuffle=True, use_cuda=use_cuda)

In [75]:
# The dataset object wrapped by the loader.
dataloader.dataset

AnnCollection object with n_obs × n_vars = 2700 × 32738
  constructed from 1 AnnData objects
    obs: 'size_factors'

In [76]:
# Slicing the dataset directly selects its first 10 observations.
dataloader.dataset[:10]

AnnCollectionView object with n_obs × n_vars = 10 × 32738
    obs: 'size_factors'

In [77]:
# Take the first 10 observations straight from the loader's dataset.
batch = dataloader.dataset[:10]

# Show which device the batch tensor lives on and its dtype.
print('X:', batch.X.device, batch.X.dtype)

X: cuda:0 torch.float32


In [83]:
# Dimensions of the batch tensor.
batch.X.shape

torch.Size([10, 32738])

In [55]:
# Persist `adata` as a Zarr store, then consolidate the store's metadata.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
adata.write_zarr("/cellar/users/aklie/data/datasets/pbmc3k/pbmc3k.zarr")
zarr.consolidate_metadata("/cellar/users/aklie/data/datasets/pbmc3k/pbmc3k.zarr")

  warn("ignoring keyword argument %r" % k)


<zarr.hierarchy.Group '/'>

In [56]:
# Re-open the store through its consolidated metadata.
zarr.open_consolidated("/cellar/users/aklie/data/datasets/pbmc3k/pbmc3k.zarr")

<zarr.hierarchy.Group '/'>

In [57]:
# Read the Zarr store back into an AnnData object. The notebook imports the
# package as `anndata` (there is no `ad` alias), so call it under that name.
anndata.read_zarr("/cellar/users/aklie/data/datasets/pbmc3k/pbmc3k.zarr")

AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain'
    var: 'n_cells'

In [58]:
# Interactive IPython help lookup for `xr.open_zarr` (exploration aid only).
xr.open_zarr?

[0;31mSignature:[0m
[0mxr[0m[0;34m.[0m[0mopen_zarr[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mstore[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msynchronizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchunks[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecode_cf[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask_and_scale[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecode_times[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconcat_characters[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecode_coords[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdrop_variables[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconsolidated[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m

In [59]:
# NOTE(review): when run, this call raised
# `TypeError: the JSON object must be str, bytes or bytearray, not dict`.
# The store was written by `adata.write_zarr`, so it is presumably not laid out
# as an xarray dataset — confirm before relying on this cell.
xr.open_zarr("/cellar/users/aklie/data/datasets/pbmc3k/pbmc3k.zarr")

TypeError: the JSON object must be str, bytes or bytearray, not dict

In [51]:
# NOTE(review): when run, this call raised
# `TypeError: the JSON object must be str, bytes or bytearray, not dict`,
# the same error as the `xr.open_zarr` attempt on this store — cause not confirmed.
sd.open_zarr("/cellar/users/aklie/data/datasets/pbmc3k/pbmc3k.zarr")

TypeError: the JSON object must be str, bytes or bytearray, not dict