In [2]:
!pip install --q anndata
!pip install --q scipy

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/122.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/122.4 kB[0m [31m696.3 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/122.4 kB[0m [31m768.2 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Core dependencies: NumPy for random test data, pandas for the annotation
# tables, anndata for the annotated-matrix container, and SciPy's CSR format
# for the sparse count matrix.
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
# Record the anndata version used for this run alongside the outputs.
print(ad.__version__)

0.10.7


In [4]:
# Seed NumPy's global generator before the first random draw so that a
# "Restart Kernel -> Run All" reproduces the same synthetic data. The global
# (legacy) seed is used on purpose: the later cells call `np.random.*`
# directly and therefore become deterministic as well.
SEED = 0
np.random.seed(SEED)

# Synthetic count matrix: 100 observations x 2000 variables, Poisson(1)
# distributed, stored as float32 in Compressed Sparse Row format.
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)

# Wrap the matrix in an AnnData object; it becomes `adata.X`.
adata = ad.AnnData(counts)
adata

AnnData object with n_obs × n_vars = 100 × 2000

In [10]:
# Inspect the data matrix held by the AnnData object; it is the sparse CSR
# matrix `counts` that was passed to the constructor.
adata.X

<100x2000 sparse matrix of type '<class 'numpy.float32'>'
	with 126638 stored elements in Compressed Sparse Row format>

In [11]:
# Build human-readable labels for both axes, then attach them:
# observations (rows) are cells, variables (columns) are genes.
cell_labels = [f"Cell_{i:d}" for i in range(adata.n_obs)]
gene_labels = [f"Gene_{i:d}" for i in range(adata.n_vars)]
adata.obs_names = cell_labels
adata.var_names = gene_labels

# Preview the first 15 labels of each axis index.
for axis_index in (adata.obs_names, adata.var_names):
    print(axis_index[:15])

Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',
       'Cell_7', 'Cell_8', 'Cell_9', 'Cell_10', 'Cell_11', 'Cell_12',
       'Cell_13', 'Cell_14'],
      dtype='object')
Index(['Gene_0', 'Gene_1', 'Gene_2', 'Gene_3', 'Gene_4', 'Gene_5', 'Gene_6',
       'Gene_7', 'Gene_8', 'Gene_9', 'Gene_10', 'Gene_11', 'Gene_12',
       'Gene_13', 'Gene_14'],
      dtype='object')


In [14]:
# Subset by label on both axes at once: first the observations (cells),
# then the variables (genes). The result is a view, not a copy.
cells_of_interest = ["Cell_1", "Cell_10"]
genes_of_interest = ["Gene_5", "Gene_1900", "Gene_32"]
adata[cells_of_interest, genes_of_interest]

View of AnnData object with n_obs × n_vars = 2 × 3

In [15]:
# Assign every observation a randomly drawn cell-type label.
cell_type_pool = ["B", "T", "Monocyte"]
ct = np.random.choice(cell_type_pool, size=adata.n_obs)

# Store the labels as a pandas Categorical, which is the preferred
# (more efficient) representation for this kind of column.
adata.obs["cell_type"] = pd.Categorical(ct)
adata.obs

Unnamed: 0,cell_type
Cell_0,Monocyte
Cell_1,Monocyte
Cell_2,T
Cell_3,Monocyte
Cell_4,T
...,...
Cell_95,B
Cell_96,B
Cell_97,B
Cell_98,B


In [16]:
# Boolean mask over observations: True where the annotated cell type is "B".
is_b_cell = adata.obs["cell_type"] == "B"

# Indexing with the mask keeps only the matching observations (as a view).
bdata = adata[is_b_cell]
bdata

View of AnnData object with n_obs × n_vars = 44 × 2000
    obs: 'cell_type'

In [17]:
# Multi-dimensional annotations: one row per observation in `obsm`,
# one row per variable in `varm`. Values are standard-normal placeholders.
n_cells, n_genes = adata.shape
adata.obsm["X_umap"] = np.random.normal(loc=0, scale=1, size=(n_cells, 2))
adata.varm["gene_stuff"] = np.random.normal(loc=0, scale=1, size=(n_genes, 5))
adata.obsm

AxisArrays with keys: X_umap

In [36]:
# Keep a log(1 + x) transformed copy of the data matrix as a named layer,
# leaving the original values in `adata.X` untouched.
log_counts = np.log1p(adata.X)
adata.layers["log_transformed"] = log_counts
adata

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'
    obsm: 'X_umap'
    varm: 'gene_stuff'
    layers: 'log_transformed'

In [42]:
# Export the "log_transformed" layer as a table labelled by cell and gene.
# NOTE(review): `bdata` is the B-cell view of `adata` created in an earlier
# cell, before this layer was added to `adata`; the saved output shows the
# view still exposes the layer, but this cell depends on that earlier state.
bdata.to_df(layer="log_transformed")

Unnamed: 0,Gene_0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_1990,Gene_1991,Gene_1992,Gene_1993,Gene_1994,Gene_1995,Gene_1996,Gene_1997,Gene_1998,Gene_1999
Cell_8,1.098612,0.0,1.098612,0.693147,0.693147,0.693147,0.0,1.098612,0.0,0.0,...,1.098612,0.693147,0.693147,0.693147,0.693147,1.386294,0.693147,0.693147,0.693147,0.0
Cell_10,0.0,0.693147,1.098612,1.098612,0.0,1.098612,0.0,0.0,1.386294,1.609438,...,0.0,0.0,0.693147,1.098612,0.0,0.693147,0.0,0.0,0.693147,0.0
Cell_11,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.693147,0.693147,0.693147,0.693147,...,0.0,0.0,0.693147,0.0,1.098612,1.098612,0.693147,0.0,0.0,0.693147
Cell_13,0.0,0.693147,0.693147,0.693147,0.693147,1.098612,0.693147,0.693147,0.0,0.693147,...,0.0,0.693147,0.693147,0.693147,1.098612,0.693147,0.693147,0.0,1.098612,0.0
Cell_14,1.098612,0.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.693147,1.386294,...,1.386294,0.0,0.0,0.0,0.0,1.098612,0.0,0.693147,0.0,0.0
Cell_16,0.693147,0.0,0.0,0.693147,0.693147,0.693147,0.0,0.693147,1.791759,1.098612,...,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147,0.0
Cell_22,0.693147,1.098612,0.0,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.0,0.693147,0.0,0.693147,0.693147,1.386294,0.693147,1.098612,0.693147
Cell_23,1.098612,0.0,0.693147,1.098612,0.693147,1.098612,0.0,0.0,0.0,0.693147,...,0.0,1.386294,0.693147,0.693147,0.0,0.693147,0.0,1.098612,0.693147,1.386294
Cell_29,1.386294,1.098612,0.693147,0.0,1.098612,1.098612,0.0,0.693147,0.693147,0.0,...,0.693147,0.0,0.693147,1.098612,0.0,1.386294,0.693147,0.693147,0.0,1.098612
Cell_32,1.609438,1.098612,0.0,0.693147,0.693147,0.693147,0.0,0.0,0.0,0.0,...,0.693147,0.0,1.098612,0.693147,0.0,0.0,1.098612,1.098612,0.693147,1.098612


In [43]:
# Persist the annotated matrix to disk as a gzip-compressed .h5ad file
# in the current working directory.
h5ad_path = 'my_results.h5ad'
adata.write(h5ad_path, compression="gzip")

In [None]:
!sudo apt install tree

In [53]:
# List the working directory recursively to confirm that my_results.h5ad
# was written next to the pre-existing sample_data folder.
!tree

[01;34m.[0m
├── [00mmy_results.h5ad[0m
└── [01;34msample_data[0m
    ├── [01;32manscombe.json[0m
    ├── [00mcalifornia_housing_test.csv[0m
    ├── [00mcalifornia_housing_train.csv[0m
    ├── [00mmnist_test.csv[0m
    ├── [00mmnist_train_small.csv[0m
    └── [01;32mREADME.md[0m

1 directory, 7 files


In [None]:
!sudo apt-get install hdf5-tools

In [60]:
# Show the top-level HDF5 groups in the saved file; each one corresponds to
# an AnnData slot (X, layers, obs, obsm, obsp, uns, var, varm, varp).
!h5ls 'my_results.h5ad'

X                        Group
layers                   Group
obs                      Group
obsm                     Group
obsp                     Group
uns                      Group
var                      Group
varm                     Group
varp                     Group


In [63]:
# Candidate values for each synthetic per-observation metadata column.
# Dict order matters: columns are drawn (and appear) in this order.
metadata_pools = {
    'time_yr': [0, 2, 4, 8],
    'subject_id': ['subject 1', 'subject 2', 'subject 4', 'subject 8'],
    'instrument_type': ['type a', 'type b'],
    'site': ['site x', 'site y'],
}

# Draw one random value per observation for every column, and index the
# frame with the same observation IDs that `adata` already uses.
obs_meta = pd.DataFrame(
    {
        column: np.random.choice(pool, adata.n_obs)
        for column, pool in metadata_pools.items()
    },
    index=adata.obs.index,
)
obs_meta

Unnamed: 0,time_yr,subject_id,instrument_type,site
Cell_0,0,subject 1,type a,site x
Cell_1,8,subject 8,type b,site x
Cell_2,4,subject 4,type a,site y
Cell_3,8,subject 1,type a,site y
Cell_4,2,subject 8,type b,site x
...,...,...,...,...
Cell_95,0,subject 4,type a,site y
Cell_96,2,subject 4,type b,site y
Cell_97,4,subject 8,type a,site y
Cell_98,0,subject 2,type b,site y


In [65]:
# Rebuild the AnnData object around the same data matrix and variable
# annotations, but with the new observation metadata table. Only X, obs and
# var are passed on, so the earlier 'cell_type' column, obsm/varm entries and
# layers are not carried over to the new object.
# NOTE(review): the saved output reports n_vars = 2001 although the matrix was
# created with 2000 columns - re-run from a fresh kernel to confirm.
expression_matrix = adata.X
gene_annotations = adata.var
adata = ad.AnnData(X=expression_matrix, obs=obs_meta, var=gene_annotations)
adata

AnnData object with n_obs × n_vars = 100 × 2001
    obs: 'time_yr', 'subject_id', 'instrument_type', 'site'