In [1]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
#from tqdm.notebook import tqdm

In [2]:
def summarize_h5ad(ad, label, use_raw=False):
    """Sum the data matrix over groups of cells (metacells).

    Parameters
    ----------
    ad
        AnnData-like object providing ``obs``, ``X``, ``var_names`` and, when
        ``use_raw`` is set, ``raw``. Note that inside this function the
        parameter shadows the module alias from ``import anndata as ad``; the
        name is kept so that existing keyword callers keep working.
    label
        Column of ``ad.obs`` holding the group (metacell) of every cell.
        Cells whose label is missing (NaN) belong to no group and are ignored
        instead of producing an all-zero group named ``'nan'``.
    use_raw
        Sum ``ad.raw.X`` (labelled with ``ad.raw.var_names``) instead of
        ``ad.X`` (labelled with ``ad.var_names``).

    Returns
    -------
    AnnData
        One observation per distinct non-missing label, in order of first
        appearance and with names cast to ``str``; ``X`` is a float64 CSR
        matrix holding the per-group column sums.

    Raises
    ------
    KeyError
        Propagated from ``ad.obs[label]`` when the column does not exist.
    ValueError
        If ``use_raw`` is set but ``ad.raw`` is ``None``.
    """
    if use_raw:
        if ad.raw is None:
            raise ValueError("use_raw=True but the AnnData object has no .raw")
        # .raw may hold a different feature set than ad.X, so the feature
        # names have to come from .raw as well.
        counts, var_names = ad.raw.X, ad.raw.var_names
    else:
        counts, var_names = ad.X, ad.var_names

    # codes[i] is the group index of cell i, or -1 when its label is missing.
    codes, metacells = pd.factorize(ad.obs[label])
    assigned = codes >= 0

    # Sparse (n_groups x n_cells) membership matrix: multiplying it with the
    # data matrix sums the rows of every group in a single product, without
    # allocating a dense groups x features table or slicing per group.
    membership = csr_matrix(
        (
            np.ones(int(assigned.sum()), dtype=np.float64),
            (codes[assigned], np.flatnonzero(assigned)),
        ),
        shape=(len(metacells), counts.shape[0]),
    )
    summed = csr_matrix(membership @ counts)
    summed.eliminate_zeros()

    # dtype is passed explicitly, exactly as the original call did
    # (presumably so that the float64 sums are not re-cast by the installed
    # anndata version -- confirm before dropping the argument).
    meta_ad = sc.AnnData(summed, dtype=summed.dtype)
    meta_ad.obs_names = pd.Index(metacells).astype(str)
    meta_ad.var_names = var_names
    return meta_ad

In [3]:
# Load the input AnnData (.h5ad) file; every later cell works on `adata`.
adata = sc.read('../../06_cre/data/01_load_CRE.h5ad')

In [5]:
# Load the metacell assignment table; the first CSV column becomes the index.
# Later cells use its 'mc' and 'SEACell' columns and, after reset_index(), a
# 'barcode' column -- so the index is presumably named 'barcode' (confirm).
meta = pd.read_csv('../data/metacell_v2.metacells.csv', index_col=0)

In [6]:
# Attach the 'mc' column of `meta` to the observations, matching on the index
# of both tables; observations missing from `meta` are kept (left join).
adata.obs = adata.obs.merge(
    meta['mc'],
    how='left',
    left_index=True,
    right_index=True,
)

In [7]:
# Collapse `adata` to one observation per value of obs['mc'] by summing
# adata.X; note that this rebinds `adata` to the summarised object.
adata = summarize_h5ad(adata, label='mc', use_raw=False)

In [8]:
# Compute scanpy's QC metrics and store them on `adata` (inplace=True); the
# feature filter in the next cell reads var['n_cells_by_counts'] from here.
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [9]:
# Keep only the features whose n_cells_by_counts (from calculate_qc_metrics)
# is at least 3. Slicing an AnnData yields a view of the parent object; the
# explicit .copy() materialises the subset, because later cells assign to
# adata.obs and should not modify (or implicitly copy) a view.
adata = adata[:, adata.var.n_cells_by_counts >= 3].copy()

In [10]:
# Move the index of `meta` back into an ordinary column.
meta = meta.reset_index()

In [11]:
# Discard the 'barcode' and 'SEACell' columns, then collapse the rows that
# have become identical.
meta = meta.drop(columns=['barcode', 'SEACell']).drop_duplicates()

In [12]:
# Index the annotation table by its 'mc' column so that it can be merged onto
# adata.obs by index in the next cell.
meta = meta.set_index('mc')

In [13]:
# Add the remaining `meta` columns to the observations, matching on the index
# of both tables; observations without a match are kept (left join).
adata.obs = adata.obs.merge(
    meta,
    how='left',
    left_index=True,
    right_index=True,
)

In [14]:
# Display the AnnData summary (dimensions plus obs/var column names).
adata

AnnData object with n_obs × n_vars = 3350 × 171507
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'Narrow_Celltype', 'Broad_Celltype'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [15]:
# Reorder the observations by name (plain sorted() order of the name strings).
# Indexing an AnnData yields a view; .copy() materialises it so that the
# following cells (which set adata.raw and run in-place preprocessing) work on
# a standalone object rather than on a view of the unsorted one.
adata = adata[sorted(adata.obs_names)].copy()

In [16]:
# Store the current state of `adata` in .raw before the in-place
# preprocessing below; a later cell restores it with adata.raw.to_adata().
adata.raw = adata

In [15]:
# Embedding pipeline; every call below updates `adata` in place, so the order
# of the statements matters.
# 1) Flag 2000 highly variable features with the 'seurat_v3' flavor. This runs
#    before normalisation, i.e. on the values as stored in adata.X at this
#    point (NOTE(review): scanpy documents seurat_v3 as expecting count data
#    -- confirm adata.X is unnormalised here).
sc.pp.highly_variable_genes(adata, n_top_genes=2000, 
                            flavor="seurat_v3")
# 2) Normalise per observation and log1p-transform (scanpy defaults).
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# 3) PCA with default settings, then a neighbor graph restricted to 20 PCs,
#    then the UMAP layout computed from that graph.
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=20)
sc.tl.umap(adata)



In [None]:
# Rebuild `adata` from .raw, i.e. from the state saved by `adata.raw = adata`
# before the normalisation / log1p steps were applied.
adata = adata.raw.to_adata()

In [42]:
# Save the metacell-level AnnData object to disk.
adata.write('../data/metacell_v2_cre.h5ad')

In [4]:
# Copy the first two columns of the stored UMAP embedding into obs so that
# they are exported together with the other per-metacell annotations.
adata.obs['umap_1'], adata.obs['umap_2'] = (
    adata.obsm['X_umap'][:, axis] for axis in (0, 1)
)

In [5]:
# Write the obs table as a tab-separated file (target name ends in .tsv.gz);
# the index is written as a column labelled 'metacell'.
adata.obs.to_csv('../data/metacell_v2_cre.tsv.gz', index=True, index_label='metacell', sep='\t')

In [11]:
# Export the current adata.X as a dense table with one row per variable and
# one column per observation; the row-label column is named 'CRE'.
df = pd.DataFrame(
    adata.X.T.toarray(),
    index=adata.var_names,
    columns=adata.obs_names,
)
df.to_csv('../data/metacell_v2_cre_sum.csv.gz', index_label='CRE', index=True)


In [3]:
# Normalise per observation and log1p-transform adata.X in place (scanpy
# defaults) ahead of the normalised export below.
# NOTE(review): the execution count of this cell (In [3]) suggests it was run
# in a fresh kernel; it presumably expects `adata` to hold the unnormalised
# summed matrix at this point -- confirm.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [18]:
# Export the current adata.X as a dense table with one row per variable and
# one column per observation; the row-label column is named 'CRE'.
df = pd.DataFrame(
    adata.X.T.toarray(),
    index=adata.var_names,
    columns=adata.obs_names,
)
df.to_csv('../data/metacell_v2_cre_norm.csv.gz', index_label='CRE', index=True)

In [11]:
# Export the PCA embedding with one row per observation. The PC column labels
# are generated from the actual width of X_pca instead of a hard-coded 50, so
# the export also works when the embedding holds a different number of
# components (a fixed list of 50 labels would make the DataFrame constructor
# fail on a shape mismatch).
pca = pd.DataFrame(
    adata.obsm['X_pca'],
    index=adata.obs_names,
    columns=[f'PC{i + 1}' for i in range(adata.obsm['X_pca'].shape[1])],
)
pca.to_csv('../data/metacell_v2_cre_pca.tsv.gz', sep='\t', index=True, index_label='metacell')