In [None]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import os, sys

sys.path.append('/code/decima/src/decima/')
import preprocess

from plotnine import *
%matplotlib inline

## Paths

In [None]:
# Root directory of this processing run; every input and output path hangs off it.
save_dir = "/gstore/data/resbioai/grelu/decima/20240823"

# Pseudobulk matrix consumed by this notebook.
matrix_file = os.path.join(
    save_dir,
    "processed_pseudobulks/combined_inner.h5ad",
)

## Load count matrix

In [None]:
# Read the combined pseudobulk matrix from disk into an AnnData object.
ad = anndata.read_h5ad(matrix_file)
# Report (n_obs, n_vars) as loaded, for comparison with the filtered shapes below.
print(ad.shape)

## Check NaNs

In [None]:
# For every variable (column of X): fraction of observations whose value is NaN.
ad.var["frac_nan"] = np.isnan(ad.X).mean(axis=0)

In [None]:
# Variables (columns of X) whose NaN fraction reaches this cutoff are discarded.
MAX_VAR_NAN_FRAC = .33

print(ad.shape)  # shape before filtering
# Slicing an AnnData returns a view of the parent object. The following cells
# write new obs columns and replace X, which would force anndata to copy the
# view implicitly (with an ImplicitModificationWarning). Taking the copy here
# makes that step explicit and releases the reference to the unfiltered parent.
ad = ad[:, ad.var.frac_nan < MAX_VAR_NAN_FRAC].copy()
print(ad.shape)  # shape after filtering

In [None]:
# For every observation (row of X): fraction of the remaining variables that are NaN.
ad.obs["frac_nan"] = np.isnan(ad.X).mean(axis=1)

In [None]:
# Observations (rows of X) whose NaN fraction reaches this cutoff are discarded.
MAX_OBS_NAN_FRAC = .25

print(ad.shape)  # shape before filtering
# Slicing an AnnData returns a view; the next cell assigns to ad.X, which would
# make anndata copy the view implicitly (with an ImplicitModificationWarning).
# Copy explicitly so `ad` is a standalone object from here on.
ad = ad[ad.obs.frac_nan < MAX_OBS_NAN_FRAC].copy()
print(ad.shape)  # shape after filtering

In [None]:
# Replace the NaNs that survived filtering with 0. np.nan_to_num also maps
# +/-inf to the largest/smallest finite value of the dtype (its default).
ad.X = np.nan_to_num(ad.X, nan=0.0)

## Aggregate

In [None]:
%%time
# Collapse the filtered matrix with the project helper preprocess.aggregate_anndata
# and rebind `ad` to its result.
# NOTE(review): the helper's grouping keys and reduction are not visible in this
# notebook; later cells read ad.obs.n_cells, which is presumably added here - confirm.
ad = preprocess.aggregate_anndata(ad)
# Shape after aggregation.
print(ad.shape)

In [None]:
# celltype_coarse is kept only for rows of the "skin_atlas" dataset; blank it elsewhere.
outside_skin_atlas = ad.obs["dataset"] != "skin_atlas"
ad.obs.loc[outside_skin_atlas, "celltype_coarse"] = None

## Calculate per-track statistics

In [None]:
# Row sums of X: total signal per track.
ad.obs["total_counts"] = ad.X.sum(axis=1)
# Number of variables with a strictly positive value in each track.
ad.obs["n_genes"] = (ad.X > 0).sum(axis=1)

## Drop extremely low quality tracks

In [None]:
## Print the low-end quantiles of genes, cells and total counts per track
qc_columns = ("n_genes", "n_cells", "total_counts")
low_quantiles = [.1, .2]

for col in qc_columns:
    print(col)
    # Evaluate both quantiles in one call, then print them one per line.
    quantile_values = np.quantile(ad.obs[col], low_quantiles)
    for quantile, value in zip(low_quantiles, quantile_values):
        print(quantile, value)
    print("")

In [None]:
# Lower bounds used to flag a track as low quality.
# NOTE(review): the gene and count cutoffs presumably come from the quantile
# printout in the previous cell - re-check them whenever the input data changes.
MIN_CELLS = 50
MIN_GENES = 7670
MIN_TOTAL_COUNTS = 76505

# A track is dropped only when it falls below ALL three bounds at once;
# failing one or two of them is not enough.
drop = (
    (ad.obs.n_cells < MIN_CELLS)
    & (ad.obs.n_genes < MIN_GENES)
    & (ad.obs.total_counts < MIN_TOTAL_COUNTS)
)
# Slicing yields a view; the normalization cells below add layers and modify X,
# which would make anndata copy the view implicitly (with an
# ImplicitModificationWarning). Copy explicitly instead.
ad = ad[~drop].copy()
ad.shape

## Normalize data

In [None]:
# Snapshot X before normalization so the un-normalized values stay available as a layer.
ad.layers['counts'] = ad.X.copy()

In [None]:
# Rescale every observation to a total of 1e6; this modifies ad.X in place
# (no return value is captured).
sc.pp.normalize_total(ad, target_sum=1e6)

In [None]:
# Snapshot the total-normalized X before the log transform is applied.
ad.layers['norm'] = ad.X.copy()

In [None]:
# Apply log(1 + x) to ad.X in place; from here on ad.X holds log-normalized values.
sc.pp.log1p(ad)

## Calculate reintroduced size factor

In [None]:
# Per-track sum of X. X has already been total-normalized and log1p-transformed,
# so this is a sum of log-normalized values, not of counts.
ad.obs["size_factor"] = ad.X.sum(axis=1)

In [None]:
# Density of the per-track size factor, drawn as a small 5x2 figure.
(
    ggplot(ad.obs, aes(x="size_factor"))
    + geom_density()
    + theme(figure_size=(5, 2))
)

## Add per-gene statistics

In [None]:
# Column means of X.
# NOTE(review): X is log-normalized at this point, so despite its name
# 'mean_counts' is a mean of log-normalized values, not of raw counts - confirm
# this is what downstream code expects.
ad.var["mean_counts"] = ad.X.mean(axis=0)
# Number of tracks in which each variable has a strictly positive value.
ad.var["n_tracks"] = (ad.X > 0).sum(axis=0)

## z-score

In [None]:
# Scale a copy of the object so ad.X itself is left untouched, and keep only the
# scaled matrix as a layer; the temporary AnnData is discarded immediately.
ad.layers["scaled"] = sc.pp.scale(ad, copy=True).X

## Count number of tuples

In [None]:
# Number of distinct values per metadata column (a missing value counts as one).
for col in ('dataset', 'study', 'cell_type', 'tissue', 'disease'):
    print(col)
    print(ad.obs[col].nunique(dropna=False))

In [None]:
# Distinct metadata combinations, using the first 4, 3 and then 2 keys.
combo_keys = ['cell_type', 'tissue', 'disease', 'study']
for n_keys in (4, 3, 2):
    print(ad.obs[combo_keys[:n_keys]].drop_duplicates().shape[0])

## Save

In [None]:
# Destination of the aggregated AnnData, under save_dir.
out_file = os.path.join(save_dir, "aggregated.h5ad")
# Writing and re-reading are currently disabled, so running this notebook does not
# touch out_file. Uncomment the first line to persist `ad`, or the second to reload it.
#ad.write_h5ad(out_file)
#ad = sc.read_h5ad(out_file)