In [None]:
import os
import subprocess

import numpy as np
import scanpy as sc
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
def is_outlier(adata, metric: str, upper: float, lower: float):
    """Flag observations whose ``metric`` lies outside a percentile band.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that can be indexed by column name.
    metric
        Name of the ``adata.obs`` column to evaluate.
    upper
        Upper percentile (0-100); values strictly above it are flagged.
    lower
        Lower percentile (0-100); values strictly below it are flagged.

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``; ``True`` marks an outlier.
    """
    values = adata.obs[metric]
    # A single percentile call yields both cut-offs. Fractional percentiles
    # such as 2.5 / 97.5 are valid inputs, hence the ``float`` annotations.
    lower_bound, upper_bound = np.percentile(values, [lower, upper])
    return (values < lower_bound) | (values > upper_bound)


## QC 

We graph various aspects of the data, such as % mt, counts, and UMIs, and filter out cells that are low quality or otherwise impact the analysis. Note that doublet detection is done late.

In [None]:
# Read the input AnnData object from disk.
# NOTE(review): the file name suggests doublet annotation already ran upstream — confirm.
adata = sc.read_h5ad("doublet_filtered.h5ad")


In [None]:
# Violin plots of the number of genes, the number of counts and the pct mt per cell.
# NOTE: n_genes_by_counts is the number of genes with at least one count in that cell.
# multi_panel=True so that each metric gets its own y axis.
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Total counts vs. genes detected, coloured by pct_counts_mt.
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Total counts vs. genes detected, coloured by the cellbender-computed
# cell probabilities.
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="cell_probability",
)

In [None]:
# Total counts vs. genes detected, coloured by the doublet scores.
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="vaeda_scores",
)

In [None]:
# Total counts vs. genes detected, coloured by the doublet calls
# (the vaeda_calls column, as opposed to the scores plotted in the previous cell).
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="vaeda_calls",
)

In [None]:
# Keep only the cells called "singlet". Subsetting an AnnData returns a view,
# and later cells add columns to adata.obs; taking an explicit copy here avoids
# the implicit copy (and ImplicitModificationWarning) that writing to a view
# would otherwise trigger on that first assignment.
adata = adata[adata.obs["vaeda_calls"] == "singlet"].copy()

In [None]:
# Percentiles to overlay on the histograms; the next two histogram cells read
# this list as well.
percentiles = [1, 2.5, 5, 10, 95, 97.5, 99]

# Histogram + KDE of log1p total counts, with one dashed red line per percentile.
log_total_counts = adata.obs["log1p_total_counts"]
p1 = sns.displot(log_total_counts, bins=100, kde=True)
for p, cutoff in zip(percentiles, np.percentile(log_total_counts, percentiles)):
    plt.axvline(cutoff, color="r", linestyle="--", label=f'{p}th Percentile')


In [None]:
# Histogram + KDE of log1p genes-per-cell, with one dashed red line per entry
# of the `percentiles` list defined in an earlier cell.
log_n_genes = adata.obs["log1p_n_genes_by_counts"]
p2 = sns.displot(log_n_genes, bins=100, kde=True)
for p, cutoff in zip(percentiles, np.percentile(log_n_genes, percentiles)):
    plt.axvline(cutoff, color="r", linestyle="--", label=f'{p}th Percentile')

In [None]:
# Same overlay on the untransformed genes-per-cell values, again using the
# `percentiles` list defined in an earlier cell.
n_genes = adata.obs["n_genes_by_counts"]
p3 = sns.displot(n_genes, bins=100, kde=True)
for p, cutoff in zip(percentiles, np.percentile(n_genes, percentiles)):
    plt.axvline(cutoff, color="r", linestyle="--", label=f'{p}th Percentile')

In [None]:
# Percentiles for the mitochondrial-fraction plot. These get their own name
# instead of rebinding `percentiles`: the earlier histogram cells read that
# list, so overwriting it here would change what they draw if they were re-run
# after this cell.
mt_percentiles = [50, 90, 95, 97.5, 99]

# Histogram + KDE of pct_counts_mt, with one dashed red line per percentile.
pct_mt = adata.obs["pct_counts_mt"]
p4 = sns.displot(pct_mt, bins=100, kde=True)
for p, cutoff in zip(mt_percentiles, np.percentile(pct_mt, mt_percentiles)):
    plt.axvline(cutoff, color="r", linestyle="--", label=f'{p}th Percentile')

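$\text{MAD}=\text{Median}(|X_i-\tilde{X}|)$, where $\tilde{X}$ is the median of $X$.

Note: the outlier thresholds in the cells below are percentile-based rather than MAD-based.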

In [None]:
# Percentile thresholds for outlier flagging (mt_outlier is read by the next cell).
counts_outlier_lower = 2.5
counts_outlier_upper = 97.5
n_genes_outlier_lower = 2.5
n_genes_outlier_upper = 97.5
mt_outlier = 90

# A cell is an outlier when either its log1p total counts or its log1p gene
# count falls outside the corresponding percentile band.
counts_flag = is_outlier(
    adata,
    "log1p_total_counts",
    upper=counts_outlier_upper,
    lower=counts_outlier_lower,
)
genes_flag = is_outlier(
    adata,
    "log1p_n_genes_by_counts",
    upper=n_genes_outlier_upper,
    lower=n_genes_outlier_lower,
)
adata.obs["outlier"] = counts_flag | genes_flag
adata.obs["outlier"].value_counts()

In [None]:
# Flag cells whose pct_counts_mt is above the mt_outlier-th percentile
# (mt_outlier is set in the previous cell).
mt_cutoff = np.percentile(adata.obs["pct_counts_mt"], mt_outlier)
adata.obs["mt_outlier"] = adata.obs["pct_counts_mt"] > mt_cutoff
adata.obs["mt_outlier"].value_counts()

In [None]:
print(f"Total number of cells: {adata.n_obs}")

# Keep the cells flagged by neither the counts/genes filter nor the mt filter,
# and copy so that `adata` is a standalone object rather than a view.
keep = ~(adata.obs["outlier"] | adata.obs["mt_outlier"])
adata = adata[keep].copy()

print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

In [None]:
# Re-draw total counts vs. genes detected (coloured by pct_counts_mt) on the
# filtered cells.
p1 = sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
from ipylab import JupyterFrontEnd

# Ask the Jupyter front end to save this notebook first, so that the HTML
# export writes properly.
app = JupyterFrontEnd()
app.commands.execute("docmanager:save")

In [None]:
# Persist the QC-filtered object.
adata.write_h5ad("qc_filtered.h5ad")

# Export the notebook to HTML. Passing an argument list avoids going through a
# shell, and check=True raises CalledProcessError when nbconvert fails instead
# of the failure surfacing only as a non-zero exit code.
subprocess.run(["jupyter", "nbconvert", "--to", "html", "QC.ipynb"], check=True)