# Import Required Packages

In [None]:
import os
import numpy as np
import pandas as pd
import datetime
from pathlib import Path
import scanpy as sc
import re
from pprint import pprint

from natsort import natsorted

from copy import deepcopy

from scipy import stats, linalg

import matplotlib as mpl
import matplotlib.pyplot as plt
import cmocean
import seaborn as sns
from mpl_toolkits import mplot3d
%matplotlib inline  

# Set scanpy's logging verbosity to level 4.
# NOTE(review): 4 is presumably the most detailed level on scanpy's scale — confirm for the installed version.
sc.settings.verbosity = 4

# Read in Cell Ranger Counts Data

In [None]:
# Project data root and the sub-folder that holds the Cell Ranger runs.
fin_folder = Path("/n/groups/COVID/Finlay")
cell_folder = fin_folder.joinpath("cell_ranger_output")

# Recursively collect every path whose name ends in
# "filtered_feature_bc_matrix", in sorted order.
outs = sorted(cell_folder.glob("**/*filtered_feature_bc_matrix"))
outs

In [None]:
# Load every matrix folder found in `outs` into its own AnnData object.
new_adatas = []
# Plain iteration: `tqdm` is not imported in this notebook, and the per-sample
# print below already reports progress.
for f in outs:
    # Sample label = name of the folder that contains the matrix directory,
    # with dashes replaced by underscores.
    # NOTE(review): in a stock Cell Ranger layout
    # (<sample>/outs/filtered_feature_bc_matrix) that folder is "outs" for
    # every sample — confirm the directory layout gives distinct labels.
    s = f.parents[0].stem.replace("-", "_")
    print(s)
    _adata = sc.read_10x_mtx(f, cache=True)
    _adata.obs["orig_ident"] = s
    # Prefix each barcode with the sample label so barcodes from different
    # samples stay distinguishable after merging.
    _adata.obs_names = s + "_" + _adata.obs_names
    # Re-index genes by the 'gene_ids' column; the previous index is kept as a
    # regular column (named 'index' when the old index is unnamed).
    _adata.var = _adata.var.reset_index().set_index('gene_ids')
    new_adatas.append(_adata)

In [None]:
# Merge the per-sample AnnData objects into a single object, keeping the union
# of genes (outer join) and leaving the observation names untouched.
head_adata, tail_adatas = new_adatas[0], new_adatas[1:]
new_adata = head_adata.concatenate(tail_adatas, join="outer", index_unique=None)

# Sanity check: everything before the last "_" of each observation name must
# equal that observation's sample label.
name_prefixes = new_adata.obs_names.map(lambda name: name.rpartition("_")[0])
assert (name_prefixes == new_adata.obs.orig_ident).all()

new_adata.obs_names_make_unique()
print(new_adata)

In [None]:
#Can specify disease condition (ie COVID PASC, normosmic, etc) here
#For example:

# Keep only the part of each sample label that follows the first "COVID_".
new_adata.obs["orig_ident"] = new_adata.obs.orig_ident.apply(lambda s: re.split("COVID_", s)[1])

# Sample identifier -> condition label.
# NOTE(review): the keys must match the identifiers *after* the stripping
# above (a label "COVID_1" has become "1" at this point) — adjust the keys to
# the real sample names before running.
cond_mapping = {
    "COVID_1": "covid"}

new_adata.obs["cond"] = new_adata.obs.orig_ident.map(cond_mapping)
# Fail loudly, naming the offending samples, if any identifier has no condition.
unmapped = sorted(set(new_adata.obs.orig_ident[new_adata.obs.cond.isnull()]))
assert not unmapped, f"no entry in cond_mapping for: {unmapped}"

# The following cells work on the merged object (not on the list of
# per-sample objects).
adata = new_adata

# Per-group sample counts. Only group by columns that actually exist in obs
# ("patient" is not created anywhere above); kept as the last expression so
# the notebook displays the table.
group_cols = [c for c in ("cond", "patient") if c in adata.obs.columns]
adata.obs.groupby(group_cols).orig_ident.value_counts()

# Preprocess Data

In [None]:
#Rebuild one gene-name index from the per-batch name columns left in adata.var
#by concatenation (no genes are filtered here).

# Per-batch name columns look like "index", "index-0", "index-0-0", ...
# Order them by their numeric batch suffixes so earlier batches take priority.
name_cols = sorted(
    (c for c in adata.var.columns if re.fullmatch(r"index(?:-\d+)*", str(c))),
    key=lambda c: [int(part) for part in str(c).split("-")[1:]],
)
if not name_cols:
    raise KeyError(
        "adata.var has no per-batch gene-name column "
        "('index', 'index-0', 'index-0-0', ...)"
    )

# Start from the first batch's names; object dtype so values from other
# columns can be written in.
gene_names = adata.var[name_cols[0]].astype(object).copy()
gene_names.name = None
# Number of genes without a name in the first batch.
print(gene_names.isnull().sum())

# Fill the remaining gaps from each later batch in turn.
for col in name_cols[1:]:
    gene_null = gene_names.isnull()
    gene_names[gene_null] = adata.var[col][gene_null]
assert not gene_names.isnull().any()

# Replace var with a frame indexed by gene name; its only column holds the
# previous var_names.
adata.var = pd.DataFrame(adata.var_names, index=gene_names)
adata.var_names_make_unique()
print(adata)

In [None]:
#Run QC metrics, mainly for genes

# Flag genes by name prefix: "MT-" and "RPL"/"RPS".
adata.var["mito"] = adata.var_names.str.contains(r"^MT-")
adata.var["ribo"] = adata.var_names.str.contains(r"^RP[LS]")

# Per-gene reductions over axis 0 of X. np.asarray(...).ravel() yields a flat
# array whether the reduction returns an np.matrix or a plain ndarray.
adata.var["total_counts"] = np.asarray(adata.X.sum(0)).ravel()
adata.var["n_cells"] = np.asarray((adata.X > 0).sum(0)).ravel()
adata.var['mean_expr'] = np.asarray(adata.X.mean(0)).ravel()

In [None]:
# Per-cell QC metrics (reductions over axis 1 of X). np.asarray(...).ravel()
# yields a flat array whether the reduction returns an np.matrix or an ndarray.
adata.obs["total_counts"] = np.asarray(adata.X.sum(1)).ravel()
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])
adata.obs["n_genes"] = np.asarray((adata.X > 0).sum(1)).ravel()

# Select the flagged "mito" columns with a plain boolean array rather than a
# pandas Series, then express their summed counts as a percentage of the total.
mito_mask = adata.var["mito"].to_numpy()
mito_counts = np.asarray(adata.X[:, mito_mask].sum(1)).ravel()
adata.obs["pct_counts_mito"] = mito_counts / adata.obs["total_counts"] * 100

In [None]:
#Plot QC metrics
df = adata.obs

# One boxen plot per metric, samples on the x-axis coloured by condition.
# Each entry is (obs column to plot, whether to use a log y-axis).
for metric, use_log_scale in (("total_counts", True), ("pct_counts_mito", False)):
    fig, ax = plt.subplots(figsize=(12,8))
    sns.boxenplot(
        data=df,
        x="orig_ident",
        y=metric,
        hue="cond",
        order=np.unique(df.orig_ident),
        dodge=False,
        ax=ax,
    )
    plt.xticks(rotation=90)
    # Anchor the legend's upper-left corner at the axes' top-right corner.
    ax.legend(loc="upper left", bbox_to_anchor=(1,1))
    if use_log_scale:
        ax.set_yscale("log")
    sns.despine()

In [None]:
# Store two independent copies of the current matrix as layers: "counts" is
# left as copied, "norm" is normalised below.
for layer_name in ("counts", "norm"):
    adata.layers[layer_name] = adata.X.copy()

# Total-count normalisation of the "norm" layer with a target sum of 1e4.
sc.pp.normalize_total(adata, layer="norm", target_sum=1e4)

In [None]:
# Save the AnnData object to 'Concat_dataset.h5ad'; the path is relative, so
# the file is written to the current working directory.
adata.write('Concat_dataset.h5ad')