In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
import infercnvpy as cnv
import matplotlib.pyplot as plt
# silence FutureWarnings so that Scanpy's cell output stays readable
import warnings; warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from pyensembl import EnsemblRelease

# Ensembl release 98 annotation (human reference genome GRCh38);
# queried by Ensembl gene ID in the CNV inference loop to obtain each gene's contig/start/end
data = EnsemblRelease(98)

In [None]:
# make output directories
import os

# `makedirs` creates "ST_out" and "ST_out/infercnv/" in one call; `exist_ok=True`
# makes the cell safe to re-run and avoids the check-then-create race of
# `os.path.exists` followed by `os.mkdir`
os.makedirs("ST_out/infercnv/", exist_ok=True)

In [None]:
# transparent figure backgrounds; saved figures use 180 dpi
sc.set_figure_params(transparent=True, dpi_save=180)
# directory scanpy writes figures to when a plotting call is given `save=...`
sc.settings.figdir = "ST_out/infercnv/"

---
### Read in key dataframe with sample information

In [None]:
# sample key indexed by its first column; the index values are used below as sample IDs
# in the `*_master.h5ad` file names, and the `CNV_group` / `patient_name` columns drive the loops
sample_key = pd.read_csv("../resources/ST/visium_sample_key.csv", index_col=0)

---
### Perform CNV inference by patient

In [None]:
# show the distinct CNV groups; the inference loop below runs once per group
sample_key.CNV_group.unique()

In [None]:
# Run InferCNV once per CNV group (samples sharing a `CNV_group` value in the sample key):
#   1. read and concatenate every sample's master AnnData
#   2. attach genomic coordinates to each gene and infer CNVs against reference spots
#   3. cluster and score the CNV profiles, save heatmaps / PCA plots
#   4. write per-sample CNV clusters, scores and CNV matrices to ST_out/infercnv/
# A ValueError raised anywhere while processing a group is reported and that group is skipped.
for pat in sample_key.CNV_group.unique():
    try:
        print("Starting on patient {}".format(pat))
        key_tmp = sample_key.loc[sample_key.CNV_group == pat, :].copy()
        sample_ids = list(key_tmp.index)
        patient_names = key_tmp.patient_name.unique()
        outs = []
        for s in sample_ids:
            a = sc.read("../data/ST/{}_master.h5ad".format(s))
            print("Read adata from ../data/ST/{}_master.h5ad".format(s))
            outs.append(a)
        # concatenate anndata objects; the `batch` column of the result holds the sample ID
        a_comb = outs[0].concatenate(
            outs[1:],
            join="outer",
            batch_categories=sample_ids,
            fill_value=0,
        )
        del a_comb.obsm

        # set up for InferCNV
        # keep only the first `gene_ids*` column and call it "gene_ids"
        gene_id_cols = a_comb.var.columns[a_comb.var.columns.str.startswith("gene_ids")]
        a_comb.var.drop(columns=gene_id_cols[1:], inplace=True)
        a_comb.var.rename(columns={gene_id_cols[0]: "gene_ids"}, inplace=True)
        # Use `pyensembl` to map ENSG IDs to genomic loci.
        # Iterate over the genes of the *combined* object (not the last sample read), so every
        # gene kept by the outer join gets a locus; unmapped genes keep the placeholder ("0", 0, 0).
        loci = []
        err_counter = 0
        for gene_id in a_comb.var["gene_ids"]:
            locus = ("0", 0, 0)
            if pd.isna(gene_id):
                # no Ensembl ID available for this gene
                err_counter += 1
            else:
                try:
                    gene = data.gene_by_id(gene_id)
                    locus = ("chr{}".format(gene.contig), gene.start, gene.end)
                except ValueError:
                    err_counter += 1
            loci.append(locus)
        # assign whole columns at once instead of three scalar `.loc` writes per gene
        a_comb.var["chromosome"] = [locus[0] for locus in loci]
        a_comb.var["start"] = [locus[1] for locus in loci]
        a_comb.var["end"] = [locus[2] for locus in loci]
        print("{} errors in ENSEMBL mapping".format(err_counter))

        # run InferCNV
        # provide stromal regions as "normal cells" (only the categories present in this group)
        present_annotations = set(a_comb.obs.pathology_annotation.unique())
        reference_cats = [
            cat for cat in ("normal_mucosa", "smooth_muscle") if cat in present_annotations
        ]
        cnv.tl.infercnv(
            a_comb,
            reference_key="pathology_annotation",
            reference_cat=reference_cats,
            window_size=200,
        )
        # determine clusters based on detected CNVs
        cnv.tl.pca(a_comb)
        cnv.pp.neighbors(a_comb, n_neighbors=int(np.sqrt(a_comb.n_obs)))
        cnv.tl.leiden(a_comb, resolution=1.0)
        # score detected CNVs in every spot
        cnv.tl.cnv_score(a_comb)
        # get rid of noisy normal clusters: every Leiden cluster containing at least one spot
        # with a CNV score at or below the 33rd percentile is relabeled "B"
        score_cutoff = a_comb.obs.cnv_score.quantile([.33]).values[0]
        m = {
            "B":
            list(a_comb.obs.loc[a_comb.obs.cnv_score <= score_cutoff, "cnv_leiden"].unique())
        }
        m2 = {v: k for k, vv in m.items() for v in vv}
        a_comb.obs.cnv_leiden = a_comb.obs.cnv_leiden.replace(m2)

        # plotting
        print("Plotting pathology_annotation CNV heatmap")

        # rename pathology_annotation categories and get proper colors
        # ("nan" is what missing annotations become after the cast to str)
        path_dict = {
            "carcinoma_edge":"carcinoma_border",
            "UNK":"carcinoma",
            "necrotic":"carcinoma",
            "adipose":"smooth_muscle",
            "nan":"smooth_muscle",
        }
        # replace on plain strings and assign the result back: an in-place replace on the
        # column accessor is not guaranteed to write through to `obs`
        a_comb.obs["pathology_annotation"] = (
            a_comb.obs["pathology_annotation"]
            .astype(str)
            .replace(path_dict)
            .astype("category")
        )
        path_colordict = dict(zip(
            ["adenoma","adenoma_border","carcinoma","carcinoma_border","lymphoid_follicle","normal_mucosa","smooth_muscle"],
            sns.color_palette("tab10",7).as_hex()
        ))
        a_comb.uns["pathology_annotation_colors"] = [path_colordict[x] for x in a_comb.obs["pathology_annotation"].cat.categories]

        # plot heatmap with tissue domains
        cnv.pl.chromosome_heatmap(
            a_comb,
            groupby="pathology_annotation",
            save="_pathology_annotation_{}.png".format(pat),
            dendrogram=True,
            figsize=(12,8),
        )
        if len(patient_names) == 1:
            if patient_names[0] == "HTA11_08622":
                # plot heatmap with WES ROIs
                print("Plotting CNV heatmap by block (HTA11_08622 synchronous polyps)")
                cnv.pl.chromosome_heatmap(
                    a_comb,
                    groupby="Block ID",
                    save="_block_{}.png".format(pat),
                    dendrogram=True,
                    figsize=(12,8),
                )
            else:
                if "LCM_ROI" in a_comb.obs.columns:
                    # plot heatmap with WES ROIs (spots without an ROI are left out)
                    print("Plotting LCM_ROI CNV heatmap")
                    a_comb.obs.LCM_ROI = a_comb.obs.LCM_ROI.astype(str)
                    cnv.pl.chromosome_heatmap(
                        a_comb[a_comb.obs.LCM_ROI != "nan", :],
                        groupby="LCM_ROI",
                        save="_LCM_ROI_{}.png".format(pat),
                        dendrogram=True,
                        figsize=(12,8),
                    )
            # plot heatmap with CNV Leiden clusters
            print("Plotting CNV heatmap with Leiden clusters")
            cnv.pl.chromosome_heatmap(
                a_comb,
                groupby="cnv_leiden",
                save="_{}.png".format(pat),
                dendrogram=True,
                figsize=(12,8),
            )
        elif len(patient_names) > 1:
            # plot heatmap with WES ROIs
            print("Plotting CNV heatmap by patient (for grouped samples)")
            cnv.pl.chromosome_heatmap(
                a_comb,
                groupby="Patient",
                save="_patient_{}.png".format(pat),
                dendrogram=True,
                figsize=(12,8),
            )
            for pat2 in patient_names:
                # plot heatmap with CNV Leiden clusters
                print("Plotting CNV heatmap with Leiden clusters for {}".format(pat2))
                cnv.pl.chromosome_heatmap(
                    a_comb[a_comb.obs.Patient == pat2, :],
                    groupby="cnv_leiden",
                    save="_{}_{}.png".format(pat, pat2),
                    dendrogram=True,
                    figsize=(12,8),
                )

        # rename MILWRM domain and ensure proper coloring
        #a_comb.obs.rename(columns={"VUMCrefNMF30_MILWRM_domain":"MILWRM Domain"}, inplace=True)
        a_comb.obs["MILWRM Domain"] = a_comb.obs["MILWRM Domain"].astype("category")
        mw_colordict = dict(zip(["D0","D1","D2","D3","D4","D5","D6","D7"], sns.color_palette("plasma",8).as_hex()))
        a_comb.uns["MILWRM Domain_colors"] = [mw_colordict[x] for x in a_comb.obs["MILWRM Domain"].cat.categories]

        # plot PCA of patient/group
        print("Plotting CNV PCA")
        sc.pl.embedding(
            a_comb,
            basis="X_cnv_pca",
            color=["pathology_annotation", "MILWRM Domain", "cnv_leiden"],
            save="_{}.png".format(pat),
            ncols=1,
            frameon=False,
        )

        a_comb.obs.cnv_leiden = a_comb.obs.cnv_leiden.astype("category")
        # save CNV cluster colors (dict of cluster label -> color, stored as a pickled object array)
        cnv_leiden_colordict = dict(zip(list(a_comb.obs.cnv_leiden.cat.categories), a_comb.uns["cnv_leiden_colors"]))
        np.save("ST_out/infercnv/cnv_leiden_colors_{}.npy".format(pat), cnv_leiden_colordict)

        # save CNV genome partitioning for patient to file
        np.save("ST_out/infercnv/uns_cnv_{}.npy".format(pat), a_comb.uns["cnv"])

        # plot on Visium scaffolds
        print("Plotting spatial overlays")
        for sample_id, out in zip(sample_ids, outs):
            # rows of the combined object that belong to this sample
            in_sample = a_comb.obs.batch == sample_id

            # write to csv for compilation (indexed by the sample's original obs names)
            tmp_obs = a_comb.obs.loc[in_sample, ["cnv_leiden","cnv_score"]].copy()
            tmp_obs.index = out.obs_names
            tmp_obs.to_csv("ST_out/infercnv/{}_cnv_leiden.csv".format(sample_id))

            # write CNV values to npz for compilation
            np.savez_compressed("ST_out/infercnv/{}_cnv.npz".format(sample_id), a_comb[in_sample, :].obsm["X_cnv"])

            # add cnv metadata to individual sample anndata
            out.obs[["cnv_leiden","cnv_score"]] = a_comb.obs.loc[in_sample, ["cnv_leiden","cnv_score"]].values
            out.obs.cnv_score = out.obs.cnv_score.astype(float)
            out.obs.cnv_leiden = out.obs.cnv_leiden.astype("category")

            # rename pathology_annotation categories and get proper colors;
            # going through object dtype keeps missing annotations missing, and assigning
            # back avoids relying on an in-place replace through the column accessor
            out.obs["pathology_annotation"] = (
                out.obs["pathology_annotation"]
                .astype(object)
                .replace(path_dict)
                .astype("category")
            )
            out.uns["pathology_annotation_colors"] = [path_colordict[x] for x in out.obs["pathology_annotation"].cat.categories]

            # transfer cnv_leiden colors to out
            out.uns["cnv_leiden_colors"] = [cnv_leiden_colordict[x] for x in out.obs.cnv_leiden.cat.categories]

            # plot spatial; the color scale is shared across the group's samples via `vmax`
            sc.pl.spatial(
                out,
                color=["cnv_score","cnv_leiden","pathology_annotation"],
                size=1.7,
                ncols=3,
                img_key=None,
                frameon=False,
                vmin=0.0,
                vmax=a_comb.obs.cnv_score.max(),
                cmap="viridis",
                save="_{}_{}_CNV.png".format(pat, sample_id),
            )
    except ValueError as e:
        # best-effort: report which group failed and continue with the next one
        print("Skipping {} after ValueError: {}".format(pat, e))

---
## Get `infercnv` outputs to add to master AnnDatas

In [None]:
# For every sample in the key: reload its master AnnData, replace any previous CNV
# columns with the InferCNV cluster/score table written by the inference loop, restore
# the cluster colors saved for the sample's CNV group, and overwrite the master file.
# NOTE(review): the `isin(...unique())` filter matches every patient_name present in the key,
# so it appears to select all samples — confirm whether a patient subset was intended.
for s in sample_key.loc[sample_key.patient_name.isin(sample_key.patient_name.unique()), :].index:
    print("Starting {}:".format(s), end="\n\t")
    a = sc.read("../data/ST/{}_master.h5ad".format(s))

    # drop old CNV column(s)
    stale_cols = [
        col for col in a.obs.columns
        if col.startswith(("cnv_leiden", "cnv_score")) or col in ("CNV Clone", "CNV Score")
    ]
    a.obs.drop(columns=stale_cols, inplace=True)

    # get InferCNV outputs
    # (named `cnv_obs`, not `cnv`, so the `infercnvpy as cnv` module import is not overwritten
    # and the inference cell can still be re-run afterwards)
    cnv_obs = pd.read_csv("ST_out/infercnv/{}_cnv_leiden.csv".format(s), index_col=0)
    print("Read InferCNV outputs from ST_out/infercnv/{}_cnv_leiden.csv".format(s))
    a.obs = a.obs.merge(cnv_obs, left_index=True, right_index=True)

    # loading uns dict back in from .npy file
    # cluster labels are cast to str first so they match the string keys of the saved color dict
    a.obs.cnv_leiden = a.obs.cnv_leiden.astype(str)
    a.obs.cnv_leiden = a.obs.cnv_leiden.astype("category")
    # the color dict was saved as a pickled object array, hence `allow_pickle=True`;
    # unpickling executes arbitrary code, so only load files produced by this notebook
    cnv_leiden_colordict = np.load(
        "ST_out/infercnv/cnv_leiden_colors_{}.npy".format(sample_key.loc[s, "CNV_group"]),
        allow_pickle=True,
    ).item()
    a.uns["cnv_leiden_colors"] = [cnv_leiden_colordict[x] for x in a.obs.cnv_leiden.cat.categories]

    # save to master anndata object
    print("\tSaving to ../data/ST/{}_master.h5ad".format(s), end="\n\n")
    a.write("../data/ST/{}_master.h5ad".format(s), compression="gzip")