In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import infercnvpy as cnv
import matplotlib.pyplot as plt
# make this notebook work better with Scanpy
import warnings; warnings.simplefilter(action='ignore', category=FutureWarning)
import seaborn as sns

In [None]:
# make output directories
import os

# This notebook writes its figures and CNV results to scRNA_out/infercnv/.
# makedirs also creates the parent "scRNA_out" directory, and exist_ok=True keeps
# the cell safe to re-run. The old guard tested for a different directory
# ("gene_signatures") than the one it created.
os.makedirs("scRNA_out/infercnv/", exist_ok=True)

In [None]:
# Scanpy figure defaults: transparent backgrounds, saved figures at 400 dpi
sc.set_figure_params(transparent=True, dpi_save=400)
# send saved figures to the infercnv output directory
sc.settings.figdir = "scRNA_out/infercnv/"

In [None]:
from pyensembl import EnsemblRelease

# release 98 uses human reference genome GRCh38
# `data` is the annotation handle queried with `gene_by_id` when mapping genes to loci below
data = EnsemblRelease(98)

---
# Read in data

In [None]:
# load the combined dataset, then show it as the cell output
a_comb = sc.read("../data/scRNA/VUMC_COMBINED.h5ad")
a_comb

---
# Perform CNV inference globally

In [None]:
# Annotate every gene with its genomic locus, looked up by gene ID through `pyensembl`.
# Each locus column starts with a placeholder ("0" / 0 / 0) that stays in place
# for genes whose lookup fails.
for locus_col, placeholder in (("chromosome", "0"), ("start", 0), ("end", 0)):
    a_comb.var[locus_col] = placeholder

err_counter = 0
for var_name in a_comb.var_names:
    try:
        locus = data.gene_by_id(a_comb.var.gene_id[var_name])
        a_comb.var.loc[var_name, "chromosome"] = "chr{}".format(locus.contig)
        a_comb.var.loc[var_name, "start"] = locus.start
        a_comb.var.loc[var_name, "end"] = locus.end
    except ValueError:
        # failed lookups are only counted, not reported individually
        err_counter = err_counter + 1
print("{} errors in ENSEMBL mapping".format(err_counter))

In [None]:
# keep only cells with a non-empty `Patient` label, as an independent copy
has_patient = a_comb.obs.Patient != ""
a = a_comb[has_patient, :].copy()
a

In [None]:
%%time
# run InferCNV
# provide stromal regions as "normal cells"
cnv.tl.infercnv(
    a,
    reference_key="Compartment",
    reference_cat=["Stroma"],
    window_size=200,
)
# determine clusters based on detected CNVs
cnv.tl.pca(a)
cnv.pp.neighbors(a, n_neighbors=int(np.sqrt(a.n_obs)))
cnv.tl.leiden(a, resolution=1.5)
# score detected CNVs in every spot
cnv.tl.cnv_score(a)

In [None]:
# Persist the InferCNV outputs so the next cell can reload them without recomputing.
# save CNV to file
# (a.uns["cnv"] is read back below with allow_pickle and .item())
np.save("scRNA_out/infercnv/uns_cnv_VUMC.npy", a.uns["cnv"])
# write to csv for compilation
# (per-cell cluster label and score; the index is written as the first column)
a.obs[["cnv_leiden","cnv_score"]].to_csv("scRNA_out/infercnv/VUMC_cnv_leiden.csv")
# write CNV values to npz for compilation
# (passed positionally, so the array is stored under the default key "arr_0")
np.savez_compressed("scRNA_out/infercnv/VUMC_cnv.npz", a.obsm["X_cnv"])

In [None]:
# or, read in results from previous run
# NOTE: these files are loaded with allow_pickle (arbitrary Python objects); only load
# files written by the save cell above, never files from an untrusted source.
# read in CNV matrix and put in a.obsm slot
with np.load("scRNA_out/infercnv/VUMC_cnv.npz", allow_pickle=True) as cnv_npz:
    # the context manager closes the archive handle once the object is extracted
    a.obsm["X_cnv"] = cnv_npz["arr_0"].item()
# read in CNV genomic partitions
a.uns["cnv"] = np.load("scRNA_out/infercnv/uns_cnv_VUMC.npy", allow_pickle=True).item()
# read in cnv_score and cnv_leiden
cnv_obs = pd.read_csv("scRNA_out/infercnv/VUMC_cnv_leiden.csv", index_col=0)
# Drop any copies of these columns already present in a.obs (e.g. when the InferCNV cell
# ran in this session, or this cell is re-run); otherwise the merge would produce
# suffixed "_x"/"_y" duplicates and later `a.obs.cnv_leiden` lookups would fail.
# how="left" keeps every row of a.obs in its original order.
a.obs = a.obs.drop(columns=cnv_obs.columns, errors="ignore").merge(
    cnv_obs, left_index=True, right_index=True, how="left"
)

In [None]:
# plot heatmap with compartments
# (the message previously said "Cell_Type", but this cell groups by `Compartment`)
print("Plotting Compartment CNV heatmap")
cnv.pl.chromosome_heatmap(
    a,
    groupby="Compartment",
    save="_VUMC_compartment.png",
    dendrogram=True,
    figsize=(12,8),
)

In [None]:
# CNV heatmap with cells grouped by their `Cell_Type` annotation
print("Plotting Cell_Type CNV heatmap")
celltype_heatmap_opts = dict(
    groupby="Cell_Type",
    dendrogram=True,
    figsize=(12, 8),
    save="_VUMC_CellType.png",
)
cnv.pl.chromosome_heatmap(a, **celltype_heatmap_opts)

In [None]:
# CNV heatmap with cells grouped by their `Tumor_Type` annotation
print("Plotting CNV heatmap with tumor class")
cnv.pl.chromosome_heatmap(
    a, groupby="Tumor_Type", dendrogram=True, figsize=(12, 8), save="_VUMC_tumortype.png"
)

In [None]:
# CNV heatmap with cells grouped by `Patient` (taller figure than the other heatmaps)
print("Plotting CNV heatmap with patient")
patient_heatmap_opts = dict(
    groupby="Patient",
    dendrogram=True,
    figsize=(12, 12),
    save="_VUMC_patient.png",
)
cnv.pl.chromosome_heatmap(a, **patient_heatmap_opts)

In [None]:
# Fix one "tab20" hex color per CNV Leiden cluster so that later per-patient plots
# can reuse the same color for the same cluster.
a.obs["cnv_leiden"] = a.obs["cnv_leiden"].astype(str).astype("category")
leiden_categories = a.obs["cnv_leiden"].cat.categories
leiden_palette = sns.color_palette("tab20", len(leiden_categories)).as_hex()
cnv_leiden_cdict = {cluster: color for cluster, color in zip(leiden_categories, leiden_palette)}
# colors listed in category order, stored under the "<column>_colors" key in .uns
a.uns["cnv_leiden_colors"] = [cnv_leiden_cdict[cluster] for cluster in leiden_categories]

In [None]:
# CNV heatmap with cells grouped by their CNV Leiden cluster
print("Plotting CNV heatmap with Leiden clusters")
cnv.pl.chromosome_heatmap(
    a, groupby="cnv_leiden", dendrogram=True, figsize=(12, 8), save="_VUMC.png"
)

In [None]:
# plot heatmap with CNV Leiden clusters within each patient cohort
# The patient list is taken from the data itself: the `specs` dict this loop used to
# iterate is not defined anywhere in this notebook, so the cell raised NameError on a
# fresh kernel. Using the labels present in `a` also guarantees each subset is non-empty.
for pat in a.obs["Patient"].dropna().unique():
    # plot heatmap with patient
    print("Plotting CNV heatmap for {}".format(pat), end=" - ")
    a_pat = a[a.obs["Patient"]==pat,:].copy()
    # rebuild the categorical so only clusters present in this patient remain,
    # then look their colors up in the global dict to stay consistent across plots
    a_pat.obs.cnv_leiden = a_pat.obs.cnv_leiden.astype(str)
    a_pat.obs.cnv_leiden = a_pat.obs.cnv_leiden.astype("category")
    a_pat.uns["cnv_leiden_colors"] = [cnv_leiden_cdict[x] for x in a_pat.obs.cnv_leiden.cat.categories]
    # first tumor type listed for this patient (assumes one tumor type per patient -- TODO confirm)
    tt = a_pat.obs["Tumor_Type"].unique()[0]
    print(tt)
    cnv.pl.chromosome_heatmap(
        a_pat,
        groupby="cnv_leiden",
        # "/" in the tumor type would be read as a path separator in the file name
        save="_VUMC_scRNA_{}_{}.png".format(pat, tt.replace("/","-")),
        dendrogram=True,
        figsize=(12,8),
    )

In [None]:
# the score is referred to as "CNV Score" from here on
a.obs = a.obs.rename(columns={"cnv_score": "CNV Score"})

---
# Save CNV info to AnnData files

In [None]:
# load the dataset that receives CNV metadata below and is later written back to this same path
epi_nl = sc.read("../data/scRNA/VUMC_HTAN_DIS_EPI_V2.h5ad")

In [None]:
# load the dataset that receives CNV metadata below and is later written back to this same path
stroma = sc.read("../data/scRNA/VUMC_HTAN_VAL_DIS_NONEPI_V2.h5ad")

In [None]:
# load the dataset that receives CNV metadata below and is later written back to this same path
epi = sc.read("../data/scRNA/abnormal_epithelium.h5ad")

### Add CNV metadata to the AnnData objects that hold the UMAP coordinates

In [None]:
# Copy per-cell CNV metadata from `a` onto `epi`, matching cells by obs_names.
cnv_cols = ["Patient", "Tumor_Type", "cnv_leiden", "CNV Score"]
# reindex aligns to epi's own row order and leaves NaN for cells absent from `a`,
# which is the same fill the previous NaN-initialisation produced.
cnv_meta = a.obs[cnv_cols].reindex(epi.obs_names)
# Assign column by column so each column keeps its own dtype. The previous
# `.values` transfer pushed all four columns through a single object array,
# so the numeric "CNV Score" lost its float dtype and strings were written into
# float-NaN columns.
for col in cnv_cols:
    epi.obs[col] = cnv_meta[col]

In [None]:
# Copy per-cell CNV metadata from `a` onto `epi_nl`, matching cells by obs_names.
cnv_cols = ["Patient", "Tumor_Type", "cnv_leiden", "CNV Score"]
# reindex aligns to epi_nl's own row order and leaves NaN for cells absent from `a`,
# which is the same fill the previous NaN-initialisation produced.
cnv_meta = a.obs[cnv_cols].reindex(epi_nl.obs_names)
# Assign column by column so each column keeps its own dtype. The previous
# `.values` transfer pushed all four columns through a single object array,
# so the numeric "CNV Score" lost its float dtype and strings were written into
# float-NaN columns.
for col in cnv_cols:
    epi_nl.obs[col] = cnv_meta[col]

In [None]:
# Copy per-cell CNV metadata from `a` onto `stroma`, matching cells by obs_names.
cnv_cols = ["Patient", "Tumor_Type", "cnv_leiden", "CNV Score"]
# reindex aligns to stroma's own row order and leaves NaN for cells absent from `a`,
# which is the same fill the previous NaN-initialisation produced.
cnv_meta = a.obs[cnv_cols].reindex(stroma.obs_names)
# Assign column by column so each column keeps its own dtype. The previous
# `.values` transfer pushed all four columns through a single object array,
# so the numeric "CNV Score" lost its float dtype and strings were written into
# float-NaN columns.
for col in cnv_cols:
    stroma.obs[col] = cnv_meta[col]

In [None]:
# NOTE: overwrites the same file `epi_nl` was read from, now with the CNV metadata columns added
epi_nl.write("../data/scRNA/VUMC_HTAN_DIS_EPI_V2.h5ad", compression="gzip")

In [None]:
# NOTE: overwrites the same file `stroma` was read from, now with the CNV metadata columns added
stroma.write("../data/scRNA/VUMC_HTAN_VAL_DIS_NONEPI_V2.h5ad", compression="gzip")

In [None]:
# NOTE: overwrites the same file `epi` was read from, now with the CNV metadata columns added
epi.write("../data/scRNA/abnormal_epithelium.h5ad", compression="gzip")