In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import infercnvpy as cnv
import matplotlib.pyplot as plt
# make this notebook work better with Scanpy
import warnings; warnings.simplefilter(action='ignore', category=FutureWarning)
import seaborn as sns

In [None]:
# make output directories
import os

# makedirs creates the parent "scRNA_out" as needed, and exist_ok=True makes the
# call a no-op when the directory is already present, so the cell can be re-run
# without a separate (race-prone) existence check.
os.makedirs("scRNA_out/infercnv/", exist_ok=True)

In [None]:
# send every figure saved through scanpy to the infercnv output folder
sc.settings.figdir = "scRNA_out/infercnv/"
# transparent backgrounds, 400 dpi for saved figures
sc.set_figure_params(dpi_save=400, transparent=True)

In [None]:
from pyensembl import EnsemblRelease

# release 98 uses human reference genome GRCh38
# `data` is the annotation handle used further down to look up gene IDs by
# symbol and genomic loci by gene ID.
# NOTE(review): presumably the release-98 annotation must already be available
# locally (e.g. via `pyensembl install`) — confirm before running on a new machine.
data = EnsemblRelease(98)

---
# Read in data

In [None]:
# VUMC epithelial dataset; the bare name on the last line displays its summary
vumc_dis_epi = sc.read("../data/scRNA/VUMC_HTAN_DIS_EPI_V2.h5ad")
vumc_dis_epi

In [None]:
# Broad epithelial dataset; the bare name on the last line displays its summary
broad_epi = sc.read("../data/scRNA/Broad_Epi_CRC_NoNormal_ReFiltered_Counts.h5ad")
broad_epi

In [None]:
# load the abnormal-epithelium object and keep only cells labelled MSI or MSS
vumc_crc_epi = sc.read("../data/scRNA/abnormal_epithelium.h5ad")
is_msi_or_mss = vumc_crc_epi.obs["Tumor_Type"].isin(["MSI","MSS"])
vumc_crc_epi = vumc_crc_epi[is_msi_or_mss, :].copy()

# strip slots carried over from the saved object
for slot in ("uns", "var", "obsm"):
    delattr(vumc_crc_epi, slot)

# remove the CNV annotations stored with the object
vumc_crc_epi.obs.drop(columns=["cnv_leiden","CNV Score","CNV Clone"], inplace=True)

# every remaining cell is labelled CRC in both classification columns
for label_col in ("Polyp_Type", "Sample_Classification"):
    vumc_crc_epi.obs[label_col] = "CRC"
vumc_crc_epi

In [None]:
# combine AnnData objects into master 'a_comb'
# batch labels follow the order: calling object first, then the two arguments
a_comb = vumc_dis_epi.concatenate(
    vumc_crc_epi,
    broad_epi,
    fill_value=0,
    batch_categories=["VUMC-polyp","VUMC-CRC","BROAD-CRC"],
)

In [None]:
# gene annotation table, indexed by its first column
genes = pd.read_csv(
    "../resources/ST/master_visium_genes_list.csv",
    index_col=0,
)

In [None]:
# left-join the gene annotations onto the variable table by index,
# so genes without an annotation row are kept (with missing values)
a_comb.var = pd.merge(
    a_comb.var,
    genes,
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# inspect the variable table after merging in the gene annotations
a_comb.var

In [None]:
# number of cells per Cell_Type label
a_comb.obs["Cell_Type"].value_counts()

In [None]:
# number of cells per Polyp_Type label
a_comb.obs["Polyp_Type"].value_counts()

In [None]:
# number of cells per Sample_Classification label
a_comb.obs["Sample_Classification"].value_counts()

In [None]:
# display the summary of the combined AnnData object
a_comb

---
# Perform CNV inference globally

In [None]:
# use "" as the marker for genes that have no gene_id after the merge
a_comb.var["gene_id"] = a_comb.var["gene_id"].fillna("")

In [None]:
# Back-fill gene IDs that are still empty by looking each gene symbol up in the
# pyensembl annotation (`data`); the first returned ID is used.
err_counter = 0
# only visit the rows whose gene_id is the "" placeholder
missing_id_names = a_comb.var.index[a_comb.var["gene_id"] == ""]
for i in missing_id_names:
    try:
        a_comb.var.loc[i, "gene_id"] = data.gene_ids_of_gene_name(i)[0]
    except (ValueError, IndexError):
        # Lookup failed (pyensembl is expected to raise ValueError for an
        # unknown symbol — confirm against its docs; IndexError covers an empty
        # result). Leave gene_id empty and count the miss. Anything else,
        # including KeyboardInterrupt, now propagates instead of being hidden.
        err_counter += 1
print("{} errors in ENSEMBL mapping".format(err_counter))

In [None]:
# inspect the variable table after back-filling gene IDs
a_comb.var

In [None]:
# Use `pyensembl` to map ENSG IDs to genomic loci
# start every gene at a placeholder locus; genes that resolve are overwritten below
for locus_col, placeholder in (("chromosome", "0"), ("start", 0), ("end", 0)):
    a_comb.var[locus_col] = placeholder

err_counter = 0
for var_name in a_comb.var_names:
    try:
        gene = data.gene_by_id(a_comb.var.loc[var_name, "gene_id"])
        # prefix the contig name with "chr"
        a_comb.var.loc[var_name, "chromosome"] = "chr{}".format(gene.contig)
        a_comb.var.loc[var_name, "start"] = gene.start
        a_comb.var.loc[var_name, "end"] = gene.end
    except ValueError:
        # gene could not be resolved; it keeps the placeholder locus
        err_counter += 1
print("{} errors in ENSEMBL mapping".format(err_counter))

In [None]:
%%time
# run InferCNV
cnv.tl.infercnv(
    a_comb,
    reference_key="Sample_Classification",
    reference_cat=["NL"],
    window_size=200,
)

In [None]:
# determine clusters based on detected CNVs
# first step: infercnvpy's PCA on `a_comb`; %time reports how long the call takes
%time cnv.tl.pca(a_comb)

In [None]:
# neighbor graph with n_neighbors set to the truncated square root of the cell count
%time cnv.pp.neighbors(a_comb, n_neighbors=int(np.sqrt(a_comb.n_obs)))

In [None]:
# Leiden clustering at resolution 1.5
# (the resulting `cnv_leiden` obs column is written to CSV further down)
%time cnv.tl.leiden(a_comb, resolution=1.5)

In [None]:
# score detected CNVs in every cell
# (the resulting `cnv_score` obs column is written to CSV further down)
%time cnv.tl.cnv_score(a_comb)

In [None]:
# persist the three CNV outputs so a later session can skip the inference

# 1) the `cnv` entry of .uns
np.save(
    file="scRNA_out/infercnv/uns_cnv_broad_vumc_comb_epi.npy",
    arr=a_comb.uns["cnv"],
)
# 2) per-cell cluster label and score, as csv for compilation
a_comb.obs.loc[:, ["cnv_leiden","cnv_score"]].to_csv(
    path_or_buf="scRNA_out/infercnv/broad_vumc_comb_epi_cnv_leiden.csv",
)
# 3) the CNV matrix itself, compressed npz for compilation
np.savez_compressed(
    "scRNA_out/infercnv/broad_vumc_comb_epi_cnv.npz",
    a_comb.obsm["X_cnv"],
)

In [None]:
# or, read in results from previous run
# NOTE: allow_pickle=True unpickles arbitrary Python objects — only load files
# that were written by the save cell of this notebook.

# read in CNV matrix and put in a.obsm slot
# (context manager closes the .npz archive once the array has been extracted)
with np.load("scRNA_out/infercnv/broad_vumc_comb_epi_cnv.npz", allow_pickle=True) as tmp:
    a_comb.obsm["X_cnv"] = tmp["arr_0"].item()

# read in CNV genomic partitions
a_comb.uns["cnv"] = np.load("scRNA_out/infercnv/uns_cnv_broad_vumc_comb_epi.npy", allow_pickle=True).item()

# read in cnv_score and cnv_leiden
cnv_obs = pd.read_csv("scRNA_out/infercnv/broad_vumc_comb_epi_cnv_leiden.csv", index_col=0)
# Drop any columns of the same name already in .obs (present when the inference
# cells above ran in this session); otherwise merge() would emit `_x`/`_y`
# suffixed copies and the later rename of `cnv_score` would silently do nothing.
a_comb.obs = a_comb.obs.drop(columns=cnv_obs.columns, errors="ignore").merge(
    cnv_obs,
    left_index=True,
    right_index=True,
)

---
Look at `CNV Score` across all cells

In [None]:
# give the score column its display name
a_comb.obs = a_comb.obs.rename(columns={"cnv_score":"CNV Score"})

In [None]:
# smallest CNV Score across all cells
a_comb.obs.loc[:, "CNV Score"].min()

In [None]:
# largest CNV Score across all cells
a_comb.obs.loc[:, "CNV Score"].max()

In [None]:
# work on plain strings, then fold the "UNC" label into "NL"
a_comb.obs["Sample_Classification"] = (
    a_comb.obs["Sample_Classification"].astype(str).replace("UNC", "NL")
)

In [None]:
# work on plain strings, then fold the "UNC" label into "NL"
a_comb.obs["Polyp_Type"] = (
    a_comb.obs["Polyp_Type"].astype(str).replace("UNC", "NL")
)

In [None]:
# CNV heatmap grouped by cell type, with a dendrogram; saved via scanpy's figdir
print("Plotting Cell_Type CNV heatmap")
cnv.pl.chromosome_heatmap(
    a_comb,
    groupby="Cell_Type",
    dendrogram=True,
    figsize=(12, 8),
    save="_broad_vumc_comb_epi_CellType.png",
)

In [None]:
# CNV heatmap grouped by sample classification, with a dendrogram
print("Plotting CNV heatmap with tumor class")
cnv.pl.chromosome_heatmap(
    a_comb,
    groupby="Sample_Classification",
    dendrogram=True,
    figsize=(12, 8),
    save="_broad_vumc_comb_epi_sampleclass.png",
)

---
Stratify CRC samples by MMR status

In [None]:
# start from the sample classification (as strings) ...
a_comb.obs["Tumor_Type2"] = a_comb.obs.Sample_Classification.values
a_comb.obs["Tumor_Type2"] = a_comb.obs["Tumor_Type2"].astype(str)
# ... then overwrite with the MSS / MSI-H call wherever either status column
# provides one (Tumor_Type is applied first, MMRStatusTumor second)
for status_col in ("Tumor_Type", "MMRStatusTumor"):
    for raw_label, final_label in (("MSS", "MSS"), ("MSI", "MSI-H")):
        a_comb.obs.loc[a_comb.obs[status_col] == raw_label, "Tumor_Type2"] = final_label

In [None]:
# relabel the two polyp classes: SER -> SSL/HP, AD -> TA/TVA
a_comb.obs["Tumor_Type2"] = a_comb.obs["Tumor_Type2"].replace(
    {"SER":"SSL/HP","AD":"TA/TVA"}
)

In [None]:
# number of cells per stratified label
a_comb.obs["Tumor_Type2"].value_counts()

In [None]:
# swap the stratified column in under the original `Tumor_Type` name
a_comb.obs = (
    a_comb.obs
    .drop(columns="Tumor_Type")
    .rename(columns={"Tumor_Type2":"Tumor_Type"})
)

In [None]:
# categorical dtype; the category list is read later to build the color list
a_comb.obs["Tumor_Type"] = a_comb.obs["Tumor_Type"].astype("category")

In [None]:
# number of cells per final Tumor_Type label
a_comb.obs.Tumor_Type.value_counts()

---
Compare all tumor-derived cells to all adjacent normal cells

In [None]:
# copy Tumor_Type as strings, collapsing MSI-H and MSS into a single "CRC" label
tumor_type_str = a_comb.obs["Tumor_Type"].astype(str)
is_crc = a_comb.obs["Tumor_Type"].isin(["MSI-H","MSS"])
a_comb.obs["Tumor_vs_NL"] = tumor_type_str.mask(is_crc, "CRC").astype("category")

In [None]:
# number of cells per Tumor_vs_NL label
a_comb.obs.Tumor_vs_NL.value_counts()

In [None]:
# custom color dictionary for tumor types and normals
cmap_dict = {
    # Tumor Type
    'SSL/HP':"#c4a4e1",'MSI-H':"#7a4fa3",'MSS':"#ffc101",'TA/TVA':"#fee799",'NL':"#1f77b4",
    "MSI-H (CIN+)":"#7a4fa3", 'MSS (HM)':"#ffc101",
    # Tumor Location
    "Cecum":"#1f4e79","Ascending":"#2e74b7","Hepatic Flexure":"#bdd6ef","Transverse":"#ff717a","Descending":"#fe0001","Sigmoid":"#c00101",
    # this one's global
    "nan":"#ffffff",
    # These are black and white for T and F
    "T":"#000000","F":"#ffffff",
    # evolution
    "N":"tab:blue","B":"tab:green","L":"tab:orange",
    # CNV clone domain ("T" is shared with the T/F entry above: same black, so
    # it is listed only once instead of as a duplicate key)
    "S":"tab:pink","E":"tab:red",
    "CRC":"#ffffff",
}


def _reds_colordict(levels):
    """Map each entry of `levels`, in order, to a hex color from seaborn's "Reds" palette.

    The palette is requested with exactly ``len(levels)`` colors, so the list of
    levels is written once rather than repeated for the length.
    """
    return dict(zip(levels, sns.color_palette("Reds", len(levels)).as_hex()))


stage_colordict = _reds_colordict(["AD","I","II","III/IV"])
grade_colordict = _reds_colordict(["G1","G2","G3"])
CIN_colordict = _reds_colordict(["HM","CIN-","CIN+"])
# later dictionaries win on key clashes
cmap_dict = {**cmap_dict, **stage_colordict, **grade_colordict, **CIN_colordict}

In [None]:
# one color per Tumor_Type category, in category order
a_comb.uns["Tumor_Type_colors"] = list(
    map(cmap_dict.__getitem__, a_comb.obs["Tumor_Type"].cat.categories)
)

In [None]:
# one color per Tumor_vs_NL category, in category order
a_comb.uns["Tumor_vs_NL_colors"] = list(
    map(cmap_dict.__getitem__, a_comb.obs["Tumor_vs_NL"].cat.categories)
)

In [None]:
# CNV heatmap grouped by the stratified Tumor_Type, with a dendrogram
print("Plotting CNV heatmap with tumor class")
cnv.pl.chromosome_heatmap(
    a_comb,
    groupby="Tumor_Type",
    dendrogram=True,
    figsize=(12, 8),
    save="_broad_vumc_comb_epi_tumortype.png",
)

In [None]:
# CNV heatmap grouped by the Tumor_vs_NL labels, with a dendrogram
print("Plotting CNV heatmap with tumor class")
cnv.pl.chromosome_heatmap(
    a_comb,
    groupby="Tumor_vs_NL",
    dendrogram=True,
    figsize=(12, 8),
    save="_broad_vumc_comb_epi_tumorvsnl.png",
)

---
## Create boxplots of `CNV Score` by tissue type with B-H corrected t-tests for significance

In [None]:
# put ../resources/ on the module search path so `boxplot_utils` can be imported
import sys; sys.path.append("../resources/")
# NOTE(review): the star import hides which names come from boxplot_utils;
# `boxplots_group` is the only helper called in the cells visible here —
# consider importing it explicitly once the module's contents are confirmed.
from boxplot_utils import *

In [None]:
# CNV Score by Tumor_Type, with sig=True
boxplots_group(
    a_comb,
    obs=["Tumor_Type"],
    colors=["CNV Score"],
    titles=["scRNA-seq (all cells)"],
    sig=True,
    figsize=(4, 5),
    cmap_dict=cmap_dict,
    outdir="scRNA_out/infercnv/",
)

In [None]:
# CNV Score by Tumor_Type, with sig=False
boxplots_group(
    a_comb,
    obs=["Tumor_Type"],
    colors=["CNV Score"],
    titles=["scRNA-seq (all cells)"],
    sig=False,
    figsize=(4, 4),
    cmap_dict=cmap_dict,
    outdir="scRNA_out/infercnv/",
)

In [None]:
# CNV Score by Tumor_vs_NL, with sig=False
boxplots_group(
    a_comb,
    obs=["Tumor_vs_NL"],
    colors=["CNV Score"],
    titles=["scRNA-seq (all cells)"],
    sig=False,
    figsize=(4, 4),
    cmap_dict=cmap_dict,
    outdir="scRNA_out/infercnv/",
)

In [None]:
# CNV Score by Tumor_vs_NL, with sig=True
boxplots_group(
    a_comb,
    obs=["Tumor_vs_NL"],
    colors=["CNV Score"],
    titles=["scRNA-seq (all cells)"],
    sig=True,
    figsize=(4, 6),
    cmap_dict=cmap_dict,
    outdir="scRNA_out/infercnv/",
)

---
# Calculate _n_ values at the specimen level

In [None]:
# one identifier per cell: start from Patient, then take the HTAN Specimen ID
# where it is not "nan", then the PatientBarcode where it is not "nan"
# (the later column wins when both are present)
a_comb.obs["patient_combined"] = a_comb.obs.Patient.astype(str)
for id_col in ("HTAN Specimen ID", "PatientBarcode"):
    has_id = a_comb.obs[id_col].astype(str) != "nan"
    a_comb.obs.loc[has_id, "patient_combined"] = a_comb.obs.loc[has_id, id_col]

In [None]:
# Broad Institute specimens by tumor type
# (rows whose combined ID starts with "C"; one row per distinct
# Tumor_Type / ID pair before counting)
broad_specimens = a_comb.obs.loc[
    a_comb.obs["patient_combined"].str.startswith("C"),
    ["Tumor_Type","patient_combined"],
].drop_duplicates()
broad_specimens["Tumor_Type"].value_counts()

In [None]:
# VUMC specimens by tumor type
# (rows whose combined ID does not start with "C"; one row per distinct
# Tumor_Type / ID pair before counting)
vumc_specimens = a_comb.obs.loc[
    ~a_comb.obs["patient_combined"].str.startswith("C"),
    ["Tumor_Type","patient_combined"],
].drop_duplicates()
vumc_specimens["Tumor_Type"].value_counts()

In [None]:
# total of the hand-entered counts
# NOTE(review): presumably copied from the two specimen tables above — confirm
sum([37, 14, 11, 4, 1, 32, 28])