## Combined Preprocessing + Integration

In [None]:
# load packages
import sys
import scanpy as sc
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scvi
import seaborn as sns
import harmonypy
#import scgen

In [None]:
sc.__version__

In [None]:
# load R interface
import anndata2ri
import logging

import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

rcb.logger.setLevel(logging.ERROR)
ro.pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

In [None]:
# import helper functions
import helper_functions

In [None]:
# set up figure parameters
plt.rcParams['figure.figsize'] = (6, 4)
sc.settings.verbosity = 0
sc.settings.set_figure_params(
    dpi=300,
    facecolor="white",
    frameon=False,
)

In [None]:
# Set the default figure size
%config InlineBackend.figure_format = 'retina'  # Use 'retina' for high-resolution figures
%config InlineBackend.rc = {'figure.figsize': (6.0, 4.0)}  # Set the size of figures (width, height)

In [None]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# project root on the cluster; every input/output path below is built from it
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"

# figure folder of the combined analysis; scanpy saves its figures there as well
fig_dir = os.path.join(work_dir, *("figures", "combined", "main_analysis", "final_figures/"))
sc.settings.figdir = fig_dir

# 120 dpi on screen, 300 dpi PNG when saving
sc.set_figure_params(format='png', dpi_save=300, dpi=120)

In [None]:
# import data
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "Combined_SCR_CO2_annotated_2.0_TCR_14-02-24.h5ad"))

### Standardize Metadata

Here I standardize the annotations that were performed in a patient-specific manner. I also harmonize metadata values that are encoded differently across patients, such as timepoint and patient, and create a subproject variable: Project + Patient + Timepoint.

In [None]:
# standardize annotations
adata.obs["Annotation_2.0"].unique()

In [None]:
# harmonise the spelling of the level-2 labels, one relabelling at a time
for old_label, new_label in (
    ("B cells", "B Cells"),
    ("Plasma cells", "Plasma Cells"),
    ("MAST", "Mast"),
):
    adata.obs["Annotation_2.0"] = adata.obs["Annotation_2.0"].replace({old_label: new_label})

In [None]:
adata.obs["Annotation_1.0"].unique()

In [None]:
# harmonise the spelling of the level-1 labels, one relabelling at a time
for old_label, new_label in (
    ("B cells", "B Cells"),
    ("T cells", "T Cells"),
    ("Plasma cells", "Plasma Cells"),
):
    adata.obs["Annotation_1.0"] = adata.obs["Annotation_1.0"].replace({old_label: new_label})

In [None]:
# adapt format: store the annotation and sample columns as categoricals
for column in ("Annotation_2.0", "Annotation_1.0", "sample"):
    adata.obs[column] = adata.obs[column].astype("category")

In [None]:
adata.__dict__["_raw"].__dict__["_var"] = adata.__dict__["_raw"].__dict__["_var"].rename(columns={"_index": "features"})

In [None]:
# standardize some metadata columns: recode the values that were encoded differently
for column, recode in (
    ("patient", {"P08": "08"}),
    ("timepoint", {"C2": "C02"}),
):
    adata.obs[column] = adata.obs[column].replace(recode)

In [None]:
# create subproject column: "<project>_P<patient>_<timepoint>", stored as a categorical
obs_df = adata.obs
subproject_labels = (
    obs_df["project"].astype(str)
    + "_P"
    + obs_df["patient"].astype(str)
    + "_"
    + obs_df["timepoint"].astype(str)
)
adata.obs["subproject"] = subproject_labels.astype("category")

In [None]:
# clean object (remove uninformative metadata columns)
print(adata.obs.columns)

# resolutions of the clustering sweep, stored as RNA_snn_res.<r> / cluster_res<r> column pairs
sweep_resolutions = ["0.1", "0.25", "0.5", "0.75", "1", "1.2", "1.4", "1.5", "1.6",
                     "1.8", "2", "2.2", "2.4", "2.6", "2.7", "2.8", "3"]

uninformative_cols = ["RNA_snn_res.0.8", "seurat_clusters"]
for res in sweep_resolutions:
    uninformative_cols += [f"RNA_snn_res.{res}", f"cluster_res{res}"]
# per-cell QC columns under their Seurat-style names
uninformative_cols += ["nCount_RNA", "nFeature_RNA", "percent.mt"]

adata.obs.drop(columns=uninformative_cols, inplace=True)

In [None]:
print(adata.obs.columns)

In [None]:
adata.shape

In [None]:
adata.X = adata.raw.X

In [None]:
# save adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_raw_12-03-24.h5ad"))

## Preprocessing

In [None]:
# read anndata object
adata_pp = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_raw_12-03-24.h5ad"))

In [None]:
adata_pp.shape

In [None]:
# number of cells per sample after QC filtering
adata_pp.obs.groupby('sample').size()

In [None]:
adata_pp.obs

### 01. Refiltering - Quality Control

Remove cells classified as noise during the patient-specific individual preprocessing and annotation. 

In [None]:
# remove NOISE clusters from previous patient-specific cell type annotation
# .copy() turns the boolean-indexed subset into an independent AnnData; without it
# adata_pp stays a view, and the cells below write layers/var columns into it
adata_pp = adata_pp[adata_pp.obs["Annotation_2.0"] != "NOISE"].copy()

In [None]:
adata_pp.shape

In [None]:
# number of cells per sample after remove noise clusters
adata_pp.obs.groupby('sample').size()

In [None]:
# store raw counts in its layer (scVI requirement)
adata_pp.layers["rawcounts"] = adata_pp.raw.X

In [None]:
# flag gene families by symbol prefix so they can be used as QC variables
gene_symbols = adata_pp.var_names
adata_pp.var["mt"] = gene_symbols.str.startswith("MT-")              # mitochondrial
adata_pp.var["ribo"] = gene_symbols.str.startswith(("RPS", "RPL"))   # ribosomal

# per-cell / per-gene QC metrics, including the percentage of counts in each flagged family
sc.pp.calculate_qc_metrics(
    adata_pp,
    qc_vars=["mt", "ribo"],
    percent_top=[20],
    log1p=True,
    inplace=True,
)

In [None]:
adata_pp

In [None]:
# report the observed extremes of each QC metric after the refiltering
cell_qc = adata_pp.obs
for label, extreme, metric in (
    ("%MT Threshold: ", max, "pct_counts_mt"),
    ("Max UMI counts Threshold: ", max, "total_counts"),
    ("Min UMI counts Threshold: ", min, "total_counts"),
    ("Max num. genes Threshold: ", max, "n_genes_by_counts"),
    ("Min num. genes Threshold: ", min, "n_genes_by_counts"),
):
    print(label, str(extreme(cell_qc[metric])))

In [None]:
sc.pl.violin(
    adata_pp,
    groupby="patient",
    keys=["pct_counts_mt", "pct_counts_ribo"],
    jitter=0.4,
    multi_panel=True,
    rotation=True
)

### 02. Normalization

In [None]:
# observe counts distribution
plt.figure(figsize=(6.0, 4.0))
p1 = sns.histplot(adata_pp.obs["total_counts"], bins=100, kde=False)

In [None]:
# normalization
sc.pp.normalize_total(adata_pp, target_sum=1e4)

In [None]:
# log-transform the data
sc.pp.log1p(adata_pp)

In [None]:
# store log counts
# keep an independent copy: assigning adata_pp.X directly would make the layer share
# memory with X, so any later in-place change of X would silently alter "logcounts"
adata_pp.layers["logcounts"] = adata_pp.X.copy()

In [None]:
# visualize shifted logarithm distribution
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# left panel: total counts per cell
p1 = sns.histplot(adata_pp.obs["total_counts"], bins=100, kde=False, ax=axes[0])
axes[0].set_title("Total counts")

# right panel: per-cell sum of the log-normalized values; the row sums are flattened
# to a 1-D array so the histogram gets one value per cell
log_totals = np.asarray(adata_pp.layers["logcounts"].sum(1)).ravel()
p2 = sns.histplot(log_totals, bins=100, kde=False, ax=axes[1])
axes[1].set_title("lognormalized")
plt.show()

In [None]:
# function to calculate variances on *sparse* matrix
def vars(a, axis=None):
    """Return the variance of sparse matrix ``a`` along ``axis``.

    Uses the identity var = mean(a**2) - mean(a)**2, so the matrix is never
    densified and ``a`` itself is left unmodified.

    Parameters
    ----------
    a : sparse matrix
    axis : int or None
        Axis to reduce over; ``None`` reduces over all entries.
    """
    mean_of_squares = a.multiply(a).mean(axis)
    squared_mean = np.square(a.mean(axis))
    return mean_of_squares - squared_mean

# per-gene statistics computed on the matrix stored in .raw
raw_matrix = adata_pp.raw.X
means = np.mean(raw_matrix, axis=0)
variances = vars(raw_matrix, axis=0)
# dispersion = variance-to-mean ratio of each gene
dispersions = variances / means

In [None]:
# write the per-gene statistics into adata_pp.var: first the plain values, then log1p of each
gene_stats = (("means", means), ("variances", variances), ("dispersions", dispersions))

for stat_name, stat_values in gene_stats:
    adata_pp.var[stat_name] = stat_values.tolist()[0]

for stat_name, stat_values in gene_stats:
    adata_pp.var["log_" + stat_name] = np.log1p(stat_values.tolist())[0]

In [None]:
min_mean = 0.0125 #default

In [None]:
# mean-dispersion distribution
ax = sns.scatterplot(
    data=adata_pp.var, x="log_means", y="log_dispersions", s=5
)
plt.vlines(x=np.log1p(min_mean),ymin=0,ymax=10,color='red')
plt.show()

### 03. Compute Highly Variable Genes

I computed the highly variable genes (HVGs) with several approaches to see which one works best. I used the three scanpy flavors (Cell Ranger, Seurat and Seurat v3) with their default parameters, except for the minimum mean parameter (`min_mean`), which I customized for Seurat and Cell Ranger based on the dispersion vs. mean distribution. I then fitted a model of each gene's mean-variance relationship to determine the HVGs; this is an FDR-based approach. Additionally, I discarded genes that I do not want to influence the PCA, such as TCR, BCR and MT genes.

In [None]:
# identify highly variable genes (per sample strategy)
# compute HVG for the three possible flavors
# "cell_ranger" and "seurat" work on the log-normalized values held in .X at this point
hvg_cell_ranger = sc.pp.highly_variable_genes(adata_pp, batch_key="sample", flavor="cell_ranger", n_top_genes=2000, min_mean=min_mean, subset=False, inplace=False)
hvg_seurat = sc.pp.highly_variable_genes(adata_pp, batch_key="sample", flavor="seurat", n_top_genes=2000, min_mean=min_mean, subset=False, inplace=False)
# "seurat_v3" expects raw counts, so it is pointed at the "rawcounts" layer instead of
# the log-normalized .X
hvg_seurat_v3 = sc.pp.highly_variable_genes(adata_pp, batch_key="sample", flavor="seurat_v3", n_top_genes=2000, layer="rawcounts", subset=False, inplace=False)

In [None]:
# store them in adata.var
adata_pp.var["HVG_cell_ranger"] = hvg_cell_ranger.highly_variable
adata_pp.var["HVG_seurat"] = hvg_seurat.highly_variable
adata_pp.var["HVG_seurat_v3"] = hvg_seurat_v3.highly_variable

In [None]:
# number of genes flagged as highly variable by each scanpy flavor
for flavor_label, flag_column in (
    ("Cell Ranger", "HVG_cell_ranger"),
    ("Seurat", "HVG_seurat"),
    ("Seurat v3", "HVG_seurat_v3"),
):
    n_flagged = (adata_pp.var[flag_column] == True).sum()
    print(f"Number of HVGs - {flavor_label}: {n_flagged}")

In [None]:
# check overlap 
HVG1 = adata_pp.var.loc[adata_pp.var["HVG_cell_ranger"] == True].index
HVG2 = adata_pp.var.loc[adata_pp.var["HVG_seurat"] == True].index
HVG3 = adata_pp.var.loc[adata_pp.var["HVG_seurat_v3"] == True].index

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(10, 5))
p1 = sns.scatterplot(
    data=adata_pp.var, x="log_means", y="log_dispersions", hue="HVG_cell_ranger", s=5, ax=axes[0])
p2 = sns.scatterplot(
    data=adata_pp.var, x="log_means", y="log_dispersions", hue="HVG_seurat", s=5, ax=axes[1])
p3 = sns.scatterplot(
    data=adata_pp.var, x="log_means", y="log_dispersions", hue="HVG_seurat_v3", s=5, ax=axes[2])
plt.show()

In [None]:
# move from python to R
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
%%R -i adata_pp -o dec_adata_pp

# load Rlibraries
library(scran)
#library(seurat)

# model gene variation
dec.adata_pp <- modelGeneVar(adata_pp)

# visualizing the fit:
fit.adata_pp <- metadata(dec.adata_pp)
plot(fit.adata_pp$mean, fit.adata_pp$var, xlab="Mean of log-expression",
    ylab="Variance of log-expression")
curve(fit.adata_pp$trend(x), col="dodgerblue", add=TRUE, lwd=2)

# ordering by most interesting genes for inspection.
dec_adata_pp <- dec.adata_pp[order(dec.adata_pp$bio, decreasing=TRUE),]

# select top 2000 genes
HVG4 <- getTopHVGs(dec_adata_pp, n=2000)
str(HVG4)

## identify the 10 most highly variable genes
#top10 <- head(hvg.adata_pp.var, 10)

## plot variable features with and without labels
#plot1 <- VariableFeaturePlot(as.Seurat(adata_pp))
#plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
#plot1 + plot2

In [None]:
dec_adata_pp

In [None]:
HVG4 = dec_adata_pp.index[:2000]

In [None]:
len( set(HVG1) & set(HVG2) & set(HVG3) & set(HVG4) ) # intersection

In [None]:
'''
# visualize overlap
from matplotlib_venn import venn3, venn3_circles 
from matplotlib import pyplot as plt 
  
# depict venn diagram 
venn3(subsets=(len( set(HVG1) - (set(HVG2) | set(HVG3)) ), 
               len( set(HVG2) - (set(HVG1) | set(HVG3)) ), 
               len( set(HVG1) & set(HVG2) ), 
               len( set(HVG3) - (set(HVG2) | set(HVG1)) ), 
               len( set(HVG1) & set(HVG3) ), 
               len( set(HVG2) & set(HVG3) ), 
               len( set(HVG1) & set(HVG2) & set(HVG3) )
              ),  
      set_labels=("Cell Ranger", "Seurat", "Seurat v3", "Model"),  
      set_colors=("orange", "blue", "red", "green"), alpha=0.7) 
  
# outline of circle line style and width 
venn3_circles(subsets=(
               len( set(HVG1) - (set(HVG2) | set(HVG3)) ), 
               len( set(HVG2) - (set(HVG1) | set(HVG3)) ), 
               len( set(HVG1) & set(HVG2) ), 
               len( set(HVG3) - (set(HVG2) | set(HVG1)) ), 
               len( set(HVG1) & set(HVG3) ), 
               len( set(HVG2) & set(HVG3) ), 
               len( set(HVG1) & set(HVG2) & set(HVG3) )
                ), 
              linestyle="dashed", linewidth=2) 
  
# title of the venn diagram 
plt.title("HVG Flavor Overlap") 
plt.show()
'''

In [None]:
# visualize overlap
from venny4py.venny4py import *

#dict of sets
sets = {
    'Cell Ranger': set(HVG1),
    'Seurat': set(HVG2),
    'Seurat v3': set(HVG3),
    'Model': set(HVG4)}
    
venny4py(sets=sets)

In [None]:
# NOTE(review): HVG1 holds the Cell Ranger HVGs (HVG2 is the Seurat set); confirm which pair was meant
len( set(HVG1) & set(HVG4) ) # intersection between Cell Ranger and Model

In [None]:
# put in var the model HVG
# HVG4 holds the top genes of the variance model; the flag previously tested membership
# in HVG2 (the Seurat-flavor genes), so "HVG_model" did not contain the model genes
adata_pp.var["HVG_model"] = adata_pp.var_names.isin(HVG4)

print(adata_pp.var.HVG_model.value_counts())

In [None]:
# choose HVGs obtained from modelling the variance of each gene as it is FDR based
adata_pp.var["highly_variable"] = adata_pp.var["HVG_model"]
print(adata_pp.var.highly_variable.value_counts())

In [None]:
# select TCR genes
# one prefix per locus segment family; the gamma constant entry was a second "TRBC"
# by mistake ("TRG" still matches every TRG* gene, so the selected set is unchanged)
TCR_prefixes = ["TRAC", "TRAJ", "TRAV", "TRBC", "TRBD", "TRBJ", "TRBV", "TRDC", "TRDD", "TRDJ", "TRDV", "TRGC", "TRG", "TRGJ", "TRGV"]
# str.startswith accepts a tuple of prefixes, so a single call tests all of them
TCR_genes = [gene_name for gene_name in adata_pp.var_names if gene_name.startswith(tuple(TCR_prefixes))]
print(TCR_genes[:10])

In [None]:
# select BCR genes: every gene whose symbol starts with one of the prefixes below
# NOTE(review): the "IGIC"/"IGIJ"/"IGIV" prefixes look like a typo (presumably a mistyped
# "IGL*" group, which is already listed) — confirm whether any gene in var_names matches them
BCR_prefixes = ["IGHC", "IGHD", "IGHJ", "IGHV",  "IGIC", "IGIJ", "IGIV",  "IGKC", "IGKJ", "IGKV", "IGLC", "IGLJ", "IGLV"]
BCR_genes = [gene_name for gene_name in adata_pp.var_names if any(gene_name.startswith(prefix) for prefix in BCR_prefixes)]
print(BCR_genes[:10])

In [None]:
# select mitochondrial genes
MT_genes = [gene_name for gene_name in adata_pp.var_names if gene_name.startswith("MT-")]
print(MT_genes[:10])

In [None]:
# select ribosomal genes
RP_genes = [gene_name for gene_name in adata_pp.var_names if any(gene_name.startswith(prefix) for prefix in ["RPS", "RPL"])]
print(RP_genes[:10])

In [None]:
# merge unwanted genes
out_genes = TCR_genes + BCR_genes + MT_genes #+ RP_genes #(include RP genes as there is a non-random fashion across cells) #NO MT or RP genes found in HVG

In [None]:
# check there are matches
len(set(adata_pp.var_names) & set(out_genes)) > 0

In [None]:
# create boolean list indicating whether the genes are excluded as HVG
# vectorised membership test over every gene; the previous index loop stopped at
# len(var_names) - 1 and therefore never evaluated the last gene
in_out_genes = adata_pp.var_names.isin(out_genes).tolist()

In [None]:
adata_pp.var["excl_hv"] = in_out_genes

In [None]:
len(out_genes)

In [None]:
adata_pp.var.excl_hv.value_counts()

In [None]:
adata_pp.shape

In [None]:
adata_pp.var["orig_highly_variable"] = adata_pp.var["highly_variable"]

In [None]:
# remove undesired genes as highly variable
# genes that were selected as HVG but belong to the exclusion list; built over all genes
# (the previous index loop stopped at len(var_names) - 1 and skipped the last gene)
hvg_to_remove = adata_pp.var["orig_highly_variable"].eq(True) & adata_pp.var["excl_hv"].eq(True)

for gene in adata_pp.var_names[hvg_to_remove.to_numpy()]:
    print(gene + " found in HVG -- removed!")

adata_pp.var.loc[hvg_to_remove, "highly_variable"] = False

In [None]:
print(adata_pp.var.orig_highly_variable.value_counts())
print(adata_pp.var.highly_variable.value_counts())

### 04. Run PCA

Run Principal Component Analysis (PCA) on the highly variable genes obtained in the previous step. We run it on this subset because these genes are the ones explaining most of the variance in the data.

In [None]:
sc.tl.pca(adata_pp, use_highly_variable = True)

In [None]:
# Get loadings for each gene for each PC
df_loadings = pd.DataFrame(adata_pp.varm['PCs'], index=adata_pp.var_names)
# get rank of each loading for each PC
df_rankings = pd.DataFrame((-1 * df_loadings.values).argsort(0).argsort(0), index=df_loadings.index, columns=df_loadings.columns)
# c.f. with df_loadings.apply(scipy.stats.rankdata, axis=0)
# evaluate 
print("Top loadings for PC1...")
print(df_loadings[0].sort_values().tail())

In [None]:
# visualize loadings
sc.pl.pca_loadings(adata_pp, include_lowest=True, components=[1, 2, 3, 4, 5])

In [None]:
# select optimal number of PCs
sc.pl.pca_variance_ratio(adata_pp, log=True, n_pcs=50)

In [None]:
# Decide optimal number of PCs that explain most of the variation in the data
## The elbow is estimated from two metrics:
## 1. the first PC that contributes less than 5% of the standard deviation while the PCs
##    up to it cumulatively contribute more than 90% of the standard deviation;
## 2. the last point where the change in percent variation between two consecutive PCs
##    is still larger than 0.05%.
## NOTE: np.where returns 0-based positions, so the values printed below are 0-based;
## the corresponding number of PCs is the printed value + 1.

# calculate the percent of variation associated with each PC
pc_std = adata_pp.obsm['X_pca'].std(axis=0)
pct = pc_std / np.sum(pc_std) * 100

# calculate cumulative percents for each PC
cumu = np.cumsum(pct)

# determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC is less than 5
co1 = np.where((cumu > 90) & (pct < 5))[0]
co1_index = co1[0] if len(co1) > 0 else None
print(co1_index)

# determine the difference between the variation of PC and subsequent PC
co2 = np.sort(np.where((pct[:-1] - pct[1:]) > 0.05)[0])[::-1] # before: 0.1
co2_index = co2[0] + 1 if len(co2) > 0 else None
print(co2_index)

# usually, we would choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
pcs = min(co1_index, co2_index) if co1_index is not None and co2_index is not None else None

print("PCs covering the majority of the variation:", pcs)

In [None]:
# Kaiser rule --> keep the PCs whose eigenvalue is >= 1

# per-PC variance stored by the PCA step
eigenvalues = adata_pp.uns['pca']['variance']

# eigenvalues passing the cutoff
eigenvalues_gt_1 = list(filter(lambda val: val >= 1, eigenvalues))

# 1-based numbers of the PCs passing the cutoff
pcs_gt_1 = [pc_number for pc_number, val in enumerate(eigenvalues, start=1) if val >= 1]

print(eigenvalues)
print(max(pcs_gt_1))

In [None]:
# 2/3 Variance Explanation
print(cumu)
np.where((cumu > 100*2/3))[0][0]

In [None]:
# define number of PCs
n_pcs=35

### 05. Compute Neighbors & Non Linear Dim. Reduction

In [None]:
sc.pp.neighbors(adata_pp, n_pcs=n_pcs)

In [None]:
sc.tl.umap(adata_pp)

In [None]:
sc.pl.umap(adata_pp, color=["sample", "Annotation_1.0"], wspace=1.5)

### Check Technical Sources of Variation

In [None]:
adata_pp

In [None]:
sc.pl.umap(
    adata_pp,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2
)

In [None]:
# UMAP colored by patient and timepoint, written to scanpy's figure directory.
# The `save` argument had been left without a value (a syntax error); scanpy appends
# the given string to its default "umap" file name.
sc.pl.umap(
    adata_pp,
    color=["patient", "timepoint"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    use_raw=False,
    ncols=2,
    save="_patient_timepoint"
)

In [None]:
sc.settings.figdir = os.path.join(work_dir, "figures", "TFM", "FigS1/")
sc.set_figure_params(dpi=120, dpi_save=300, format='pdf')

In [None]:
# UMAP colored by timepoint with a fixed two-color palette, saved with the "Uni" suffix
# (the comma after `palette` was missing, which made the call a syntax error)
sc.pl.umap(
    adata_pp,
    color=["timepoint"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    use_raw=False,
    palette=["darkviolet", "coral"],
    save="Uni"
)

In [None]:
import matplotlib.cm as cm
# matplotlib.cm.get_cmap is deprecated and absent from recent Matplotlib releases;
# pyplot.get_cmap returns the same registered colormap
color_map = plt.get_cmap('Spectral')
# five evenly spaced colors, sampled from the end of the colormap (1.0) down to its start (0.0)
color_palette = [color_map(i/4) for i in range(4, -1, -1)]
print(len(color_palette))

In [None]:
color_palette = sns.color_palette("Spectral", 5)
print(color_palette)
color_palette.pop(3)
sns.color_palette("Spectral", 5)

In [None]:
sc.settings.figdir = os.path.join(work_dir, "figures", "combined", "main_analysis", "final_figures/")
sc.set_figure_params(dpi=120, dpi_save=300, format='png')

In [None]:
# save adata object
adata_pp.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_12-03-24.h5ad"))

## 06. Integration

### 6.1 scVI

In [None]:
adata_pp = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_12-03-24.h5ad"))

In [None]:
# define integration vars
batch_key = "subproject"

In [None]:
# create object specific to scVI with just HVG
adata_scvi = adata_pp[:, adata_pp.var["highly_variable"]].copy()

In [None]:
print(adata_pp.shape)
print(adata_scvi.shape)

In [None]:
# prepare object
scvi.model.SCVI.setup_anndata(adata_scvi, 
                              layer="rawcounts", 
                              batch_key=batch_key,
                              #continuous_covariate_keys=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts"],
                              #categorical_covariate_keys=["patient", "timepoint"]
                             )
adata_scvi

In [None]:
# create the model
model_scvi = scvi.model.SCVI(adata_scvi)
model_scvi

In [None]:
# visualize model
model_scvi.view_anndata_setup()

In [None]:
# train the model
max_epochs_scvi = np.min([round((20000 / adata_scvi.n_obs) * 400), 400])
max_epochs_scvi

In [None]:
# train with the epoch budget computed in the previous cell, so the value displayed
# there is the one actually used (cast to a plain int, since np.min returns a NumPy scalar)
model_scvi.train(max_epochs=int(max_epochs_scvi))

In [None]:
# extract the outputs of the trained scVI model
# latent representation: the only output used in the following steps
adata_scvi.obsm["X_scVI"] = model_scvi.get_latent_representation() #just embedding used in further steps
# model-normalized expression, kept as a layer
adata_scvi.layers["scvi_normalized"] = model_scvi.get_normalized_expression(library_size=10e4) # would allow us to perform DE

In [None]:
# transfer scVI latent space to the full anndata object
# reuse the embedding already stored on the HVG object instead of running a second
# inference pass over every cell; adata_scvi is a column subset of adata_pp, so the
# cells are in the same order
adata_pp.obsm["X_scVI"] = adata_scvi.obsm["X_scVI"].copy()

In [None]:
# save the model
model_scvi.save(os.path.join(work_dir, "data", "models", "Combined_SCR_C02_scVI_integration_model_15-03-24"), overwrite=True)

In [None]:
## batch-corrected visualization (HVG)
#sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
#sc.tl.umap(adata_scvi)
#adata_scvi

In [None]:
# batch-corrected visualization (full)
sc.pp.neighbors(adata_pp, use_rep="X_scVI")
sc.tl.umap(adata_pp)
adata_pp

In [None]:
#sc.pl.umap(adata_scvi, color=["Annotation_1.0", "sample", "Annotation_2.0", "subproject", "project", "patient", "timepoint"], wspace=1, ncols=2) #HVG

In [None]:
sc.pl.umap(adata_pp, color=["Annotation_1.0", "sample", "Annotation_2.0", "subproject", "project", "patient", "timepoint"], wspace=1, ncols=2) #FULL - check it is the same as using just the HVG

In [None]:
sc.pl.umap(adata_pp, color=["Annotation_1.0"], wspace=1, ncols=2) #HVG

In [None]:
## save HVG adata object
#adata_scvi.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_HVG_scVI-integrated_12-03-24.h5ad"))

In [None]:
# save full adata object
adata_pp.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scVI-integrated_12-03-24.h5ad"))

### 6.2 Harmony

In [None]:
adata_pp = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_12-03-24.h5ad"))

In [None]:
adata_pp

In [None]:
# define integration vars
batch_key = "subproject"

In [None]:
# create harmony specific object
# NOTE: this binds a second name to the same AnnData (no copy is made), so everything
# done to adata_harmony below is also done to adata_pp
adata_harmony = adata_pp

In [None]:
# run pca again
sc.tl.pca(adata_harmony, use_highly_variable=True)

In [None]:
import math #set parameters to avoid convergence on an early iteration
sc.external.pp.harmony_integrate(adata_harmony, key=batch_key, plot_convergence=True, epsilon_cluster = -math.inf, epsilon_harmony = -math.inf, max_iter_harmony=50)

In [None]:
adata_harmony

In [None]:
# transfer harmony embedding to the full object
adata_pp.obsm["X_pca_harmony"] = adata_harmony.obsm["X_pca_harmony"]

In [None]:
# batch-corrected visualization (full)
sc.pp.neighbors(adata_pp, use_rep="X_pca_harmony")
sc.tl.umap(adata_pp)
adata_pp

In [None]:
sc.pl.umap(adata_pp, color=["Annotation_1.0", "sample", "Annotation_2.0", "subproject", "project", "patient", "timepoint"], wspace=1, ncols=2)

In [None]:
sc.pl.umap(adata_pp, color=["Annotation_1.0"], wspace=1, ncols=2) 

In [None]:
# save full adata object
adata_pp.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_harmony-integrated_12-03-24.h5ad"))

### 6.3 scGen

In [None]:
adata_pp = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_12-03-24.h5ad"))

In [None]:
# define integration vars
label_key = "Annotation_1.0"
batch_key = "subproject"

In [None]:
# create object specific to scGen with just HVG (independent copy of the HVG columns)
adata_scgen = adata_pp[:, adata_pp.var["highly_variable"]].copy()

In [None]:
print(adata_pp.shape)
print(adata_scgen.shape)

In [None]:
adata_scgen.obs["Annotation_1.0"] = adata_scgen.obs["Annotation_1.0"].tolist()
sc.pp.neighbors(adata_scgen)
sc.tl.umap(adata_scgen)

In [None]:
# prepare object: register the batch and cell-type label columns with scGen
# NOTE(review): `import scgen` is commented out in the package-loading cell at the top of
# the notebook, so `scgen` is undefined here unless it is imported first
scgen.SCGEN.setup_anndata(adata_scgen, batch_key=batch_key, labels_key=label_key)


In [None]:
# create the model
model_scgen = scgen.SCGEN(adata_scgen)

In [None]:
# train the model
model_scgen.train()

In [None]:
# batch removal
adata_scgen = model_scgen.batch_removal()
adata_scgen

In [None]:
# transfer corrected embedding to the full object
# NOTE(review): adata_scgen is now the object returned by batch_removal(), which is not
# guaranteed to keep the cells in the order of adata_pp — confirm. The rows are therefore
# matched by cell barcode rather than copied positionally.
corrected_latent = pd.DataFrame(
    adata_scgen.obsm["corrected_latent"], index=adata_scgen.obs_names
)
cells_without_embedding = adata_pp.obs_names.difference(corrected_latent.index)
if len(cells_without_embedding) > 0:
    raise ValueError(
        f"{len(cells_without_embedding)} cells of adata_pp have no scGen-corrected embedding"
    )
adata_pp.obsm["scGen_corrected_latent"] = corrected_latent.loc[adata_pp.obs_names].to_numpy()

In [None]:
# neighbors + UMAP on the batch-corrected scGen latent space; the key must match the
# obsm entry read from adata_scgen in the previous cell ("corrected_latent")
sc.pp.neighbors(adata_scgen, use_rep="corrected_latent")
sc.tl.umap(adata_scgen)

In [None]:
sc.pl.umap(adata_scgen, color=["Annotation_1.0", "sample", "Annotation_2.0", "subproject", "project", "patient", "timepoint"], wspace=1, ncols=2)

In [None]:
sc.pl.umap(adata_scgen, color=["Annotation_1.0"], wspace=1, ncols=2) 

In [None]:
# save full adata object
adata_pp.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scGen-integrated_12-03-24.h5ad"))

## 07. Level 1 Annotation

### 7.1 scVI Integrated Object

In [None]:
# read adata object
adata_scVI = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scVI-integrated_12-03-24.h5ad"))
# every following cell of this section works on `adata_scvi`; bind that name to the
# object just loaded so the section does not act on a leftover (or undefined) variable
adata_scvi = adata_scVI

In [None]:
adata_scVI

In [None]:
# identify technical sources of variation: color the UMAP by the QC metrics,
# the doublet score and the sample of origin
sc.pl.umap(
    adata_scvi,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2
)

In [None]:
sc.pl.umap(
    adata_scvi,
    color=["PTPRC", "CD4", "CD8A", "KLRF1", "MS4A1", "CD68", "EPCAM", "COL1A1", "CLDN5", "ALB"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    cmap="OrRd",
    use_raw=False
)

In [None]:
# check specific T cell markers to evaluate integration
sc.pl.umap(
    adata_scvi,
    color=["PTPRC", "CD4", "CD8B", "CCR7", "IL7R", "ITGAE", "ZNF683", "IFNG", "GZMK", "PDCD1", "HAVCR2", "FOXP3", "CXCL13", "MKI67", "TRDC", "TRAV1-2", "KLRF1"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    cmap="OrRd",
    use_raw=False
)

In [None]:
# perform clustering: one Leiden run per resolution, each stored under its own obs key
leiden_runs = {
    "leiden_res0_25": 0.25,
    "leiden_res0_5": 0.5,
    "leiden_res0_75": 0.75,
    "leiden_res1": 1.0,
}
for obs_key, res in leiden_runs.items():
    sc.tl.leiden(adata_scvi, key_added=obs_key, resolution=res)

In [None]:
# visulize clustering
sc.pl.umap(
    adata_scvi,
    color=["leiden_res0_25", "leiden_res0_5", "leiden_res0_75", "leiden_res1"],
    legend_loc="on data"
)

In [None]:
sc.pl.umap(
    adata_scvi,
    color=["leiden_res0_5", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
## subcluster cluster 18 (EMT-like / Hepatocytes), same for cluster 7 (includes CD8 T to NK)
# first call: re-cluster only cluster "18" of leiden_res0_5 and write the result to leiden_res0_5_1
sc.tl.leiden(adata_scvi, key_added="leiden_res0_5_1", resolution=0.1, restrict_to = ("leiden_res0_5", ["18"]))
# second call: starts from leiden_res0_5_1 (which already contains the split of 18),
# re-clusters cluster "7" and overwrites leiden_res0_5_1 with the combined result
sc.tl.leiden(adata_scvi, key_added="leiden_res0_5_1", resolution=0.1, restrict_to = ("leiden_res0_5_1", ["7"]))

In [None]:
sc.pl.umap(
    adata_scvi,
    color=["leiden_res0_5_1", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# compute marker genes
sc.tl.rank_genes_groups(
    adata_scvi, groupby="leiden_res0_5_1", method="wilcoxon", key_added="dea_leiden_res0_5_1", use_raw=False#, layer="rawcounts" # do not use raw!
)

In [None]:
'''
sc.tl.filter_rank_genes_groups(
    adata_scvi,
    min_in_group_fraction=0.2,
    max_out_group_fraction=0.2,
    key="dea_leiden_res0_5_1",
    key_added="dea_leiden_res0_5_1_filtered",
)
'''

In [None]:
def _min_max_scale(values):
    """Rescale values linearly to [0, 1]; empty input stays empty, constant input maps to zeros."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return arr
    lowest = arr.min()
    span = arr.max() - lowest
    if span == 0:
        # a single gene (or identical values) has no spread: avoid a 0/0 division
        return np.zeros_like(arr)
    return (arr - lowest) / span


# dictionary to hold DataFrames for each cluster
dfs = {}

# results of the differential expression test computed above
dea_results = adata_scvi.uns["dea_leiden_res0_5_1"]

# iterate over clusters
for cluster in adata_scvi.obs["leiden_res0_5_1"].unique().sort_values():
    group = str(cluster)

    # create a df for the current cluster's marker genes
    df = pd.DataFrame({
        "Gene": dea_results["names"][group].tolist(),
        "Score": dea_results["scores"][group].tolist(),
        "Pval": dea_results["pvals"][group].tolist(),
        "PvalAdj": dea_results["pvals_adj"][group].tolist(),
        "Log2Fold": dea_results["logfoldchanges"][group].tolist(),
    })

    # remove non significant genes and keep just positive log fold changes (> 0.4);
    # .copy() makes the filtered frame independent so columns can be added to it safely
    is_marker = (df["PvalAdj"] < 0.01) & (df["Pval"] < 0.01) & (df["Log2Fold"] > 0.4)
    df = df[is_marker].copy()

    # add scaled metrics as new df columns (a cluster with no gene passing the filters
    # yields an empty table instead of raising)
    df["ScaledScore"] = _min_max_scale(df["Score"])
    df["ScaledLog2Fold"] = _min_max_scale(df["Log2Fold"])

    # compute the final score
    df["CustomScore"] = df["ScaledLog2Fold"] * df["ScaledScore"]

    # sort df based on the final score in descending order
    df = df.sort_values(by="CustomScore", ascending=False)

    # add the DataFrame to the dictionary with the cluster as key
    dfs[cluster] = df

# save to excel
with pd.ExcelWriter(os.path.join(work_dir, "data", "markers", "Combined", "Combined_scVI-integrated_res0_5_markers_13-03-24.xlsx")) as writer:
    # save each cluster markers into a separate sheet
    for cluster, df in dfs.items():
        df.to_excel(writer, sheet_name=f'Cluster_{cluster}', index=False)

In [None]:
df.sort_values(by = ["ScaledScore"], ascending = False)

In [None]:
# known markers dotplot
markers = [ 
    "CD4",
    "CD8B",
    "KLRF1",
    "TRAV1-2",
    "FOXP3",
    "MKI67",
    "LYZ",
    "LILRA4",
    "TPSAB1",
    "EPCAM",
    "CLDN4",
    "COL1A1",
    "ALB",
    "CDH1",
]


sc.pl.dotplot(
    adata_scvi,
    groupby="leiden_res0_5_1",
    var_names=markers,
    use_raw=False
)


In [None]:
sc.pl.umap(
    adata_scvi,
    color=["leiden_res0_5_1", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# save clustered anndata object
adata_scvi.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scVI-integrated_cl_12-03-24.h5ad"))

In [None]:
# read clustered anndata object
adata_scvi = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scVI-integrated_cl_12-03-24.h5ad"))

In [None]:
# carry out level 1 annotation
# keys are the cluster labels of "leiden_res0_5_1"; labels of the form "<cluster>,<n>"
# are the sub-clusters created by the restricted Leiden runs on clusters 18 and 7
annotation = {
    "0":       "CD4 T",
    "1":       "CD8 T",
    "2":       "Tumor",
    "3":       "CD8 T",
    "4":       "Myeloid",
    "5":       "Tumor",
    "6":       "NK",
    "7,0":     "CD8 T",
    "7,1":     "NK",
    "8":       "CD4 T",
    "9":       "CD8 T",
    "10":      "CD4 T",
    "11":      "CD8 T",
    "12":      "B Cell",
    "13":      "Endothelial",
    "14":      "CAF",
    "15":      "Tumor",
    "16":      "pDC",
    "17":      "Plasma",
    "18,0":    "Liver Epithelial",          # not sure
    "18,1":    "Tumor",             # not sure
    "18,2":    "Liver Epithelial",
    "19":      "Myeloid",           # not sure
    "20":      "Myeloid",
    "21":      "Myeloid"  # mast
}

# translate every cell's cluster label into its level-1 cell type
adata_scvi.obs["Level_1_Annotation"] = adata_scvi.obs.leiden_res0_5_1.map(annotation)

In [None]:
# visualize annotation
sc.pl.umap(
    adata_scvi,
    color=["leiden_res0_5_1", "Level_1_Annotation", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# save annotated anndata object
adata_scvi.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_scVI-integrated_annot_20-03-24.h5ad"))

In [None]:
# read scVI integrated & annotated anndata object
adata_scvi = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_scVI-integrated_annot_20-03-24.h5ad"))

### 7.2 Harmony Integrated Object

In [None]:
# read adata object
adata_harmony = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_harmony-integrated_12-03-24.h5ad"))

In [None]:
adata_harmony

In [None]:
# identify technical sources of variation: color the Harmony UMAP by QC metrics
# (mitochondrial / ribosomal fraction, gene and UMI counts, doublet score) and by sample
sc.pl.umap(
    adata_harmony,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2
)

In [None]:
sc.pl.umap(
    adata_harmony,
    color=["PTPRC", "CD4", "CD8A", "KLRF1", "MS4A1", "CD68", "EPCAM", "COL1A1", "CLDN5", "ALB"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    cmap="OrRd",
    use_raw=False
)

In [None]:
# check specific T cell markers to evaluate integration
sc.pl.umap(
    adata_harmony,
    color=["PTPRC", "CD4", "CD8B", "CCR7", "IL7R", "ITGAE", "ZNF683", "IFNG", "GZMK", "PDCD1", "HAVCR2", "FOXP3", "CXCL13", "MKI67", "TRDC", "TRAV1-2", "KLRF1"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    cmap="OrRd",
    use_raw=False
)

In [None]:
# perform clustering
sc.tl.leiden(adata_harmony, key_added="leiden_res0_25", resolution=0.25)
sc.tl.leiden(adata_harmony, key_added="leiden_res0_5", resolution=0.5)
sc.tl.leiden(adata_harmony, key_added="leiden_res0_75", resolution=0.75)
sc.tl.leiden(adata_harmony, key_added="leiden_res1", resolution=1.0)

In [None]:
# visualize clustering: compare the four Leiden resolutions computed above side by side
sc.pl.umap(
    adata_harmony,
    color=["leiden_res0_25", "leiden_res0_5", "leiden_res0_75", "leiden_res1"],
    legend_loc="on data"
)

In [None]:
sc.pl.umap(
    adata_harmony,
    color=["leiden_res0_5", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=9
)

In [None]:
## subcluster cluster 6, 8, 14
sc.tl.leiden(adata_harmony, key_added="leiden_res0_5_1", resolution=0.2, restrict_to = ("leiden_res0_5", ["6"]))
sc.tl.leiden(adata_harmony, key_added="leiden_res0_5_1", resolution=0.2, restrict_to = ("leiden_res0_5_1", ["8"]))
sc.tl.leiden(adata_harmony, key_added="leiden_res0_5_1", resolution=0.1, restrict_to = ("leiden_res0_5_1", ["14"]))

In [None]:
sc.pl.umap(
    adata_harmony,
    color=["leiden_res0_5_1", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# compute marker genes
sc.tl.rank_genes_groups(
    adata_harmony, groupby="leiden_res0_5_1", method="wilcoxon", key_added="dea_leiden_res0_5_1", use_raw=False#, layer="rawcounts" # do not use raw!
)

In [None]:
'''
sc.tl.filter_rank_genes_groups(
    adata_scvi,
    min_in_group_fraction=0.2,
    max_out_group_fraction=0.2,
    key="dea_leiden_res0_5_1",
    key_added="dea_leiden_res0_5_1_filtered",
)
'''

In [None]:
# Build one filtered, ranked marker table per Harmony cluster and export all of
# them to a single Excel workbook (one sheet per cluster).

# differential expression results stored by sc.tl.rank_genes_groups above
dea = adata_harmony.uns["dea_leiden_res0_5_1"]

# dictionary to hold DataFrames for each cluster
dfs = {}

# iterate over clusters
for cluster in adata_harmony.obs["leiden_res0_5_1"].unique().sort_values():
    group = str(cluster)

    # create a df with the test statistics of every gene for the current cluster
    df = pd.DataFrame({
        "Gene": dea["names"][group].tolist(),
        "Score": dea["scores"][group].tolist(),
        "Pval": dea["pvals"][group].tolist(),
        "PvalAdj": dea["pvals_adj"][group].tolist(),
        "Log2Fold": dea["logfoldchanges"][group].tolist(),
    })

    # keep only significant genes (raw and adjusted p-value < 0.01) whose
    # log2 fold change exceeds 0.4; .copy() so that the columns added below are
    # written to an independent frame rather than to a slice of the full table
    keep = (df["PvalAdj"] < 0.01) & (df["Pval"] < 0.01) & (df["Log2Fold"] > 0.4)
    df = df[keep].copy()

    # min-max scale the score and the log2 fold change to [0, 1] within the cluster
    # NOTE: when all retained values are equal (e.g. a single gene) the range is 0
    # and the scaled column is NaN, exactly as in the previous implementation
    for raw_col, scaled_col in (("Score", "ScaledScore"), ("Log2Fold", "ScaledLog2Fold")):
        values = df[raw_col]
        df[scaled_col] = (values - values.min()) / (values.max() - values.min())

    # compute the final score: product of both scaled metrics
    df["CustomScore"] = df["ScaledLog2Fold"] * df["ScaledScore"]

    # sort df based on the final score in descending order
    df = df.sort_values(by="CustomScore", ascending=False)

    # add the DataFrame to the dictionary with the cluster as key
    dfs[cluster] = df

# save to excel
with pd.ExcelWriter(os.path.join(work_dir, "data", "markers", "Combined", "Combined_harmony-integrated_res0_5_markers_13-03-24.xlsx")) as writer:
    # save each cluster markers into a separate sheet
    for cluster, df in dfs.items():
        df.to_excel(writer, sheet_name=f'Cluster_{cluster}', index=False)

In [None]:
# known markers dotplot
markers = [ 
    "CD4",
    "CD8B",
    "KLRF1",
    "TRAV1-2",
    "FOXP3",
    "MKI67",
    "LYZ",
    "LILRA4",
    "TPSAB1",
    "EPCAM",
    "CLDN4",
    "COL1A1",
    "ALB",
    "CDH1",
]

sc.pl.dotplot(
    adata_harmony,
    groupby="leiden_res0_5_1",
    var_names=markers,
    use_raw=False
)

In [None]:
sc.pl.umap(
    adata_harmony,
    color=["leiden_res0_5_1", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# save clustered anndata object
adata_harmony.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_harmony-integrated_cl_12-03-24.h5ad"))

In [None]:
# read clustered anndata object
adata_harmony = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_harmony-integrated_cl_12-03-24.h5ad"))

In [None]:
# carry out level 1 annotation 
annotation = {
    "0":       "CD8 T",
    "1":       "Tumor",
    "2":       "CD4 T",
    "3":       "Myeloid",
    "4":       "CD4 T",
    "5":       "NK",
    "6,0":     "CD8 T",
    "6,1":     "NK",
    "6,2":     "CD8 T",
    "7":       "CD8 T",  
    "8,0":     "CD8 T",
    "8,1":     "Plasma",
    "9":       "Tumor",
    "10":      "B Cell",
    "11":      "Endothelial", 
    "12":      "CAF",
    "13":      "pDC",
    "14,0":    "Liver Epithelial", #not sure
    "14,1":    "Liver Epithelial", #not sure
    "14,2":    "Liver Epithelial", #not sure
    "14,3":    "Myeloid", #not sure
    "15":      "Myeloid",  #mast
    "16":      "Myeloid",
    "17":      "Noise", #?????  very few cells
    "18":      "Tumor" # very few cells
}

adata_harmony.obs["Level_1_Annotation"] = adata_harmony.obs.leiden_res0_5_1.map(annotation)

In [None]:
# visualize annotation
sc.pl.umap(
    adata_harmony,
    color=["leiden_res0_5_1", "Level_1_Annotation", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# save annotated anndata object
adata_harmony.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_harmony-integrated_annot_20-03-24.h5ad"))

In [None]:
# read harmony integrated & annotated anndata object
adata_harmony = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_harmony-integrated_annot_20-03-24.h5ad"))

### 7.3 scGen Integrated Object

In [None]:
# read adata object
adata_scgen = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scGen-integrated_12-03-24.h5ad"))

In [None]:
adata_scgen

In [None]:
# identify technical sources of variation: color the scGen UMAP by QC metrics
# (mitochondrial / ribosomal fraction, gene and UMI counts, doublet score) and by sample
sc.pl.umap(
    adata_scgen,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2
)

In [None]:
sc.pl.umap(
    adata_scgen,
    color=["PTPRC", "CD4", "CD8A", "KLRF1", "MS4A1", "CD68", "EPCAM", "COL1A1", "CLDN5", "ALB"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    cmap="OrRd",
    use_raw=False
)

In [None]:
# check specific T cell markers to evaluate integration
sc.pl.umap(
    adata_scgen,
    color=["PTPRC", "CD4", "CD8B", "CCR7", "IL7R", "ITGAE", "ZNF683", "IFNG", "GZMK", "PDCD1", "HAVCR2", "FOXP3", "CXCL13", "MKI67", "TRDC", "TRAV1-2", "KLRF1"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    cmap="OrRd",
    use_raw=False,
)

In [None]:
# perform clustering
sc.tl.leiden(adata_scgen, key_added="leiden_res0_25", resolution=0.25)
sc.tl.leiden(adata_scgen, key_added="leiden_res0_5", resolution=0.5)
sc.tl.leiden(adata_scgen, key_added="leiden_res0_75", resolution=0.75)
sc.tl.leiden(adata_scgen, key_added="leiden_res1", resolution=1.0)

In [None]:
# visualize clustering: compare the four Leiden resolutions computed above side by side
sc.pl.umap(
    adata_scgen,
    color=["leiden_res0_25", "leiden_res0_5", "leiden_res0_75", "leiden_res1"],
    legend_loc="on data"
)

In [None]:
sc.pl.umap(
    adata_scgen,
    color=["leiden_res0_5", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=9,
)

In [None]:
## subcluster clusters 6 and 11
# the second call restricts on "leiden_res0_5_1" so it refines the labels produced
# by the first call instead of starting again from "leiden_res0_5"
sc.tl.leiden(adata_scgen, key_added="leiden_res0_5_1", resolution=0.1, restrict_to = ("leiden_res0_5", ["6"]))
sc.tl.leiden(adata_scgen, key_added="leiden_res0_5_1", resolution=0.1, restrict_to = ("leiden_res0_5_1", ["11"]))

In [None]:
sc.pl.umap(
    adata_scgen,
    color=["leiden_res0_5_1", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# compute marker genes
sc.tl.rank_genes_groups(
    adata_scgen, groupby="leiden_res0_5_1", method="wilcoxon", key_added="dea_leiden_res0_5_1", use_raw=False
)

In [None]:
'''
sc.tl.filter_rank_genes_groups(
    adata_scvi,
    min_in_group_fraction=0.2,
    max_out_group_fraction=0.2,
    key="dea_leiden_res0_5_1",
    key_added="dea_leiden_res0_5_1_filtered",
)
'''

In [None]:
# Build one filtered, ranked marker table per scGen cluster and export all of
# them to a single Excel workbook (one sheet per cluster).

# differential expression results stored by sc.tl.rank_genes_groups above
dea = adata_scgen.uns["dea_leiden_res0_5_1"]

# dictionary to hold DataFrames for each cluster
dfs = {}

# iterate over clusters
for cluster in adata_scgen.obs["leiden_res0_5_1"].unique().sort_values():
    group = str(cluster)

    # create a df with the test statistics of every gene for the current cluster
    df = pd.DataFrame({
        "Gene": dea["names"][group].tolist(),
        "Score": dea["scores"][group].tolist(),
        "Pval": dea["pvals"][group].tolist(),
        "PvalAdj": dea["pvals_adj"][group].tolist(),
        "Log2Fold": dea["logfoldchanges"][group].tolist(),
    })

    # keep only significant genes (raw and adjusted p-value < 0.01) whose
    # log2 fold change exceeds 0.4; .copy() so that the columns added below are
    # written to an independent frame rather than to a slice of the full table
    keep = (df["PvalAdj"] < 0.01) & (df["Pval"] < 0.01) & (df["Log2Fold"] > 0.4)
    df = df[keep].copy()

    # min-max scale the score and the log2 fold change to [0, 1] within the cluster
    # NOTE: when all retained values are equal (e.g. a single gene) the range is 0
    # and the scaled column is NaN, exactly as in the previous implementation
    for raw_col, scaled_col in (("Score", "ScaledScore"), ("Log2Fold", "ScaledLog2Fold")):
        values = df[raw_col]
        df[scaled_col] = (values - values.min()) / (values.max() - values.min())

    # compute the final score: product of both scaled metrics
    df["CustomScore"] = df["ScaledLog2Fold"] * df["ScaledScore"]

    # sort df based on the final score in descending order
    df = df.sort_values(by="CustomScore", ascending=False)

    # add the DataFrame to the dictionary with the cluster as key
    dfs[cluster] = df

# save to excel
with pd.ExcelWriter(os.path.join(work_dir, "data", "markers", "Combined", "Combined_scGen-integrated_res0_5_markers_13-03-24.xlsx")) as writer:
    # save each cluster markers into a separate sheet
    for cluster, df in dfs.items():
        df.to_excel(writer, sheet_name=f'Cluster_{cluster}', index=False)

In [None]:
# known markers dotplot
markers = [ 
    "CD4",
    "CD8B",
    "KLRF1",
    "TRAV1-2",
    "FOXP3",
    "MKI67",
    "LYZ",
    "LILRA4",
    "TPSAB1",
    "EPCAM",
    "CLDN4",
    "COL1A1",
    "ALB",
    "CDH1",
]

sc.pl.dotplot(
    adata_scgen,
    groupby="leiden_res0_5_1",
    var_names=markers,
    use_raw=False
)

In [None]:
sc.pl.umap(
    adata_scgen,
    color=["leiden_res0_5_1", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# save clustered anndata object
adata_harmony.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scGen-integrated_cl_12-03-24.h5ad"))

In [None]:
# read clustered anndata object
adata_harmony = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_all_scGen-integrated_cl_12-03-24.h5ad"))

In [None]:
# carry out level 1 annotation 
annotation = {
    "0":       "CD8 T",
    "1":       "CD4 T",
    "2":       "Tumor",
    "3":       "NK",
    "4":       "Myeloid",
    "5":       "CD8 T",
    "6,0":     "Endothelial",
    "6,1":     "CAF",
    "6,2":     "Endothelial",  # lymphatic vessels
    "6,3":     "CAF",
    "7":       "CD8 T",  
    "8":       "Tumor",
    "9":       "B Cell",
    "10":      "pDC",
    "11,0":    "Plasma", 
    "11,1":    "Liver Epithelial", 
    "12":      "Liver Epithelial",
    "13":      "Myeloid", #cDC
    "14":      "Myeloid", #mast
    "15":      "Noise", 
}

adata_scgen.obs["Level_1_Annotation"] = adata_scgen.obs.leiden_res0_5_1.map(annotation)

In [None]:
# visualize annotation
sc.pl.umap(
    adata_scgen,
    color=["leiden_res0_5_1", "Level_1_Annotation", "Annotation_1.0"],
    legend_loc="on data",
    legend_fontsize=7
)

In [None]:
# save annotated anndata object
adata_scgen.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_scGen-integrated_annot_20-03-24.h5ad"))

In [None]:
# read scGen integrated & annotated anndata object
adata_scgen = sc.read(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_scGen-integrated_annot_20-03-24.h5ad"))

### 08. Annotation Comparison (Mini Integration Benchmark)

In [None]:
# https://docs.scvi-tools.org/en/latest/tutorials/notebooks/scrna/scanvi_fix.html

In [None]:
# load anndata objects
adata_scvi = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_scVI-integrated_annot_20-03-24.h5ad"))
adata_harmony = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_harmony-integrated_annot_20-03-24.h5ad"))
adata_scgen = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_pp_scGen-integrated_annot_20-03-24.h5ad"))

In [None]:
adata = adata_scvi.copy()

In [None]:
# transfer embeddings, and annotation

adata.obs["scVI_Annotation_1.0"] = adata_scvi.obs["Level_1_Annotation"]
adata.obs["harmony_Annotation_1.0"] = adata_harmony.obs["Level_1_Annotation"]
adata.obs["scGen_Annotation_1.0"] = adata_scgen.obs["Level_1_Annotation"]

adata.obs["Prior_Annotation_1.0"] = adata_scgen.obs["Annotation_1.0"]
adata.obs["Prior_Annotation_2.0"] = adata_scgen.obs["Annotation_2.0"]

adata.obsm["X_pca_harmony"] = adata_harmony.obsm["X_pca_harmony"]
adata.obsm["X_scVI"] = adata_scvi.obsm["X_scVI"]
adata.obsm["scGen_corrected_latent"] = adata_scgen.obsm["scGen_corrected_latent"]

In [None]:
adata.obs.drop("Annotation_1.0", axis=1, inplace=True)
adata.obs["Annotation_1.0"] = adata.obs["Level_1_Annotation"]
adata.obs.drop("Level_1_Annotation", axis=1, inplace=True)

In [None]:
# adjust annotation labels to have consistency
# consensus annotation: T Cell, B Cell, NK, Plasma, pDC, Tumor, Endothelial, CAF, Liver Epithelial, Myeloid
print(adata.obs["Prior_Annotation_1.0"].unique())
adata.obs["Prior_Annotation_1.0"] = adata.obs["Prior_Annotation_1.0"].replace({"T Cells": "T Cell", 
                                                                        "B Cells": "B Cell",
                                                                        "CAFs": "CAF",
                                                                        "Plasma Cells": "Plasma",
                                                                        "pDCs": "pDC",
                                                                        "Hepatocytes": "Liver Epithelial",
                                                                        "EMT-like": "Liver Epithelial",
                                                                        })
print(adata.obs["Prior_Annotation_1.0"].unique())

In [None]:
adata.obs

In [None]:
# compute neighbors and umap for all embeddings
# Each pass builds a neighbor graph on one integrated representation, runs UMAP on
# that graph (which overwrites adata.obsm["X_umap"]) and stores a copy of the result
# under a method-specific key. Order: scGen, scVI, Harmony -- so X_umap holds the
# Harmony layout once the loop has finished.
embeddings = [
    ("scGen", "scGen_corrected_latent"),
    ("scVI", "X_scVI"),
    ("harmony", "X_pca_harmony"),
]

for method, representation in embeddings:
    graph_key = f"{method}_neighbors"
    sc.pp.neighbors(adata, use_rep=representation, key_added=graph_key)
    sc.tl.umap(adata, neighbors_key=graph_key)
    adata.obsm[f"{method}_umap"] = adata.obsm["X_umap"].copy()

In [None]:
# compare embeddings: for every integration method save one figure with three UMAP
# panels (prior annotation, the method's own annotation, subproject)
plt.rcParams["figure.figsize"] = (3, 3)
sc.settings.figdir = os.path.join(work_dir, "figures", "TFM", "FigS1/")
sc.set_figure_params(dpi=120, dpi_save=300, format='pdf')

# (prefix used in obs / obsm keys and in the file name, name shown in the panel title)
integration_methods = [
    ("scVI", "scVI"),
    ("harmony", "Harmony"),
    ("scGen", "scGen"),
]

for prefix, display_name in integration_methods:
    sc.pl.embedding(
        adata,
        basis=f"{prefix}_umap",
        color=["Prior_Annotation_1.0", f"{prefix}_Annotation_1.0", "subproject"],
        use_raw=False,
        wspace=.3,
        legend_fontsize=8,
        frameon=False,
        title=["Prior Annotation 1.0", f"{display_name} Annotation 1.0", "Subproject"],
        ncols=3,
        show=False,
        save=f"integration_{prefix}.pdf",
    )

In [None]:
# plot confusion matrices: one heatmap per integration method, comparing the
# method-specific annotation (rows) against the prior annotation (columns)
import seaborn as sns

plt.rcParams["figure.figsize"] = [18, 12]

# column order shared by the three panels (labels of the prior annotation)
prior_labels = ["B Cell", "CAF", "T Cell", "Endothelial", "Liver Epithelial", "Myeloid", "NK", "Plasma", "Tumor", "pDC"]


def _plot_confusion(ax, annotation_key, ylabel, xlabel=''):
    """Draw the confusion matrix of `annotation_key` vs. the prior annotation on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw the heatmap on.
    annotation_key : column of ``adata.obs`` holding the annotation to evaluate.
    ylabel, xlabel : axis labels of the panel.

    Returns
    -------
    The matrix returned by ``sc.metrics.confusion_matrix``, with its columns
    reindexed to ``prior_labels``.
    """
    cmtx = sc.metrics.confusion_matrix(annotation_key, "Prior_Annotation_1.0", adata.obs)
    cmtx = cmtx.reindex(columns=prior_labels)

    sns.heatmap(
        cmtx,
        annot=True,
        cbar=False,
        square=True,
        cmap="copper",
        ax=ax,
        annot_kws={'size': 5},
    )
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=10)
    ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize=10)
    return cmtx


fig, (ax1, ax2, ax3) = plt.subplots(ncols=1, nrows=3)

# only the bottom panel carries the x-axis label
cmtx1 = _plot_confusion(ax1, "scVI_Annotation_1.0", 'scVI Annotation 1.0')
cmtx2 = _plot_confusion(ax2, "harmony_Annotation_1.0", 'Harmony Annotation 1.0')
cmtx3 = _plot_confusion(ax3, "scGen_Annotation_1.0", 'scGen Annotation 1.0', xlabel='Groundtruth Annotation 1.0')

fig.subplots_adjust(wspace=.6, hspace=.6)

# os.path.join instead of string concatenation: the output path stays correct
# whether or not fig_dir ends with a path separator
plt.savefig(os.path.join(fig_dir, 'Confusion_matrices_integration_bench.png'), dpi=300)
plt.savefig(os.path.join(work_dir, "figures", "TFM", "FigS1", 'Confusion_matrices_integration_bench.pdf'), dpi=300)

plt.show()

In [None]:
sc.settings.figdir = os.path.join(work_dir, "figures", "combined", "main_analysis", "final_figures/")
sc.set_figure_params(dpi=120, dpi_save=300, format='png')

In [None]:
import seaborn as sns
sns.__version__

In [None]:
from scib_metrics.benchmark import Benchmarker
bm = Benchmarker(
    adata,
    batch_key="subproject",
    label_key="Prior_Annotation_1.0",
    embedding_obsm_keys=[
        "X_pca",
        "X_pca_harmony",
        "X_scVI",
        "scGen_corrected_latent"
    ],
    n_jobs=16,
        
)
bm.benchmark()

In [None]:
bm.plot_results_table(min_max_scale=False)

In [None]:
# add random as control -- randomized PCA
adata.varm

In [None]:
adata.obs["Annotation_1.0"].unique()

In [None]:
# immune cells only benchmark
adata_immune = adata[adata.obs["Annotation_1.0"].isin(["CD8 T", "CD4 T", "NK", "pDC", "Myeloid", "Plasma", "B Cell"])].copy()

In [None]:
adata_immune.obs["Annotation_1.0"]

In [None]:
from scib_metrics.benchmark import Benchmarker
bm2 = Benchmarker(
    adata_immune,
    batch_key="subproject",
    label_key="Prior_Annotation_1.0",
    embedding_obsm_keys=[
        "X_pca",
        "X_pca_harmony",
        "X_scVI",
        "scGen_corrected_latent"
    ],
    n_jobs=16,
        
)
bm2.benchmark()

In [None]:
bm2.plot_results_table(min_max_scale=False)

In [None]:
# save anndata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

In [None]:
# read anndata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

### Integration Method Selection

In [None]:
# adjust parameters to selected method
adata.obs["Annotation_1.0"] = adata.obs["scVI_Annotation_1.0"]
adata.obsm["X_umap"] = adata.obsm["scVI_umap"].copy()
adata.uns["neighbors"] = adata.uns["scVI_neighbors"].copy()

In [None]:
# add response and ICI status categorical variables in metadata
# response: patient "08" is labelled "SD", every other patient "PD"
adata.obs["response"] = np.where(adata.obs["patient"] == "08", "SD", "PD")

# ICI status, evaluated for all cells at once instead of one .loc read/write per cell:
#   timepoint SCR                  -> "-ICI"
#   timepoint C02 and response PD  -> "+ICI/PD"
#   anything else                  -> "+ICI/SD"
# np.select takes the first matching condition, mirroring the if / elif / else order
is_screening = (adata.obs["timepoint"] == "SCR").to_numpy()
is_c02_pd = ((adata.obs["timepoint"] == "C02") & (adata.obs["response"] == "PD")).to_numpy()
adata.obs["ICI_status"] = np.select(
    [is_screening, is_c02_pd],
    ["-ICI", "+ICI/PD"],
    default="+ICI/SD",
)
          

In [None]:
# add condition variable
adata.obs["Condition"] = np.where(adata.obs["timepoint"] == "SCR", "T0/-ICI", "T1/+ICI")
adata.obs["Timepoint"] = np.where(adata.obs["timepoint"] == "SCR", "T0", "T1")


In [None]:
adata.obs

In [None]:
# save anndata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

In [None]:
# read anndata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

### Visualization

In [None]:
sc.settings.figdir = os.path.join(work_dir, "figures", "combined", "main_analysis")
sc.set_figure_params(dpi=300, dpi_save=300, format='png', fontsize=7)
sc.set_figure_params(figsize=(6, 6))

In [None]:
# cell type UMAP
sc.pl.umap(
        adata,
        color="Annotation_1.0",
        vmin=0,
        vmax="p99",  
        sort_order=False,  
        frameon=True,
        use_raw=False,
        title="Annotation 1.0",
        legend_loc="on data",
        save="Annotation_1.0.png",
        palette=["limegreen", "saddlebrown", "tomato", "red", "gold", "hotpink", "royalblue", "orange", "darkgreen", "dimgray", "skyblue"]
    )

In [None]:
# visualize annotation
sc.pl.umap(
        adata,
        color="leiden_res0_5_1",
        vmin=0,
        vmax="p99",  
        sort_order=False,  
        frameon=True,
        use_raw=False,
        legend_loc="on data",
        save="Clustering_res0_5_1.png",
)

In [None]:
# define markers
markers = [ 
    "CD4",
    "CD8B",
    "KLRF1",
    "LYZ",
    "LILRA4",
    "TPSAB1",
    "CD79A",
    "JCHAIN",
    "EPCAM",
    "CLDN4",
    "COL1A1",
    "ALB"
]


In [None]:
sc.pl.umap(
        adata,
        color=markers,
        vmin=0,
        vmax="p99",  
        sort_order=False,  
        frameon=True,
        use_raw=False,
        #legend_loc="on data",
        save="Markers.png",
        legend_loc=None,
        ncols=4
)

In [None]:
# known markers dotplot
sc.pl.dotplot(
    adata,
    groupby="leiden_res0_5_1",
    var_names=markers,
    use_raw=False,
    save="Main_cell_types_markers",
    cmap="Purples"
)


In [None]:
sc.pl.matrixplot(
    adata,
    groupby="leiden_res0_5_1",
    var_names=markers,
    use_raw=False,
    save="Main_cell_types_markers",
    cmap="Purples"
)

In [None]:
sc.tl.dendrogram(adata, groupby='leiden_res0_5_1')

In [None]:
# plot top 3 marker genes per cluster (n_genes=3), taken from the
# "dea_leiden_res0_5_1" ranking; expression is scaled per gene (standard_scale="var")
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby="leiden_res0_5_1",
    standard_scale="var",
    n_genes=3,
    key="dea_leiden_res0_5_1",
    use_raw=False,
    cmap="Purples",
    save="Cluster_Marker_Genes_Dotplot.png"
)

### Check Expression of CD80/86(CD28)-CTLA4, CD274(PDL1)-PDCD1(PD1)

In [None]:
# read anndata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

In [None]:
# define ICI markers
ici_markers = ["CD80", "CD86", "CD28", "CTLA4", "CD274", "PDCD1"]

In [None]:
sc.pl.umap(
        adata,
        color=ici_markers,
        vmin=0,
        vmax="p99",  
        sort_order=False,  
        frameon=True,
        use_raw=False,
        title=["CD80 (B7.1)", "CD86 (B7.2)", "CD28", "CTLA4", "CD274 (PDL1)", "PDCD1 (PD1)"],
        #legend_loc="on data",
        save="ICI_Markers.png",
        legend_loc=None,
        ncols=4
)

In [None]:
# visualize their expression via dotplot
sc.pl.dotplot(
    adata,
    groupby="Annotation_1.0",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    save="ICI_Markers_Dotplot.png"
)

In [None]:
# visualize their expression via dotplot
sc.pl.dotplot(
    adata,
    groupby="patient",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    save="ICI_Markers_Dotplot_ByPatient.png"
)

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), gridspec_kw={"wspace": .3})

plot1 = sc.pl.dotplot(
    adata[adata.obs.timepoint == "SCR"],
    groupby="patient",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    show=False,
    title="T0/-ICI",
    ax=ax1
)

plot2 = sc.pl.dotplot(
    adata[adata.obs.timepoint == "C02"],
    groupby="patient",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    show=False,
    title="T1/+ICI",
    ax=ax2
)

plt.show()

In [None]:
sc.pl.dotplot(
    adata[adata.obs.timepoint == "C02"],
    groupby="patient",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
)

In [None]:
sc.pl.dotplot(
    adata,
    groupby="Prior_Annotation_2.0",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    save="ICI_Markers__Subtypes_Dotplot.png"

)

## Add Annotation 2.0 from TNK and Myeloid subsets

In [None]:
# read subsets anndata objects
adata_tnk = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad"))
adata_myeloid = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_Myeloid_annotated_18-04-24.h5ad"))

In [None]:
# Transfer the fine-grained "Annotation_2.0" labels from the TNK and myeloid subset
# objects onto the full object. Cells found in neither subset keep their
# "Annotation_1.0" label; a cell present in both subsets takes the TNK label.
# The barcode -> label dictionaries are built once, so each cell costs one hash
# lookup instead of a linear scan over a freshly built list of obs_names.
tnk_labels = dict(zip(adata_tnk.obs_names, adata_tnk.obs["Annotation_2.0"]))
myeloid_labels = dict(zip(adata_myeloid.obs_names, adata_myeloid.obs["Annotation_2.0"]))

new_annotation = [
    tnk_labels[cell] if cell in tnk_labels
    else myeloid_labels[cell] if cell in myeloid_labels
    else coarse_label
    for cell, coarse_label in zip(adata.obs_names, adata.obs["Annotation_1.0"])
]

adata.obs["Annotation_2.0"] = new_annotation

In [None]:
adata.obs["Annotation_2.0"].unique()

In [None]:
# check transfer 
sc.pl.umap(
        adata,
        color="Annotation_2.0",
        vmin=0,
        vmax="p99",  
        sort_order=False,
        frameon=True,
        use_raw=False,
        title="Annotation 2.0",
        #legend_loc="on data",
        #palette=["limegreen", "saddlebrown", "tomato", "red", "gold", "hotpink", "royalblue", "orange", "darkgreen", "dimgray", "skyblue"]
    )

In [None]:
sc.pl.dotplot(
    adata,
    groupby="Annotation_2.0",
    var_names=ici_markers,
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu"
)

## Explore MTRNR2L8 & AC105402.3 Expression

In [None]:
# read anndata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6), gridspec_kw={"wspace": 1})

ax1_dict=sc.pl.dotplot(
    adata,
    groupby="patient",
    var_names=["MTRNR2L8", "AC105402.3", "KIF5C"],
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    show=False,
    ax=ax1
)
ax2_dict=sc.pl.dotplot(
    adata,
    groupby="Annotation_1.0",
    var_names=["MTRNR2L8", "AC105402.3", "KIF5C"],
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    show=False,
    ax=ax2
)
ax3_dict=sc.pl.dotplot(
    adata,
    groupby="sample",
    var_names=["MTRNR2L8", "AC105402.3", "KIF5C"],
    standard_scale="var", 
    use_raw=False,
    cmap="RdYlBu",
    show=False,
    ax=ax3
)

## Get marker genes of P08's cDC2 Cluster

In [None]:
# read anndata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

In [None]:
# subset P08
adata = adata[adata.obs["patient"] == "08"].copy()
adata

In [None]:
# compute marker genes
sc.tl.rank_genes_groups(
    adata, groupby="Annotation_2.0", method="wilcoxon", groups = ["cDC2"], use_raw=False, key_added="dea_Annotation_2.0"#, layer="rawcounts" # do not use raw!
)

In [None]:
# get marker genes of the cDC2 group from the "dea_Annotation_2.0" ranking
cdc2_fields = {
    "Gene": "names",
    "Score": "scores",
    "Pval": "pvals",
    "PvalAdj": "pvals_adj",
    "Log2Fold": "logfoldchanges",
}

# create a df with one row per gene and one column per test statistic
df = pd.DataFrame({
    column: adata.uns["dea_Annotation_2.0"][field]["cDC2"].tolist()
    for column, field in cdc2_fields.items()
})

# remove non significant genes (raw and adjusted p-value must both be < 0.01)
# and keep only genes with a log2 fold change above 0.4
is_marker = (df["PvalAdj"] < 0.01) & (df["Pval"] < 0.01) & (df["Log2Fold"] > 0.4)

# sort the remaining genes by test score in descending order
df = df[is_marker].sort_values(by="Score", ascending=False)

print(df)

In [None]:
# save to excel (pandas also writes the DataFrame index as the first column
# unless index=False is passed)
df.to_excel(
    os.path.join(
        work_dir,
        "data",
        "markers",
        "Patient_08",
        "P08_cDC2_marker_genes_14-05-24.xlsx",
    )
)

## Get df with cell, sample id & Annotation for Sam

In [None]:
# read anndata object from disk; rebinds `adata`, replacing any object
# previously held under that name
adata = sc.read_h5ad(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad",
    )
)

In [None]:
# extract df from obs: one row per cell, keeping only the sample id and the
# two annotation columns
df = adata.obs[
    [
        "sample",
        "Annotation_1.0",
        "Annotation_2.0",
    ]
]
df

In [None]:
# format index names: drop the first character of every index label
# NOTE(review): assumes every label starts with a single unwanted prefix
# character — confirm against the actual obs index
df.index = [label[1:] for label in df.index]

In [None]:
# save to csv (the index labels are written as the first column)
df.to_csv(
    os.path.join(
        work_dir,
        "data",
        "SP_annotations.csv",
    )
)

## Thesis / Poster Plots

In [None]:
# set figure params: every figure saved from here on goes to
# <work_dir>/figures/TFM/Fig1
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"
fig_dir = os.path.join(work_dir, "figures", "TFM", "Fig1")
sc.settings.figdir = fig_dir
# small 3x3 frameless figures; saved as PDF at 600 dpi, shown inline at 120 dpi
sc.set_figure_params(
    figsize=(3, 3),
    frameon=False,
    format="pdf",
    dpi=120,
    dpi_save=600,
)

In [None]:
# read anndata object from disk; rebinds `adata`, replacing any object
# previously held under that name
adata = sc.read_h5ad(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad",
    )
)

In [None]:
# count number of cells: displaying the AnnData object prints its summary,
# whose first line reports n_obs × n_vars (n_obs is the number of cells)
adata

In [None]:
# Basic Annotation UMAP
# Colours passed to sc.pl.umap as the palette for the "Annotation_1.0" groups.
palette = [
    "limegreen",
    "saddlebrown",
    "tomato",
    "red",
    "gold",
    "hotpink",
    "royalblue",
    "orange",
    "darkgreen",
    "dimgray",
    "skyblue",
]

sc.pl.umap(
    adata,
    color="Annotation_1.0",
    vmin=0,
    vmax="p99",
    sort_order=False,
    frameon=False,
    use_raw=False,
    palette=palette,
    # The cell count in the title is read from the object instead of being
    # typed in by hand, so it stays correct if the data are filtered or
    # a different file is loaded.
    title=f"TME Cells ({adata.n_obs})",
    legend_fontsize=8,
    save="_Main_Annotation.pdf",
)

In [None]:
# One UMAP panel per gene in `ici_markers`, laid out two per row. `title`
# supplies one panel title per gene, in the same order as `ici_markers`.
ici_markers = ["CD80", "CD86", "CD28", "CTLA4", "CD274", "PDCD1"]
sc.pl.umap(
    adata,
    color=ici_markers,
    title=[
        "CD80 (B7.1)",
        "CD86 (B7.2)",
        "CD28",
        "CTLA4",
        "CD274 (PDL1)",
        "PDCD1 (PD1)",
    ],
    ncols=2,
    use_raw=False,
    # colour scale: 0 up to the 99th percentile of each gene
    vmin=0,
    vmax="p99",
    cmap="Reds",
    # sort_order=True draws the highest-valued cells last, i.e. on top
    sort_order=True,
    add_outline=False,
    frameon=True,
    # no legend and no colour bar on the panels
    legend_loc=None,
    colorbar_loc=None,
    save="ICI_Markers.pdf",
)

In [None]:
# add new patient variable: relabel the original patient ids as P01..P05
patient_mapping = dict(
    zip(
        ("01", "02", "03", "08", "10"),
        ("P01", "P02", "P03", "P04", "P05"),
    )
)
# NOTE(review): Series.map leaves NaN for any patient id that is not a key of
# patient_mapping — confirm these five ids cover every cell
adata.obs["new_patient"] = adata.obs["patient"].map(patient_mapping)

In [None]:
# Dot plot of the `ici_markers` genes per relabelled patient
# (obs["new_patient"]); each gene is rescaled across patients
# (standard_scale="var"). The figure is also saved via save=.
ici_markers = ["CD80", "CD86", "CD28", "CTLA4", "CD274", "PDCD1"]

sc.pl.dotplot(
    adata,
    var_names=ici_markers,
    groupby="new_patient",
    use_raw=False,
    standard_scale="var",
    cmap="RdYlBu",
    save="ICI_Markers.pdf",
)

In [None]:
# Dot plot of a fixed gene list per "Annotation_1.0" group, each gene
# rescaled across groups (standard_scale="var"); shown only, not saved.
sc.pl.dotplot(
    adata,
    var_names=[
        "PTPRC",
        "CD4",
        "CD8A",
        "KLRF1",
        "MS4A1",
        "CD68",
        "EPCAM",
        "COL1A1",
        "CLDN5",
        "ALB",
    ],
    groupby="Annotation_1.0",
    use_raw=False,
    standard_scale="var",
    cmap="RdYlBu",
)

In [None]:
# Genes to plot, in display order.
# NOTE(review): this list has "CLDN4" whereas the other marker lists in this
# notebook use "CLDN5" — confirm which gene is intended.
markers = [
    "CD4", "CD8B", "KLRF1", "LYZ",
    "LILRA4", "TPSAB1", "CD79A", "JCHAIN",
    "EPCAM", "CLDN4", "COL1A1", "ALB",
]


In [None]:
# Matrix plot of the `markers` genes per "Annotation_1.0" group, each gene
# rescaled across groups (standard_scale="var"); default colour map, shown
# only, not saved.
sc.pl.matrixplot(
    adata,
    var_names=markers,
    groupby="Annotation_1.0",
    use_raw=False,
    standard_scale="var",
)

In [None]:
# Wilcoxon marker-gene test for every "Annotation_1.0" group, not run on
# .raw; results are stored under adata.uns["dea_Annotation_1.0"].
sc.tl.rank_genes_groups(
    adata,
    groupby="Annotation_1.0",
    method="wilcoxon",
    use_raw=False,
    key_added="dea_Annotation_1.0",
)

In [None]:
# plot the top 3 marker genes (n_genes=3) per "Annotation_1.0" group, taken
# from the results stored under adata.uns["dea_Annotation_1.0"]; each gene is
# rescaled across groups (standard_scale="var"). Shown only, not saved.
sc.pl.rank_genes_groups_dotplot(
    adata,
    key="dea_Annotation_1.0",
    groupby="Annotation_1.0",
    n_genes=3,
    use_raw=False,
    standard_scale="var",
)

In [None]:
# Marker genes grouped under a display label; insertion order is the order in
# which the groups are laid out when the dict is passed as var_names.
markers_dict = {
    label: genes.split()
    for label, genes in [
        ("Immune", "PTPRC"),
        ("T Cell", "CD3D CD3E"),
        ("CD4 T", "CD4"),
        ("CD8 T", "CD8A CD8B"),
        ("NK", "KLRF1 NKG7 GNLY"),
        ("Myeloid", "LYZ CD68 CD14 FCGR3A"),
        ("Mast", "TPSAB1"),
        ("pDC", "LILRA4 IL3RA"),
        ("Plasma", "JCHAIN MZB1"),
        ("B Cell", "CD79A MS4A1"),
        ("Tumor", "EPCAM"),
        ("Liver Epithelial", "ALB CYP1A1"),
        ("Endothelial", "CLDN5 PECAM1"),
        ("Fibroblast", "COL1A1 COL3A1"),
    ]
}

In [None]:
# order cat values: rebuild "Annotation_1.0" as a categorical whose
# categories follow `order`
# NOTE(review): pd.Categorical turns any value not listed in `order` into
# NaN — confirm the list covers every existing label
order = [
    "CD4 T",
    "CD8 T",
    "NK",
    "Myeloid",
    "pDC",
    "Plasma",
    "B Cell",
    "Tumor",
    "Liver Epithelial",
    "Endothelial",
    "CAF",
]
adata.obs["Annotation_1.0"] = pd.Categorical(
    adata.obs["Annotation_1.0"],
    categories=order,
)

In [None]:
# Dot plot of the grouped marker genes in `markers_dict` per "Annotation_1.0"
# group, without a dendrogram; each gene is rescaled across groups
# (standard_scale="var"). The figure is also saved via save=.
sc.pl.dotplot(
    adata,
    markers_dict,
    groupby="Annotation_1.0",
    use_raw=False,
    dendrogram=False,
    standard_scale="var",
    color_map="Reds",
    smallest_dot=40,
    figsize=(15, 4),
    save="Marker_Genes_Dotplot2.pdf",
)

In [None]:
# number of cells (rows of adata.obs) per value of the "sample" column
adata.obs['sample'].value_counts()

In [None]:
# number of cells (rows of adata.obs) per "Annotation_1.0" label
adata.obs['Annotation_1.0'].value_counts()

In [None]:
# share of the first count in the sum of the two counts
# NOTE(review): both numbers are typed in by hand, presumably copied from a
# value_counts() output — confirm which groups they refer to; they will go
# stale if the data change
28102/(28102+18375)

In [None]:
# One UMAP panel per gene in `markers`, all six in a single row (ncols=6).
markers = ["CD4", "CD8B", "GZMB", "HAVCR2", "FOXP3", "HLA-DRA"]
sc.pl.umap(
    adata,
    color=markers,
    ncols=6,
    use_raw=False,
    # colour scale: 0 up to the 99th percentile of each gene
    vmin=0,
    vmax="p99",
    cmap="inferno",
    # sort_order=True draws the highest-valued cells last, i.e. on top
    sort_order=True,
    add_outline=False,
    frameon=False,
    # no legend and no colour bar on the panels
    legend_loc=None,
    colorbar_loc=None,
    save="Markers.pdf",
)