# SERPENTINE -- Combined Plasma & B Cells Subclustering

## Environment Setup

In [None]:
# load packages
import sys
import scanpy as sc
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scvi
import seaborn as sns
import scipy

In [None]:
import anndata2ri
import logging

import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

# only let rpy2 callback messages of level ERROR or above through
rcb.logger.setLevel(logging.ERROR)
# activate the pandas2ri and anndata2ri conversion layers for the R bridge
ro.pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

In [None]:
# import helper functions
from helper_functions import save_markers, compute_signature_score

In [None]:
# silence every Python warning for the rest of the session
# NOTE(review): this also hides warnings emitted by scanpy/anndata -- confirm this is wanted
import warnings
warnings.filterwarnings('ignore')

In [None]:
# set up figure parameters: 4x4 inch default figures, quiet scanpy logging,
# high-resolution white-background plots without axis frames
# NOTE(review): set_figure_params is called again further down in this notebook;
# presumably each call re-applies defaults for arguments it is not given -- confirm
plt.rcParams['figure.figsize'] = (4.0, 4.0)
sc.settings.verbosity = 0
sc.settings.set_figure_params(
    dpi=600,
    facecolor="white",
    frameon=False,
)

In [None]:
# default scanpy figure size: 4x4 inches
# NOTE(review): only figsize is passed here; presumably dpi/frameon fall back to
# scanpy defaults rather than the values set in the previous cell -- confirm
sc.set_figure_params(figsize=(4, 4))

In [None]:
# project root and the folder where scanpy writes figures for this analysis
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"
fig_dir = os.path.join(work_dir, "figures", "combined", "plasmaB", "preprocessing/")

# scanpy saves every `save=` figure below fig_dir, as 300 dpi PNG
sc.settings.figdir = fig_dir
sc.set_figure_params(dpi=300, dpi_save=300, format='png')

In [None]:
# read the full, integrated and annotated AnnData object (all cell types)
adata_full = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

## Data Preparation

### Full Object Visualization

In [None]:
# display the AnnData summary of the full object as cell output
adata_full

In [None]:
# display the per-cell metadata table
adata_full.obs

In [None]:
# UMAP of the full object coloured by annotation and by the batch-like covariates,
# two panels per row with small legends in the right margin
sc.pl.umap(
    adata_full,
    color=["Annotation_1.0", "timepoint", "project", "subproject", "sample"],
    legend_loc="right margin",
    legend_fontsize=7,
    ncols=2
)

### Clean Anndata Object

In [None]:
# list the keys stored in .uns (uns_keys is a method, so it must be called)
adata_full.uns_keys()

In [None]:
# drop per-cell columns from the full-object analysis (clusterings and
# per-method annotations) that are recomputed for the subset
obs_cols_rem = [
    'leiden_res0_25',
    'leiden_res0_5',
    'leiden_res0_75',
    'leiden_res1',
    'leiden_res0_5_1',
    'scVI_Annotation_1.0',
    'harmony_Annotation_1.0',
    'scGen_Annotation_1.0',
]
adata_full.obs.drop(columns=obs_cols_rem, inplace=True)

In [None]:
# drop the per-gene HVG bookkeeping columns carried over from the full object
var_cols_rem = [
    'HVG_cell_ranger',
    'HVG_seurat',
    'HVG_seurat_v3',
    'HVG_model',
    'highly_variable',
    'excl_hv',
    'orig_highly_variable',
]
adata_full.var.drop(columns=var_cols_rem, inplace=True)

In [None]:
# embeddings / latent spaces of the full object that are not reused here
obsm_rem = [
    'harmony_umap',
    'scGen_corrected_latent',
    'scGen_umap',
    'scVI_umap',
    'X_pca_harmony',
    'X_scVI',
]
for obsm in obsm_rem:
    del adata_full.obsm[obsm]

In [None]:
# remove the PCA loadings stored for the full object
del adata_full.varm['PCs']

In [None]:
# neighbour graphs (connectivities + distances) of each integration method
obsp_rem = [
    f"{method}_neighbors_{graph}"
    for method in ("harmony", "scGen", "scVI")
    for graph in ("connectivities", "distances")
]
for obsp in obsp_rem:
    del adata_full.obsp[obsp]

### Subset Plasma & B Cell populations

In [None]:
# keep only Plasma and B cells; take an explicit copy so the in-place
# normalisation / log-transform below act on an independent object
# instead of on a view of adata_full
adata = adata_full[adata_full.obs['Annotation_1.0'].isin(['Plasma', 'B Cell'])].copy()

In [None]:
# number of cells kept in the subset
adata.n_obs

In [None]:
# Plasma / B cell subset shown on the UMAP coordinates inherited from the
# whole-object integration (not yet recomputed for the subset)
sc.pl.umap(
    adata,
    color=["Annotation_1.0", "timepoint", "project", "subproject", "sample"],
    legend_loc="right margin",
    legend_fontsize=7,
    ncols=2,
    wspace=.5
)

## PlasmaB-specific Preprocessing

### Normalization

In [None]:
# histogram of total counts per cell (100 bins, no density estimate)
plt.figure(figsize=(6.0, 4.0))
sns.histplot(adata.obs["total_counts"], bins=100, kde=False)

In [None]:
# library-size normalisation of adata.X to 10,000 counts per cell
# NOTE(review): assumes adata.X still holds raw counts at this point -- confirm
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# log-transform the normalised values in adata.X (log(1 + x))
sc.pp.log1p(adata)

In [None]:
# side-by-side histograms: raw total counts vs. per-cell sum of log-normalised values
# NOTE(review): the right panel reads layers["logcounts"], not the adata.X that was
# just normalised above; presumably that layer comes from the full object -- confirm
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
p1 = sns.histplot(adata.obs["total_counts"], bins=100, kde=False, ax=axes[0])
axes[0].set_title("Total counts")
p2 = sns.histplot(adata.layers["logcounts"].sum(1), bins=100, kde=False, ax=axes[1])
axes[1].set_title("Shifted logarithm")
plt.show()

### Compute Highly Variable Genes

In [None]:
# flag the top 2000 highly variable genes, computed per sample (batch_key) and
# stored in adata.var without subsetting the object
sc.pp.highly_variable_genes(adata, batch_key="sample", flavor="seurat", n_top_genes=2000, subset = False, inplace = True)

In [None]:
# how many genes are / are not flagged as highly variable
print(adata.var.highly_variable.value_counts())

In [None]:
# collect BCR and mitochondrial genes so that they are not used for PCA

# BCR genes: any gene whose name starts with one of these prefixes
BCR_prefixes = ["IGHC", "IGHD", "IGHJ", "IGHV",  "IGIC", "IGIJ", "IGIV",  "IGKC", "IGKJ", "IGKV", "IGLC", "IGLJ", "IGLV"]
BCR_genes = [name for name in adata.var_names if name.startswith(tuple(BCR_prefixes))]
print(BCR_genes[:10])

# mitochondrial genes: names starting with "MT-"
MT_genes = [name for name in adata.var_names if name.startswith("MT-")]
print(MT_genes[:10])

# full list of genes to keep out of the HVG set
out_genes = MT_genes + BCR_genes

# True when at least one of them is present in the object
not set(out_genes).isdisjoint(adata.var_names)

In [None]:
# create boolean list indicating whether the genes are excluded as HVG
# (vectorised membership test: covers every gene, including the last one,
# and avoids a per-gene scan of the out_genes list)
in_out_genes = adata.var_names.isin(out_genes).tolist()

adata.var["excl_hv"] = in_out_genes

len(out_genes)

In [None]:
# remove undesired genes as highly variable
# mask of genes that are both flagged HVG and marked for exclusion; built over
# the whole gene axis so the last gene is checked as well
hv_excluded = (adata.var["highly_variable"] & adata.var["excl_hv"]).to_numpy()

for gene in adata.var_names[hv_excluded]:
    print(gene + " found in HVG -- removed!")

# un-flag all of them in one assignment
adata.var.loc[hv_excluded, "highly_variable"] = False

In [None]:
# HVG counts after removing the excluded genes
print(adata.var.highly_variable.value_counts())

In [None]:
# names of the genes still flagged as highly variable (displayed as cell output)
highly_variable_genes = adata.var.index[adata.var['highly_variable']]
highly_variable_genes

### Run PCA

In [None]:
# PCA restricted to the genes flagged in adata.var["highly_variable"]
sc.tl.pca(adata, use_highly_variable = True)

In [None]:
# visualize the gene loadings of PCs 1-5, including the most negative ones
sc.pl.pca_loadings(adata, include_lowest=True, components=[1, 2, 3, 4, 5])

In [None]:
# elbow plot (log scale) of the variance ratio of the first 50 PCs,
# used to choose the number of PCs
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50)

In [None]:
# Quantitative elbow heuristics. All cut-offs are reported as 1-based PC
# numbers (i.e. "how many PCs to keep"), not as 0-based array positions.

# percent of variation associated with each PC (share of the summed per-PC std)
pc_std = adata.obsm['X_pca'].std(axis=0)
pct = pc_std / np.sum(pc_std) * 100

# calculate cumulative percents for each PC
cumu = np.cumsum(pct)

# first PC whose cumulative percent exceeds 90% while its own share is below 5%
co1 = np.where((cumu > 90) & (pct < 5))[0]
co1_index = int(co1[0]) + 1 if len(co1) > 0 else None
print(co1_index)

# last position where the drop to the next PC is larger than 0.05 (before: 0.1);
# the cut-off is the PC right after that drop, hence +2 on the 0-based position
co2 = np.sort(np.where((pct[:-1] - pct[1:]) > 0.05)[0])[::-1]
co2_index = int(co2[0]) + 2 if len(co2) > 0 else None
print(co2_index)

# usually, we would choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
pcs = min(co1_index, co2_index) if co1_index is not None and co2_index is not None else None

print("PCs covering the majority of the variation:", pcs)

In [None]:
# Kaiser rule --> Keep PC with an eigenvalue of >=1

# extract eigenvalues (per-PC variance) from PCA results
eigenvalues = adata.uns['pca']['variance']

# eigenvalues greater than or equal to 1
eigenvalues_gt_1 = [val for val in eigenvalues if val >= 1]

# 1-based numbers of the PCs that satisfy the rule
pcs_gt_1 = [i+1 for i, val in enumerate(eigenvalues) if val >= 1]

print(eigenvalues)
# highest qualifying PC; None (instead of a ValueError) when no PC qualifies
print(max(pcs_gt_1, default=None))

In [None]:
# 2/3 Variance Explanation
print(cumu)
# 1-based number of the first PC at which the cumulative percent exceeds 2/3;
# None when the threshold is never crossed
two_thirds_hits = np.where(cumu > 100*2/3)[0]
int(two_thirds_hits[0]) + 1 if len(two_thirds_hits) > 0 else None

In [None]:
# number of PCs used for the neighbour graph below (set manually)
n_pcs=35

### Compute Neighbors & Non Linear Dim. Reduction

In [None]:
# neighbour graph on the first n_pcs PCs, then UMAP on that graph (uncorrected data)
sc.pp.neighbors(adata, n_pcs=n_pcs)
sc.tl.umap(adata)

In [None]:
# uncorrected UMAP coloured by sample and by level-1 annotation
sc.pl.umap(adata, color=["sample", "Annotation_1.0"], wspace=1.5)

In [None]:
# check technical sources of variation on the uncorrected UMAP;
# the colour scale is capped at the 99th percentile (vmax="p99")
sc.pl.umap(
    adata,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2
)

In [None]:
# uncorrected UMAP coloured by patient and timepoint
sc.pl.umap(
    adata,
    color=["patient", "timepoint"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    use_raw=False,
    ncols=2
)

### Integration (scVI)

In [None]:
# obs column used as the batch variable for scVI integration
batch_key = "subproject"

In [None]:
# independent copy restricted to the highly variable genes, used only for scVI
adata_scvi = adata[:, adata.var["highly_variable"]].copy()

In [None]:
# (cells, genes) of the subset vs. of the HVG-only scVI object
print(adata.shape)
print(adata_scvi.shape)

In [None]:
# register the object with scVI: counts are read from layers["rawcounts"] and
# batch_key is the batch covariate; no further covariates are registered
# (the two options below are left disabled)
scvi.model.SCVI.setup_anndata(adata_scvi, 
                              layer="rawcounts", 
                              batch_key=batch_key,
                              #continuous_covariate_keys=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts"],
                              #categorical_covariate_keys=["patient", "timepoint"]
                             )
adata_scvi

In [None]:
# create the scVI model with default hyper-parameters and display it
model_scvi = scvi.model.SCVI(adata_scvi)
model_scvi

In [None]:
# print what was registered with setup_anndata for this model
model_scvi.view_anndata_setup()

In [None]:
# epoch budget heuristic: 400 * (20000 / number of cells), rounded and capped at 400
max_epochs_scvi = np.min([round((20000 / adata_scvi.n_obs) * 400), 400])
max_epochs_scvi

In [None]:
# train the model with the epoch budget computed from the dataset size
# (max_epochs_scvi); cast to a plain int because np.min returns a numpy scalar
model_scvi.train(max_epochs=int(max_epochs_scvi))

In [None]:
# extract the embedding and the model-normalised expression into the scVI object
adata_scvi.obsm["X_scVI"] = model_scvi.get_latent_representation() # only the embedding is used in further steps
# NOTE(review): 10e4 equals 100,000, whereas normalize_total above used 1e4 -- confirm the intended scale
adata_scvi.layers["scvi_normalized"] = model_scvi.get_normalized_expression(library_size=10e4) # would allow us to perform DE

In [None]:
# store the scVI latent space on the all-genes subset object as well
adata.obsm["X_scVI"] = model_scvi.get_latent_representation()

In [None]:
# save the model under a plasma/B-specific name; the previous "Myeloid" name
# combined with overwrite=True could replace a model saved by another analysis
model_scvi.save(os.path.join(work_dir, "data", "models", "Combined_SCR_CO2_plasmaB_scVI_integration_model_25-04-24"), overwrite=True)

In [None]:
# batch-corrected embedding: neighbour graph on the scVI latent space, then UMAP;
# this overwrites the uncorrected neighbours / UMAP stored on adata
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata)
adata

In [None]:
# visualize integration: corrected UMAP coloured by annotation and batch covariates
sc.pl.umap(adata, 
           color=["Annotation_1.0", 
                  "sample",  
                  "subproject", 
                  "project", 
                  "patient", 
                  "timepoint"
                 ], 
           wspace=1, 
           ncols=2) 

In [None]:
# save integrated plasma/B adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_plasmaB_scVI-integrated_25-04-24.h5ad"))

In [None]:
# read integrated plasma/B adata object (checkpoint: the notebook can resume here)
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_plasmaB_scVI-integrated_25-04-24.h5ad"))

In [None]:
# identify technical sources of variation on the integrated UMAP and save the figure
# NOTE(review): the output file name is spelled "Techincal"; left as is because
# other files may reference it
sc.pl.umap(
    adata,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2,
    save="Techincal_Sources_of_Variation.png"
)

In [None]:
# visualize batch-like covariates on the integrated UMAP and save the figure
sc.pl.umap(
    adata,
    color=["patient", "project", "timepoint", "subproject", "response", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2,
    save="Batches.png"
)

### Clustering

In [None]:
# Leiden clustering at increasing resolutions; each result is stored in its own
# obs column (obs key -> resolution)
leiden_runs = {
    "leiden_res0_25": 0.25,
    "leiden_res0_5": 0.5,
    "leiden_res0_75": 0.75,
    "leiden_res1": 1.0,
    "leiden_res1_2": 1.2,
    "leiden_res1_4": 1.4,
    "leiden_res1_6": 1.6,
    "leiden_res1_8": 1.8,
    "leiden_res2": 2,
}
for obs_key, res in leiden_runs.items():
    sc.tl.leiden(adata, key_added=obs_key, resolution=res)

In [None]:
# visualize every clustering resolution with cluster labels drawn on the UMAP
sc.pl.umap(
    adata,
    color=["leiden_res0_25", "leiden_res0_5", "leiden_res0_75", "leiden_res1",
          "leiden_res1_2", "leiden_res1_4", "leiden_res1_6", "leiden_res1_8", "leiden_res2"],
    legend_loc="on data"
)

In [None]:
# subcluster cluster 7 of the resolution-1 clustering at resolution 0.3;
# the refined labels are stored in obs["leiden_res1_1"]
sc.tl.leiden(adata, key_added="leiden_res1_1", resolution=0.3, restrict_to = ("leiden_res1", ["7"]))


In [None]:
# visualize the resolution-1 clustering next to its refined version (6x6 inch panels)
sc.set_figure_params(figsize=(6, 6))
sc.pl.umap(
    adata,
    color=["leiden_res1", "leiden_res1_1"],
    legend_loc="on data"
)

In [None]:
# save clustered plasma/B adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_plasmaB_scVI-integrated_clustered_25-04-24.h5ad"))

In [None]:
# read clustered plasma/B adata object (checkpoint: the notebook can resume here)
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_plasmaB_scVI-integrated_clustered_25-04-24.h5ad"))

## Level 2 Annotation

### Map Known Marker Genes

In [None]:
# define marker genes
# Flat list used to draw one UMAP panel per gene. A gene that marks several
# populations is listed once; its later occurrences are kept as comments so
# that no panel is plotted twice.
marker_genes = [

    "CD79A",     # B cell
    "MS4A1",     # B cell
    "ITGB2",      # B cell
    "CXCR5",     # Bf
    "TNFRSF13B", # Bf
    "CD22",      # Bf
    "POU2F1",   # GC B
    "CD40",      # GC B
    "SUGCT",     # GC B
    "CR2",       # Mem B
    "CD27",      # Mem B
    #"MS4A1",    # Mem B
    "IGHM",      # Naive B
    "IGHD",      # Naive B
    "TCL1A",     # Naive B
    "CD24",      # Trans B
    "MYO1C",     # Trans B
    "MS4A2",     # Trans B
    "MKI67",     # Pro GC B
    #"SUGCT",    # Pro GC B
    "AICDA",     # Pro GC B
    #"MKI67",    # Cycling B
    "TOP2A",     # Cycling B
    "CD19",      # Cycling B
    "JCHAIN",    # Plasma
    "MZB1",      # Plasma
    "XBP1",      # Plasma
    #"MKI67",    # Plasmablast
    "SDC1",      # Plasmablast
    #"XBP1",     # Plasmablast

]


In [None]:
# marker genes dictionary: population label -> its marker genes.
# The keys are used as group labels in the dot / matrix / violin plots.
marker_genes_dict = {

    "B Cell":               ["CD79A", "MS4A1", "ITGB2"],
    "Follicular B":         ["CXCR5", "TNFRSF13B", "CD22"],
    "GC B":                 ["POU2F1", "CD40", "SUGCT"],
    "Memory B":             ["CR2", "CD27", "MS4A1"],
    "Naive B":              ["IGHM", "IGHD", "TCL1A"],
    "Trans B":              ["CD24", "MYO1C", "MS4A2"],
    "Pro GC B":             ["MKI67", "SUGCT", "AICDA"],
    "Cycling B":            ["MKI67", "TOP2A", "CD19"],
    "Plasma":               ["JCHAIN", "MZB1", "XBP1"],  # label was misspelled "Plamsma"
    "Plasmablast":          ["MKI67", "SDC1", "XBP1"],

}

In [None]:
# plot the expression of marker genes, one UMAP panel per gene, and save the figure
sc.pl.umap(
        adata,
        color=marker_genes,
        vmin=0,
        vmax="p99",  # set vmax to the 99th percentile of the gene count instead of the maximum, to prevent outliers from making expression in other cells invisible. Note that this can cause problems for extremely lowly expressed genes.
        sort_order=True,  # NOTE(review): True presumably draws the highest-expressing cells on top; the previous comment described the opposite (False) behaviour -- confirm which is intended
        frameon=True,
        use_raw=False,
        cmap="viridis",  # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        save="Markers_UMAPs.png"
    )

In [None]:
# dotplot of marker genes per refined cluster; expression is scaled per gene
# (standard_scale="var") and the figure is saved
sc.pl.dotplot(
    adata,
    groupby="leiden_res1_1",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Greens",
    save="Clusters_Markers_Dotplot.png"
)

In [None]:
# matrix plot of the same marker genes per refined cluster, scaled per gene
sc.pl.matrixplot(
    adata,
    groupby="leiden_res1_1",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Greens",
    save="Clusters_Markers_Matrixplot.png"
)

### Compute Marker Genes

In [None]:
# compute marker genes per refined cluster with a Wilcoxon test on adata.X
# (use_raw=False); results are stored in adata.uns["dea_leiden_res1_1"]
sc.tl.rank_genes_groups(
    adata, groupby="leiden_res1_1", method="wilcoxon", key_added="dea_leiden_res1_1", use_raw=False#, layer="rawcounts" # do not use raw!
)

In [None]:
# save marker genes to an Excel file via the project helper
# NOTE(review): save_markers is defined in helper_functions; presumably it reads
# the "dea_leiden_res1_1" results computed above -- confirm against the helper
save_markers(adata, "leiden_res1_1", os.path.join(work_dir, "data", "markers", "Combined", "Combined_plasmaB_res1_markers_25-04-24.xlsx"))

In [None]:
# hierarchical clustering of the refined clusters, computed before the grouped plot below
sc.tl.dendrogram(adata, groupby='leiden_res1_1')

In [None]:
# plot top 5 marker genes per cluster from the "dea_leiden_res1_1" results,
# scaled per gene, and save the figure
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby="leiden_res1_1",
    standard_scale="var",
    n_genes=5,
    key="dea_leiden_res1_1",
    use_raw=False,
    cmap="Blues",
    save="Cluster_Marker_Genes_Dotplot.png"
)

### Carry Out Annotation

In [None]:
# reference UMAP of the refined clustering used for annotation (4x4 inch, saved)
sc.set_figure_params(figsize=(4, 4))
sc.pl.umap(
    adata,
    color=["leiden_res1_1"],
    legend_loc="on data",
    save="Clustering_res1_1_UMAP.png"
)

In [None]:
# cluster label (leiden_res1_1) -> level-2 annotation
# NOTE(review): several clusters are still mapped to an empty string, so they
# end up sharing one unnamed category -- fill these in before relying on the plots
# NOTE(review): presumably any cluster label missing from this dict becomes NaN
# after .map() -- confirm all labels of leiden_res1_1 are covered
annotation = {
    
    "0":       "",
    "1":       "",
    "2":       "",
    "3":       "",
    "4":       "Plasma",
    "5":       "Plasma",
    "6":       "",
    "7,0":     "",
    "7,1":     "",
    "7,2":     "",
    "8":       "",
    "9":       "Plasma",
    "10":      "Plasma",
    "11":      "Plasma",

    
        
}

# translate every cell's cluster label into its annotation
adata.obs["Annotation_2.0"] = adata.obs.leiden_res1_1.map(annotation)

In [None]:
# remove NOISE cluster: keep only cells whose annotation is not "NOISE"
# NOTE(review): the annotation dict in this notebook assigns no cluster to
# "NOISE", so as written this only makes a copy -- confirm once labels are final
adata = adata[~adata.obs['Annotation_2.0'].isin(["NOISE"])].copy()

### Visualize Annotation 2.0

In [None]:
# visualize annotation on the UMAP and save the figure
sc.set_figure_params(figsize=(4, 4))
sc.pl.umap(
    adata,
    color=["Annotation_2.0"],
    title=["Annotation 2.0"],
    frameon=True,
    cmap="tab20",
    save="Annotation_2.0_UMAP.png"
)

In [None]:
# dotplot of marker genes grouped by the level-2 annotation, scaled per gene
sc.pl.dotplot(
    adata,
    groupby="Annotation_2.0",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Blues",
    save="Annotation_2.0_Dotplot.png"
)

In [None]:
# matrix plot of marker genes grouped by the level-2 annotation, scaled per gene
sc.pl.matrixplot(
    adata,
    groupby="Annotation_2.0",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Blues",
    save="Annotation_2.0_Matrixplot.png"
)
# alternative manual export, kept disabled:
#plt.savefig(os.path.join(fig_dir,"Annotation_2.0_Matrixplot.png"), dpi=600, format="png", bbox_inches="tight")

In [None]:
# stacked violin plot of marker genes grouped by the level-2 annotation, scaled per gene
sc.pl.stacked_violin(
    adata,
    groupby="Annotation_2.0",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Blues",
    save="Annotation_2.0_Stacked_Violin.png"
)

In [None]:
# correlation matrix between the level-2 annotation groups, saved to file
sc.pl.correlation_matrix(adata, "Annotation_2.0", save="Annotation_2.0_Correlation_Matrix.png")

### Save Annotated Anndata Object

In [None]:
# save the annotated plasma/B adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_plasmaB_annotated_25-04-24.h5ad"))

In [None]:
# read annotated plasma/B object (checkpoint: the notebook can resume here)
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_plasmaB_annotated_25-04-24.h5ad"))