# SERPENTINE -- Combined T & NK Cells Subclustering

## Environment Setup

In [None]:
# load packages
import sys
import scanpy as sc
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scvi
import seaborn as sns
import scipy
import decoupler as dc

In [None]:
import anndata2ri
import logging

import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

rcb.logger.setLevel(logging.ERROR)
ro.pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

In [None]:
# import helper functions
from helper_functions import save_markers, compute_signature_score

In [None]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# set up figure parameters
plt.rcParams['figure.figsize'] = (6.0, 4.0)
sc.settings.verbosity = 0
sc.settings.set_figure_params(
    dpi=300,
    facecolor="white",
    frameon=False,
)

In [None]:
# set up dirs
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"
# all figures of this part of the notebook go to figures/combined/TNK/preprocessing/
fig_dir = os.path.join(work_dir, "figures", "combined", "TNK", "preprocessing/")
# scanpy saves `save=` figures into settings.figdir; point it at the same folder
sc.settings.figdir = fig_dir
# NOTE(review): set_figure_params applies scanpy defaults to arguments that are not passed,
# so this call may undo frameon=False / facecolor set in the previous cell -- confirm
sc.set_figure_params(dpi=300, dpi_save=300, format='png', figsize=(4, 4))

In [None]:
# read anndata object
adata_full = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TCR_full-integrated_annot_22-03-24.h5ad"))

## Data Preparation

### Full Object Visualization

In [None]:
adata_full

In [None]:
adata_full.obs

In [None]:
sc.pl.umap(
    adata_full,
    color=["Annotation_1.0", "Prior_Annotation_2.0", "timepoint", "project", "subproject", "sample"],
    legend_loc="right margin",
    legend_fontsize=7,
    ncols=2
)

### Clean Anndata Object

In [None]:
# list the keys stored in .uns (uns_keys is a method: without the call only the bound method is displayed)
adata_full.uns_keys()

In [None]:
# remove full non-informative metadata
adata_full.obs.drop(columns=['leiden_res0_25', 'leiden_res0_5', 'leiden_res0_75', 'leiden_res1', 'leiden_res0_5_1', 'scVI_Annotation_1.0', 'harmony_Annotation_1.0', 'scGen_Annotation_1.0'], inplace=True)

In [None]:
adata_full.var.drop(columns=['HVG_cell_ranger', 'HVG_seurat', 'HVG_seurat_v3', 'HVG_model', 'highly_variable', 'excl_hv', 'orig_highly_variable'], inplace=True)

In [None]:
obsm_rem = ['harmony_umap', 'scGen_corrected_latent', 'scGen_umap', 'scVI_umap', 'X_pca_harmony', 'X_scVI']
for obsm in obsm_rem:
    del adata_full.obsm[obsm]

In [None]:
del adata_full.varm['PCs']

In [None]:
obsp_rem = ['harmony_neighbors_connectivities', 'harmony_neighbors_distances', 'scGen_neighbors_connectivities', 'scGen_neighbors_distances', 'scVI_neighbors_connectivities', 'scVI_neighbors_distances']
for obsp in obsp_rem:
    del adata_full.obsp[obsp]

### Subset T & NK populations

In [None]:
# keep only the T and NK populations of the level-1 annotation;
# .copy() materialises the subset so the in-place preprocessing below works on an
# independent AnnData object instead of a view of adata_full
adata = adata_full[adata_full.obs['Annotation_1.0'].isin(['CD4 T', 'CD8 T', 'NK'])].copy()

In [None]:
# check number of cells
len(adata.obs_names)

In [None]:
# non TNK integrated visualization (whole obj integration)
sc.pl.umap(
    adata,
    color=["Annotation_1.0", "Prior_Annotation_2.0", "timepoint", "project", "subproject", "sample"],
    legend_loc="right margin",
    legend_fontsize=7,
    ncols=2,
    wspace=.5
)

## TNK-specific Preprocessing

### Normalization

In [None]:
# observe count dist
plt.figure(figsize=(6.0, 4.0))
sns.histplot(adata.obs["total_counts"], bins=100, kde=False)

In [None]:
# normalization
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# log-transform the data
sc.pp.log1p(adata)

In [None]:
# visualize shifted logarithm distribution
# left panel: per-cell total counts; right panel: per-cell sum of the "logcounts" layer
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
p1 = sns.histplot(adata.obs["total_counts"], bins=100, kde=False, ax=axes[0])
axes[0].set_title("Total counts")
# NOTE(review): this reads adata.layers["logcounts"], which is not written by the
# normalize_total/log1p cells above (they modify adata.X) -- presumably the layer comes
# from the loaded object; confirm it matches the values now stored in adata.X
p2 = sns.histplot(adata.layers["logcounts"].sum(1), bins=100, kde=False, ax=axes[1])
axes[1].set_title("Shifted logarithm")
plt.show()

### Compute Highly Variable Genes

In [None]:
# find variable genes
sc.pp.highly_variable_genes(adata, batch_key="sample", flavor="seurat", n_top_genes=2000, subset = False, inplace = True)

In [None]:
print(adata.var.highly_variable.value_counts())

In [None]:
# select TCR & MT genes so that they are not taken into account for PCA

# TCR genes: every gene whose symbol starts with one of the prefixes below
# NOTE(review): "TRBC" is listed twice; the second entry may have been meant as "TRGC"
# (already matched by the "TRG" prefix) -- confirm
TCR_prefixes = ["TRAC", "TRAJ", "TRAV",  "TRBC", "TRBD", "TRBJ", "TRBV",  "TRDC", "TRDD", "TRDJ", "TRDV",  "TRBC", "TRG", "TRGJ", "TRGV"]
tcr_prefix_tuple = tuple(TCR_prefixes)  # str.startswith accepts a tuple of alternatives
TCR_genes = [symbol for symbol in adata.var_names if symbol.startswith(tcr_prefix_tuple)]
print(TCR_genes[:10])

# mitochondrial genes: symbols starting with "MT-"
MT_genes = list(filter(lambda symbol: symbol.startswith("MT-"), adata.var_names))
print(MT_genes[:10])

# merge unwanted genes
out_genes = [*TCR_genes, *MT_genes] #+ RP_genes #(include RP genes as there is a non-random fashion across cells) #NO MT or RP genes found in HVG

# check there are matches (True when at least one excluded gene is present in the object)
not set(out_genes).isdisjoint(adata.var_names)

In [None]:
# flag, for every gene, whether it belongs to the genes excluded from the HVG set.
# The previous index loop ran over range(0, n-1) and therefore never checked the last
# gene; Index.isin covers all genes and avoids a list lookup per gene.
adata.var["excl_hv"] = adata.var_names.isin(out_genes)

len(out_genes)

In [None]:
# remove undesired genes as highly variable
# genes that are both highly variable and flagged in "excl_hv"; computed for all genes at
# once (the previous range(0, n-1) loop skipped the last gene)
hv_excluded = (adata.var["highly_variable"] & adata.var["excl_hv"]).to_numpy(dtype=bool)
for gene in adata.var_names[hv_excluded]:
    print(gene + " found in HVG -- removed!")
adata.var.loc[hv_excluded, "highly_variable"] = False

In [None]:
print(adata.var.highly_variable.value_counts())

In [None]:
# check highly variable genes
highly_variable_genes = adata.var.index[adata.var['highly_variable']]
highly_variable_genes

### Run PCA

In [None]:
sc.tl.pca(adata, use_highly_variable = True)

In [None]:
# visualize loadings
sc.pl.pca_loadings(adata, include_lowest=True, components=[1, 2, 3, 4, 5])

In [None]:
# select optimal number of PCs
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50)

In [None]:
# Heuristic estimate of the number of informative PCs.
# np.where returns 0-based positions, so they are converted to 1-based PC numbers
# before being printed or compared (previously the reported values were one too low).

# calculate the percent of variation associated with each PC
# (share of each PC in the sum of the per-PC standard deviations of the embedding)
pc_std = adata.obsm['X_pca'].std(axis=0)
pct = pc_std / np.sum(pc_std) * 100

# calculate cumulative percents for each PC
cumu = np.cumsum(pct)

# criterion 1: first PC with cumulative percent greater than 90% and own percent below 5
co1 = np.where((cumu > 90) & (pct < 5))[0]
co1_index = int(co1[0]) + 1 if len(co1) > 0 else None
print(co1_index)

# criterion 2: the PC that follows the last drop larger than 0.05 between consecutive PCs
# (position i compares PC i+1 with PC i+2, hence the +2)
co2 = np.sort(np.where((pct[:-1] - pct[1:]) > 0.05)[0])[::-1] # before: 0.1
co2_index = int(co2[0]) + 2 if len(co2) > 0 else None
print(co2_index)

# usually, we would choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
pcs = min(co1_index, co2_index) if co1_index is not None and co2_index is not None else None

print("PCs covering the majority of the variation:", pcs)

In [None]:
# Kaiser rule --> keep the PCs whose eigenvalue is >= 1

# eigenvalues read from the PCA results stored in adata.uns
eigenvalues = adata.uns['pca']['variance']

# collect, in one pass, the eigenvalues passing the threshold and their 1-based PC numbers
eigenvalues_gt_1 = []
pcs_gt_1 = []
for pc_number, eigenvalue in enumerate(eigenvalues, start=1):
    if eigenvalue >= 1:
        eigenvalues_gt_1.append(eigenvalue)
        pcs_gt_1.append(pc_number)

print(eigenvalues)
# highest PC number that still satisfies the rule
print(max(pcs_gt_1))

In [None]:
# 2/3 Variance Explanation: number of PCs needed for the cumulative percent to exceed two thirds
print(cumu)
# np.where gives a 0-based position; +1 turns it into the PC number / count of PCs
np.where((cumu > 100*2/3))[0][0] + 1

In [None]:
# define number of PCs
n_pcs=35

### Compute Neighbors & Non Linear Dim. Reduction

In [None]:
sc.pp.neighbors(adata, n_pcs=n_pcs)
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata, color=["sample", "Annotation_1.0"], wspace=1.5)

In [None]:
# check technical sources of variation
sc.pl.umap(
    adata,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2
)

In [None]:
sc.pl.umap(
    adata,
    color=["patient", "timepoint"],
    vmax="p99",
    #legend_loc="on data",
    frameon=False,
    use_raw=False,
    ncols=2
)

### Integration (scVI)

In [None]:
# define integration vars
batch_key = "subproject"

In [None]:
# create object specific to scVI with just HVG
adata_scvi = adata[:, adata.var["highly_variable"]].copy()

In [None]:
print(adata.shape)
print(adata_scvi.shape)

In [None]:
# prepare object
scvi.model.SCVI.setup_anndata(adata_scvi, 
                              layer="rawcounts", 
                              batch_key=batch_key,
                              #continuous_covariate_keys=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts"],
                              #categorical_covariate_keys=["patient", "timepoint"]
                             )
adata_scvi

In [None]:
# create the model
model_scvi = scvi.model.SCVI(adata_scvi)
model_scvi

In [None]:
# visualize model
model_scvi.view_anndata_setup()

In [None]:
# find optimal number of epochs
max_epochs_scvi = np.min([round((20000 / adata_scvi.n_obs) * 400), 400])
max_epochs_scvi

In [None]:
# train the model for the number of epochs computed in the previous cell
# (max_epochs_scvi was computed but never handed to train(); int() converts the numpy integer)
model_scvi.train(max_epochs=int(max_epochs_scvi))

In [None]:
# extract the embedding
adata_scvi.obsm["X_scVI"] = model_scvi.get_latent_representation() #just embedding used in further steps
# NOTE(review): 10e4 equals 1e5, while normalize_total earlier in this notebook uses target_sum=1e4 -- confirm the intended library size
adata_scvi.layers["scvi_normalized"] = model_scvi.get_normalized_expression(library_size=10e4) # would allow us to perform DE

In [None]:
# transfer scVI latent space to the full anndata object
# reuse the embedding already stored on adata_scvi instead of running inference a second
# time; adata_scvi is a gene-wise subset of adata, so both objects hold the same cells in the same order
adata.obsm["X_scVI"] = adata_scvi.obsm["X_scVI"].copy()

In [None]:
# save the model
model_scvi.save(os.path.join(work_dir, "data", "models", "Combined_SCR_C02_TNK_scVI_integration_model_11-04-24"), overwrite=True)

In [None]:
# batch-corrected visualization (full)
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata)
adata

In [None]:
# visualize integration
sc.pl.umap(adata, 
           color=["Annotation_1.0", 
                  "sample",  
                  "subproject", 
                  "project", 
                  "patient", 
                  "timepoint",
                  "Prior_Annotation_2.0"
                 ], 
           wspace=1, 
           ncols=2) 

In [None]:
# save integrated TNK adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_scVI-integrated_11-04-24.h5ad"))

In [None]:
# identify technical sources of variation
sc.pl.umap(
    adata,
    color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts", "total_counts", "doublet_score", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2,
    save="Techincal_Sources_of_Variation.png"
)

In [None]:
# visualize batches
sc.pl.umap(
    adata,
    color=["patient", "project", "timepoint", "subproject", "response", "sample"],
    vmax="p99",
    frameon=False,
    cmap="PuOr",
    use_raw=False,
    ncols=2,
    save="Batches.png"
)

In [None]:
# check specific T cell markers to evaluate integration
sc.pl.umap(
    adata,
    color=["PTPRC", "CD4", "CD8B", "CCR7", "HAVCR2", "FOXP3", "MKI67", "TRAV1-2", "KLRF1"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    #cmap="Viridis",
    use_raw=False
)

### Clustering

In [None]:
# perform clustering
sc.tl.leiden(adata, key_added="leiden_res0_25", resolution=0.25)
sc.tl.leiden(adata, key_added="leiden_res0_5", resolution=0.5)
sc.tl.leiden(adata, key_added="leiden_res0_75", resolution=0.75)
sc.tl.leiden(adata, key_added="leiden_res1", resolution=1.0)

In [None]:
# visualize clustering
sc.pl.umap(
    adata,
    color=["leiden_res0_25", "leiden_res0_5", "leiden_res0_75", "leiden_res1"],
    legend_loc="on data"
)

In [None]:
# subcluster cluster 11 (Pro-like) & 12
sc.tl.leiden(adata, key_added="leiden_res0_75_1", resolution=0.08, restrict_to = ("leiden_res0_75", ["11"]))
sc.tl.leiden(adata, key_added="leiden_res0_75_1", resolution=0.2, restrict_to = ("leiden_res0_75_1", ["12"]))

In [None]:
# visualize clustering
sc.pl.umap(
    adata,
    color=["leiden_res0_75", "leiden_res0_75_1"],
    legend_loc="on data"
)

In [None]:
# save clustered TNK adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_scVI-integrated_clustered_11-04-24.h5ad"))

In [None]:
# read clustered TNK adata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_scVI-integrated_clustered_11-04-24.h5ad"))

### Compute Marker Genes

In [None]:
# compute marker genes
sc.tl.rank_genes_groups(
    adata, groupby="leiden_res0_75_1", method="wilcoxon", key_added="dea_leiden_res0_75_1", use_raw=False#, layer="rawcounts" # do not use raw!
)

In [None]:
# save marker genes
save_markers(adata, "leiden_res0_75_1", os.path.join(work_dir, "data", "markers", "Combined", "Combined_TNK_res0_75_markers_12-04-24.xlsx"))

In [None]:
# define marker genes
marker_genes = [

    "CD4", 
    "CD8A","CD8B", 
    "CCR7","SELL",
    "IL7R","CD27",
    "ITGAE","ZNF683",# "CX3CR1",
    "GZMK","GZMB", "IFNG", "TNF", "PRF1",
    "HAVCR2","PDCD1", "CTLA4", "LAG3",
    "MKI67","TOP2A",
    "FOXP3",
    "TRDC","TRGC1",
    "TRAV1-2",
    "GNLY","KLRF1", "NCAM1", "FCGR3A"
    
]


In [None]:
# marker genes dictionary
marker_genes_dict = {

    "CD4 T": ["CD4"], 
    "CD8 T": ["CD8A","CD8B"], 
    "Naive": ["CCR7","SELL"],
    "Memory": ["IL7R","CD27"],
    "TRM": ["ITGAE","ZNF683"],
    "Effector": ["GZMK","GZMB", "IFNG", "TNF", "PRF1"],
    "Exhaustion": ["HAVCR2","PDCD1", "CTLA4", "LAG3"],
    "Proliferation": ["MKI67","TOP2A"],
    "Treg": ["FOXP3"],
    "γδ T": ["TRDC","TRGC1"],
    "MAIT": ["TRAV1-2"],
    "NK": ["GNLY","KLRF1", "NCAM1", "FCGR3A"]
    
}

In [None]:
# plot the expression of marker genes
sc.pl.umap(
        adata,
        color=marker_genes,
        vmin=0,
        vmax="p99",  # set vmax to the 99th percentile of the gene count instead of the maximum, to prevent outliers from making expression in other cells invisible. Note that this can cause problems for extremely lowly expressed genes.
        sort_order=False,  # do not plot highest expression on top, to not get a biased view of the mean expression among cells
        frameon=True,
        use_raw=False,
        cmap="viridis",  # https://matplotlib.org/stable/tutorials/colors/colormaps.html
    )

In [None]:
# dotplot of marker genes
sc.pl.dotplot(
    adata,
    groupby="leiden_res0_75_1",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds"
)

In [None]:
sc.pl.matrixplot(
    adata,
    groupby="leiden_res0_75_1",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds"
)

In [None]:
# plot top genes from marker genes computed before
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby="leiden_res0_75_1",
    standard_scale="var",
    n_genes=5,
    key="dea_leiden_res0_75_1",
    use_raw=False
)

### Remove Noise Clusters

In [None]:
sc.pl.umap(
    adata,
    color=["leiden_res0_75", "leiden_res0_75_1", "JCHAIN", "pct_counts_mt"],
    legend_loc="on data",
    use_raw=False
)

In [None]:
# keep non filtered object
adata_nofilt = adata.copy()

In [None]:
# save nonfiltered object
adata_nofilt.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_scVI-integrated_clustered_nofilt_11-04-24.h5ad"))

In [None]:
# remove noise clusters
adata = adata[~adata.obs['leiden_res0_75_1'].isin(["11,1","14"])].copy()

In [None]:
# check if removal was successful
set(adata.obs.leiden_res0_75_1)

In [None]:
# save filtered object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_int_clust_filt_11-04-24.h5ad"))

In [None]:
# read filtered object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_int_clust_filt_11-04-24.h5ad"))

### Recompute Neighbors & Clustering

In [None]:
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata)

In [None]:
# check specific T cell markers to evaluate embedding
sc.pl.umap(
    adata,
    color=["PTPRC", "CD4", "CD8B", "CCR7", "HAVCR2", "FOXP3", "MKI67", "TRAV1-2", "KLRF1"],
    vmax="p99",
    legend_loc="on data",
    frameon=False,
    #cmap="Viridis",
    use_raw=False
)

In [None]:
# perform clustering
sc.tl.leiden(adata, key_added="leiden_res0_25", resolution=0.25)
sc.tl.leiden(adata, key_added="leiden_res0_5", resolution=0.5)
sc.tl.leiden(adata, key_added="leiden_res0_75", resolution=0.75)
sc.tl.leiden(adata, key_added="leiden_res1", resolution=1.0)

In [None]:
sc.tl.leiden(adata, key_added="leiden_res1_2", resolution=1.2)
sc.tl.leiden(adata, key_added="leiden_res1_4", resolution=1.4)
sc.tl.leiden(adata, key_added="leiden_res1_6", resolution=1.6)
sc.tl.leiden(adata, key_added="leiden_res1_8", resolution=1.8)
sc.tl.leiden(adata, key_added="leiden_res2", resolution=2)

In [None]:
# visualize clustering
sc.pl.umap(
    adata,
    color=["leiden_res0_25", "leiden_res0_5", "leiden_res0_75", "leiden_res1",
          "leiden_res1_2", "leiden_res1_4", "leiden_res1_6", "leiden_res1_8", "leiden_res2"],
    legend_loc="on data"
)

In [None]:
# choose resolution 1 - subcluster cluster 11, 
# subcluster cluster 15 (naive) as there is a fraction of CD8 T cells, 
# also cluster 8 as there might be naive + helper CD4 Ts
# subcluster 13 (T pro) to get CD4 and CD8 subsets (also identified a cycling γδ T subset)
sc.tl.leiden(adata, key_added="leiden_res1_1", resolution=0.25, restrict_to = ("leiden_res1", ["11"]))
sc.tl.leiden(adata, key_added="leiden_res1_1", resolution=0.2, restrict_to = ("leiden_res1_1", ["15"]))
sc.tl.leiden(adata, key_added="leiden_res1_1", resolution=0.2, restrict_to = ("leiden_res1_1", ["8"]))
sc.tl.leiden(adata, key_added="leiden_res1_1", resolution=0.2, restrict_to = ("leiden_res1_1", ["13"]))

In [None]:
# visualize clustering
sc.pl.umap(
    adata,
    color=["leiden_res1", "leiden_res1_1"],
    legend_loc="on data"
)

In [None]:
# save reclustered TNK adata object
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_int_filt_reclust_11-04-24.h5ad"))

In [None]:
# read reclustered TNK adata object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_int_filt_reclust_11-04-24.h5ad"))

## Level 2 Annotation

### Map Known Marker Genes

In [None]:
# plot the expression of marker genes (one UMAP per gene in marker_genes) and save the figure
sc.pl.umap(
        adata,
        color=marker_genes,
        vmin=0,
        vmax="p99",  # set vmax to the 99th percentile of the gene count instead of the maximum, to prevent outliers from making expression in other cells invisible. Note that this can cause problems for extremely lowly expressed genes.
        sort_order=True,  # plot cells with higher expression on top of the others (the earlier marker UMAP in this notebook used sort_order=False)
        frameon=True,
        use_raw=False,
        cmap="viridis",  # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        save="Markers_UMAPs.png"
    )

In [None]:
# dotplot of marker genes
sc.pl.dotplot(
    adata,
    groupby="leiden_res1_1",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds",
    save="Clusters_Markers_Dotplot.png"
)

In [None]:
sc.pl.matrixplot(
    adata,
    groupby="leiden_res1_1",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds",
    save="Clusters_Markers_Matrixplot.png"
)

### Compute Marker Genes

In [None]:
# compute marker genes
sc.tl.rank_genes_groups(
    adata, groupby="leiden_res1_1", method="wilcoxon", key_added="dea_leiden_res1_1", use_raw=False#, layer="rawcounts" # do not use raw!
)

In [None]:
# save marker genes
save_markers(adata, "leiden_res1_1", os.path.join(work_dir, "data", "markers", "Combined", "Combined_TNK_res1_markers_12-04-24.xlsx"))

In [None]:
sc.tl.dendrogram(adata, groupby='leiden_res1_1')

In [None]:
# plot top 5 marker genes per cluster
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby="leiden_res1_1",
    standard_scale="var",
    n_genes=5,
    key="dea_leiden_res1_1",
    use_raw=False,
    save="Cluster_Marker_Genes_Dotplot.png"
)

### Carry Out Annotation

In [None]:
#sc.set_figure_params(figsize=(4, 4))
sc.pl.umap(
    adata,
    color=["leiden_res1_1"],
    legend_loc="on data",
    save="Clustering_res1_1_UMAP.png"
)

In [None]:
# mapping from "leiden_res1_1" cluster labels to level-2 cell type names;
# the trailing "#N" numbers count the entries, the remaining text records the rationale
annotation = {
    
    "0":         "CD8 T Effector",           #1
    "1":         "CD4 T Central Memory",     #2 top1 marker IL7R, how to differentiate between T Naive and CM
    "2":         "CD56hi CD16lo NK",         #3
    "3":         "CD8 T TRM PreExhausted",   #4 effector function + exhaustion markers?
    "4":         "CD4 T CM/EarlyActivated",  #5 early activation genes: CD40LG, LTB, FOS/B, TNF + IL7R (memory)
    "5":         "CD8 T TRM Effector",       #6
    "6":         "CD4 T Regulatory",         #7
    "7":         "CD8 T TRM Exhausted",      #8
    "8,0":       "CD4 T Follicular Helper",   #9 helper effector functions + exhaustion markers - follicular
    "8,1":       "CD4 T Follicular Helper",        #10 especially follicular-helper-related genes
    "8,2":       "CD4 T Helper-like",     #11 many markers of exhaustion
    "8,3":       "CD4 T ISG+",               #12 interferon-stimulated genes (IFIT, MX, OAS)
    "9":         "CD8 T MAIT",               #13
    "10":        "NKT-like",                 #14 CD8 expression + NK markers
    "11,0":      "CD8 T TRM Effector",       #15
    "11,1":      "CD4 T Regulatory",         #16 there might be a few CD8 T cells in it
    "11,2":      "γδ T-like",                #17
    "11,3":      "NOISE",                    #18 proliferating myeloid cells? (original note: "pro myeloid cells")
    "12":        "CD56dim CD16hi NK",        #19
    "13,0":      "CD8 T Proliferative",      #20
    "13,1":      "CD8 T Proliferative",      #21
    "13,2":      "Cycling γδ T-like",        #22
    "13,3":      "CD4 T Proliferative",      #23
    "14":        "γδ T-like",                #24 very high TRDC/TRGC1, no expression of CD4/CD8
    "15,0":      "CD4 T Naive",              #25
    "15,1":      "CD8 T Naive"               #26
        
}

# translate every cell's cluster label into its annotation;
# clusters missing from the dictionary end up as NaN
adata.obs["Annotation_2.0"] = adata.obs.leiden_res1_1.map(annotation)

In [None]:
# remove NOISE cluster
adata = adata[~adata.obs['Annotation_2.0'].isin(["NOISE"])].copy()

### Visualize Annotation 2.0

In [None]:
# visualize annotation
# set_figure_params applies scanpy's defaults to every argument that is not passed, so the
# dpi / dpi_save / format chosen during setup are repeated here; otherwise this and all
# following figures would silently fall back to the default resolution
sc.set_figure_params(dpi=300, dpi_save=300, format='png', figsize=(3, 3))
sc.pl.umap(
    adata,
    color=["Annotation_2.0"],
    title=["Annotation 2.0"],
    frameon=True,
    cmap="tab20",
    save="Annotation_2.0_UMAP.png"
)

In [None]:
# dotplot of marker genes
sc.pl.dotplot(
    adata,
    groupby="Annotation_2.0",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds",
    save="Annotation_2.0_Dotplot.png"
)

In [None]:
sc.pl.matrixplot(
    adata,
    groupby="Annotation_2.0",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds",
    save="Annotation_2.0_Matrixplot.png"
)
#plt.savefig(os.path.join(fig_dir,"Annotation_2.0_Matrixplot.png"), dpi=600, format="png", bbox_inches="tight")

In [None]:
sc.pl.stacked_violin(
    adata,
    groupby="Annotation_2.0",
    var_names=marker_genes_dict,
    standard_scale="var", 
    use_raw=False,
    cmap="Reds",
    save="Annotation_2.0_Stacked_Violin.png"
)

In [None]:
# correlation matrix
sc.pl.correlation_matrix(adata, "Annotation_2.0", save="Annotation_2.0_Correlation_Matrix.png")

### Save Annotated Anndata Object

In [None]:
# save it
adata.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad"))

In [None]:
# read annotated object
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad"))

## Gene Signatures Scores

In [None]:
# change saving figures dir
fig_dir = os.path.join(work_dir, "figures", "combined", "TNK", "signatures/")
# scanpy writes `save=` figures to settings.figdir, not to fig_dir; keep both in sync
# as in the setup cell, otherwise figures keep landing in the preprocessing folder
sc.settings.figdir = fig_dir

In [None]:
# define signatures

signatures = {  #Single-cell transcriptomics of human T cells reveals tissue and activation signatures in health and disease

    "Exhaustion" : ["HAVCR2", "PDCD1", "CTLA4", "LAG3", "TIGIT", "TOX", "BATF", "ENTPD1", "CD274"],
    "Treg" : ["FOXP3", "CTLA4", "IRF4", "BATF", "TNFRSF18", "TOX2"],
    "CD4_Nv_CM_rest" : ["LEF1", "ATM", "SELL", "KLF2", "ITGA6"],
    "CD4_CD8_rest" : ["IL7R", "CD52", "S100A4", "AQP3", "NLRP3", "KLF2", "ITGB7"],
    "TRM" : ["CD69", "ITGAE", "ITGA1", "ZNF683", "CX3CR1"],
    "Early_Activation" : ["CD69", "IL2RA", "CD44", "SLC3A2", "TFRC", "FOS", "JUN", "ZAP70", "LAT"],
    #"IFN response" : ["IFIT3", "IFIT2", "STAT1", "MX1", "IRF7", "ISG15", "IFITM3", "OAS2", "JAK2", "SOCS1", "TRIM21"],
    #"Proliferation" : ["MKI67", "TOP2A", "PCNA", "LIF", "IL2", "CENPV", "NME1", "FABP5", "ORC6", "GOS2", "GCK"],
    "CD8_cytotoxic" : ["CCL5", "GZMK", "GNLY", "EOMES", "ZNF683", "KLRG1", "NKG7", "ZEB2"],
    "CD8_cytokine" : ["CCL3", "CCL4", "XCL2", "IL10", "PRF1", "TNFRSF9", "NKG7", "IL26", "IFNG", "XCL1", "CSF2", "HOPX", "LAG3"],
    "IFN_response" : ["APOL1", "APOL6", "BATF2", "BST2", "C5orf56", "CMPK2", "DDX58", "DDX60", "DHX58",
        "DTX3L", "EPSTI1", "FBXO6", "GBP1", "GBP4", "HELZ2", "HERC5", "HERC6", "HSH2D", "IFI16",
        "IFI35", "IFI44", "IFI44L", "IFI6", "IFIH1", "IFIT1", "IFIT2", "IFIT3", "IFIT5", "IFITM1",
        "IRF7", "IRF9", "ISG15", "LAMP3", "LAP3", "MX1", "MX2", "OAS2", "OAS3", "OASL", "PARP10",
        "PARP12", "PARP14", "PARP9", "PHF11", "PML", "PSMB9", "RNF213", "RSAD2", "RTP4", "SAMD9",
        "SAMD9L", "SHISA5", "SIGLEC1", "SP110", "STAT1", "STAT2", "TAP1", "TRAFD1", "TRIM21", "TRIM22",
        "TRIM5", "UBE2L6", "USP18", "XAF1", "ZNFX1"],
    "Proliferation" : ["ANLN", "ASPM", "AURKA", "AURKB", "BIRC5", "BUB1", "BUB1B", "CCNA2", "CCNB1",
        "CCNB2", "CCNE2", "CDC20", "CDC6", "CDCA2", "CDCA3", "CDCA5", "CDCA7", "CDCA8", "CDK1",
        "CDKN3", "CDT1", "CENPA", "CENPE", "CENPF", "CENPL", "CEP55", "CKS1B", "DEPDC1", "DEPDC1B",
        "DLGAP5", "DONSON", "DTL", "E2F8", "ECT2", "EZH2", "FAM72C", "FANCI", "FBXO5", "FOXM1",
        "GINS1", "GINS2", "GMNN", "HJURP", "HMGB3", "HMMR", "KIF11", "KIF14", "KIF15",
        "KIF18B", "KIF20A", "KIF2C", "KIF4A", "MAD2L1", "MCM10", "MCM2", "MCM4", "MCM6", "MELK",
        "MKI67", "MND1", "MTFR2", "NCAPG", "NCAPG2", "NDC80", "NEK2", "NUF2", "NUSAP1", "OIP5",
        "PARPBP", "PBK", "PCNA", "PLK4", "POLE2", "POLQ", "PTTG1", "RACGAP1", "RAD51", "RAD51AP1",
        "RRM1", "RRM2", "SHCBP1", "SKA1", "SMC2", "SPC25", "STIL", "STMN1", "TCF19", "TK1", "TOP2A",
        "TPX2", "TRIP13", "TTK", "TYMS", "UBE2C", "UHRF1", "ZWILCH", "ZWINT"],
    "Translation" : ["EEF1A1", "EEF1B2", "EEF1D", "EEF1G", "EIF3D", "EIF3E", "EIF3F", "EIF3G", "EIF3H", "EIF3K",
        "FAU", "NACA", "PFDN5", "RPL10", "RPL11", "RPL12", "RPL13", "RPL13A",
        "RPL14", "RPL15", "RPL17", "RPL18", "RPL18A", "RPL19", "RPL21", "RPL22", "RPL23", "RPL23A",
        "RPL24", "RPL27", "RPL27A", "RPL28", "RPL29", "RPL3", "RPL30", "RPL31", "RPL32", "RPL34",
        "RPL35", "RPL35A", "RPL36A", "RPL37", "RPL37A", "RPL38", "RPL39", "RPL4", "RPL5", "RPL6",
        "RPL7", "RPL7A", "RPL8", "RPL9", "RPLP0", "RPLP2", "RPS10", "RPS11", "RPS13", "RPS14", "RPS15",
        "RPS15A", "RPS16", "RPS17", "RPS18", "RPS19", "RPS2", "RPS20", "RPS21", "RPS23", "RPS25",
        "RPS27A", "RPS28", "RPS29", "RPS3", "RPS3A", "RPS5", "RPS6", "RPS7", "RPS8", "RPS9", "RPSA",
        "SNHG6", "SNHG8", "SNRPD2", "UXT"]
    
}

#https://www.nature.com/articles/s41467-019-12464-3?fromPaywallRec=false
# TIGER: http://tiger.canceromics.org/#/

In [None]:
signatures2 = {  #Immune Cell Gene Signatures for Profiling the Microenvironment of Solid Tumors

    "IFN response" : ["APOL1", "APOL6", "BATF2", "BST2", "C5orf56", "CMPK2", "DDX58", "DDX60", "DHX58",
        "DTX3L", "EPSTI1", "FBXO6", "GBP1", "GBP4", "HELZ2", "HERC5", "HERC6", "HSH2D", "IFI16",
        "IFI35", "IFI44", "IFI44L", "IFI6", "IFIH1", "IFIT1", "IFIT2", "IFIT3", "IFIT5", "IFITM1",
        "IRF7", "IRF9", "ISG15", "LAMP3", "LAP3", "MX1", "MX2", "OAS2", "OAS3", "OASL", "PARP10",
        "PARP12", "PARP14", "PARP9", "PHF11", "PML", "PSMB9", "RNF213", "RSAD2", "RTP4", "SAMD9",
        "SAMD9L", "SHISA5", "SIGLEC1", "SP110", "STAT1", "STAT2", "TAP1", "TRAFD1", "TRIM21", "TRIM22",
        "TRIM5", "UBE2L6", "USP18", "XAF1", "ZNFX1"],
    "Proliferation" : ["ANLN", "ASPM", "AURKA", "AURKB", "BIRC5", "BUB1", "BUB1B", "CCNA2", "CCNB1",
        "CCNB2", "CCNE2", "CDC20", "CDC6", "CDCA2", "CDCA3", "CDCA5", "CDCA7", "CDCA8", "CDK1",
        "CDKN3", "CDT1", "CENPA", "CENPE", "CENPF", "CENPL", "CEP55", "CKS1B", "DEPDC1", "DEPDC1B",
        "DLGAP5", "DONSON", "DTL", "E2F8", "ECT2", "EZH2", "FAM72C", "FANCI", "FBXO5", "FOXM1",
        "GINS1", "GINS2", "GMNN", "HJURP", "HMGB3", "HMMR", "KIF11", "KIF14", "KIF15",
        "KIF18B", "KIF20A", "KIF2C", "KIF4A", "MAD2L1", "MCM10", "MCM2", "MCM4", "MCM6", "MELK",
        "MKI67", "MND1", "MTFR2", "NCAPG", "NCAPG2", "NDC80", "NEK2", "NUF2", "NUSAP1", "OIP5",
        "PARPBP", "PBK", "PCNA", "PLK4", "POLE2", "POLQ", "PTTG1", "RACGAP1", "RAD51", "RAD51AP1",
        "RRM1", "RRM2", "SHCBP1", "SKA1", "SMC2", "SPC25", "STIL", "STMN1", "TCF19", "TK1", "TOP2A",
        "TPX2", "TRIP13", "TTK", "TYMS", "UBE2C", "UHRF1", "ZWILCH", "ZWINT"],
    "Translation" : ["EEF1A1", "EEF1B2", "EEF1D", "EEF1G", "EIF3D", "EIF3E", "EIF3F", "EIF3G", "EIF3H", "EIF3K",
        "FAU", "NACA", "PFDN5", "RPL10", "RPL11", "RPL12", "RPL13", "RPL13A",
        "RPL14", "RPL15", "RPL17", "RPL18", "RPL18A", "RPL19", "RPL21", "RPL22", "RPL23", "RPL23A",
        "RPL24", "RPL27", "RPL27A", "RPL28", "RPL29", "RPL3", "RPL30", "RPL31", "RPL32", "RPL34",
        "RPL35", "RPL35A", "RPL36A", "RPL37", "RPL37A", "RPL38", "RPL39", "RPL4", "RPL5", "RPL6",
        "RPL7", "RPL7A", "RPL8", "RPL9", "RPLP0", "RPLP2", "RPS10", "RPS11", "RPS13", "RPS14", "RPS15",
        "RPS15A", "RPS16", "RPS17", "RPS18", "RPS19", "RPS2", "RPS20", "RPS21", "RPS23", "RPS25",
        "RPS27A", "RPS28", "RPS29", "RPS3", "RPS3A", "RPS5", "RPS6", "RPS7", "RPS8", "RPS9", "RPSA",
        "SNHG6", "SNHG8", "SNRPD2", "UXT"]
    
}
#https://aacrjournals.org/cancerimmunolres/article/6/11/1388/468810/Immune-Cell-Gene-Signatures-for-Profiling-the


In [None]:
# compute signature scores on the single-cell object (plotting disabled)

for signature, gene_set in signatures.items():
    compute_signature_score(adata, gene_set=gene_set, score_name=signature+"_Score", palette="viridis", plot=False)
    # report completion only after the score has actually been computed
    print(signature + " Done!")


In [None]:
# heatmap of the `signatures2` entries (defined outside this section),
# with cells grouped by the Annotation_2.0 labels
sc.pl.heatmap(
    adata,
    var_names=signatures2,
    groupby="Annotation_2.0",
    use_raw=False,
)

In [None]:
# inspect the per-cell Condition labels
adata.obs["Condition"]

In [None]:
# Keep only the cell types that have at least one cell in every sample, so
# that paired pseudobulks can be built.
# NOTE(review): the stated intent is "present in both timepoints", but the
# presence check is done per `sample` column value -- confirm this is intended.
annot_key = 'Annotation_2.0'
obs_df = adata.obs[['sample', annot_key]]

# cell counts: one row per cell type, one column per sample
pivot_table = obs_df.pivot_table(
    index=annot_key,
    columns='sample',
    aggfunc='size',
    fill_value=0,
)

present_everywhere = (pivot_table > 0).all(axis=1)
valid_cell_types = pivot_table.loc[present_everywhere].index
print(valid_cell_types)

keep_cells = adata.obs[annot_key].isin(valid_cell_types)
adata_filtered = adata[keep_cells].copy()

In [None]:
# Build pseudobulk profiles by summing the `rawcounts` layer, using
# `subproject` as the sample column and `Annotation_2.0` as the group column.
pseudobulk_kwargs = dict(
    sample_col='subproject',
    groups_col='Annotation_2.0',
    layer='rawcounts',
    mode='sum',
)
# filtering thresholds that were tried but are currently disabled:
#   min_cells=10, min_counts=1000
pdata = dc.get_pseudobulk(adata_filtered, **pseudobulk_kwargs)
pdata

In [None]:
# Sanity check: the T0/-ICI pseudobulks and the remaining pseudobulks should
# have the same shape. Read the metadata straight from `pdata` so the cell
# does not depend on `df_p`, which is only assigned in a later cell.
pb_obs = pdata.obs
pb_obs[(pb_obs.Condition == "T0/-ICI")].shape == pb_obs[(pb_obs.Condition != "T0/-ICI")].shape

In [None]:
# normalize and scale pseudobulks (the three steps modify `pdata` in place
# and must run in this order)
sc.pp.normalize_total(pdata, target_sum=1e4)  # total-count normalization to 1e4
sc.pp.log1p(pdata)  # log(1 + x) transform
sc.pp.scale(pdata, max_value=10)  # scale features, clipping values at 10

In [None]:
# compute signature scores (plotting disabled) on both the pseudobulk object
# and the single-cell object

for signature, gene_set in signatures.items():
    score_name = signature + "_Score"
    for target in (pdata, adata):
        compute_signature_score(target, gene_set=gene_set, score_name=score_name, palette="viridis", plot=False)
    # report completion only once both scores have actually been computed
    print(signature + " Score Done!")


In [None]:
# pseudobulk-level metadata table (a reference to pdata.obs, not a copy);
# the "<signature>_Score" columns used by the statistical tests are read from it
df_p = pdata.obs
df_p

In [None]:
# cell-level metadata table; this is a reference to adata.obs (not a copy),
# so columns later added to `df` are added to adata.obs as well
df = adata.obs
df

In [None]:
# non-responder (response == 'PD') subsets of the cell-level and
# pseudobulk-level metadata
# .copy() makes df_pd an independent frame: new score columns are assigned to
# it later, which on a bare boolean-mask slice raises SettingWithCopyWarning
df_pd = df[df['response'] == 'PD'].copy()
df_p_pd = df_p[df_p['response'] == 'PD']

In [None]:
# Inspect the `t0` score array.
# NOTE(review): within this section `t0` is only assigned inside the
# per-signature statistics loop further down, so this cell presumably needs
# that cell to have been run first -- confirm.
t0

In [None]:
# display the pseudobulk-level metadata table again
df_p

In [None]:
# Per-signature statistics and summary figure.
# For every signature score this cell
#   1. tests pseudobulk-level differences (`df_p`, `df_p_pd`) between groups,
#   2. log2-transforms the cell-level scores (`df`, `df_pd`) for display,
#   3. draws four violin panels plus a UMAP and saves them as one PDF.
from scipy.stats import mannwhitneyu, normaltest, ttest_ind, wilcoxon
from statsmodels.stats.multitest import multipletests


def p_value_to_symbol(p_value):
    """Return the significance annotation ("ns", "*" ... "****") for a p-value."""
    if p_value <= 1.00e-4:
        return "****"
    elif p_value <= 1.00e-3:
        return "***"
    elif p_value <= 1.00e-2:
        return "**"
    elif p_value <= 5.00e-2:
        return "*"
    else:
        return "ns"


def _bh_corrected(stat_results):
    """Return Benjamini-Hochberg adjusted p-values for a list of scipy test results.

    The adjustment is done only across the tests passed in one call; with a
    single test the p-value is returned unchanged.
    """
    pvalues = [result.pvalue for result in stat_results]
    _, corrected, _, _ = multipletests(pvalues, method='fdr_bh')
    return corrected


def _score_violin(ax, data, x, y, palette, title, ylabel=""):
    """Draw one violin panel of column `y`, split and coloured by column `x`."""
    sns.violinplot(
        ax=ax,
        data=data,
        x=x,
        y=y,
        inner="box",
        hue=x,
        palette=palette,
        edgecolor="black",
        alpha=0.7,
    )
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("")


def _significance_bracket(ax, x_left, x_right, y, p_value):
    """Draw a bracket from `x_left` to `x_right` at height `y` with a significance label."""
    ax.plot([x_left, x_left, x_right, x_right], [y, y + .1, y + .1, y], lw=1, color='black')
    ax.text((x_left + x_right) / 2, y + .1, f" {p_value_to_symbol(p_value)}", ha='center', va='bottom', size=9)


scores = list(signatures.keys())
print(scores)

for score in scores:
    score_col = score + "_Score"
    positive_col = score_col + "_positive"
    log2_col = "log2_" + score_col

    ############################
    # T0 vs. T1 - All Patients #
    ############################

    print("T0 vs. T1 - All Patients")
    t0 = df_p.loc[(df_p.Condition == "T0/-ICI"), score_col].values
    t1 = df_p.loc[(df_p.Condition == "T1/+ICI"), score_col].values

    # NOTE(review): wilcoxon is a paired test, so this assumes the T0 and T1
    # rows of df_p have equal length and come in matching order -- confirm.
    stat_results = [wilcoxon(t0, t1, alternative="two-sided")]
    corrected_p_values1 = _bh_corrected(stat_results)

    print(score + " Score:")
    print("T0/-ICI vs. T1/+ICI: \n", stat_results[0], "\n")
    print("Corrected p-value:", corrected_p_values1[0])

    ##################
    # T0 vs. T1 - PD #
    ##################

    print("T0 vs. T1 - PD")
    t0 = df_p_pd.loc[(df_p_pd.Condition == "T0/-ICI"), score_col].values
    t1 = df_p_pd.loc[(df_p_pd.Condition == "T1/+ICI"), score_col].values

    # same pairing assumption as above, restricted to the PD pseudobulks
    stat_results = [wilcoxon(t0, t1, alternative="two-sided")]
    corrected_p_values2 = _bh_corrected(stat_results)

    print(score + " Score:")
    print("T0/-ICI vs. T1/+ICI: \n", stat_results[0], "\n")
    print("Corrected p-value:", corrected_p_values2[0])

    #############
    # PD vs. SD #
    #############

    print("PD vs. SD")
    # unpaired comparison between response groups
    pd_scores = df_p.loc[(df_p.response == "PD"), score_col].values
    sd_scores = df_p.loc[(df_p.response == "SD"), score_col].values

    stat_results = [mannwhitneyu(pd_scores, sd_scores, alternative="two-sided")]
    corrected_p_values3 = _bh_corrected(stat_results)

    print(score + " Score:")
    # label matches the comparison that was actually run
    print("PD vs. SD: \n", stat_results[0], "\n")
    print("Corrected p-value:", corrected_p_values3[0])

    ##############################
    # -ICI vs. +ICI/PD & +ICI/SD #
    ##############################

    # The arrays are deliberately NOT named `pd`/`sd`: assigning to `pd` would
    # overwrite the pandas module alias for the rest of the notebook.
    t0 = df_p.loc[(df_p.ICI_status == "-ICI"), score_col].values
    ici_pd = df_p.loc[(df_p.ICI_status == "+ICI/PD"), score_col].values
    ici_sd = df_p.loc[(df_p.ICI_status == "+ICI/SD"), score_col].values

    stat_results = [
        mannwhitneyu(t0, ici_pd, alternative="two-sided"),
        mannwhitneyu(ici_pd, ici_sd, alternative="two-sided"),
        mannwhitneyu(t0, ici_sd, alternative="two-sided"),
    ]
    # the three pairwise tests are corrected together
    corrected_p_values4 = _bh_corrected(stat_results)

    print("-ICI vs. +ICI/PD: \n", stat_results[0], "\n")
    print("Corrected p-value:", corrected_p_values4[0])
    print("+ICI/PD vs. +ICI/SD: \n", stat_results[1], "\n")
    print("Corrected p-value:", corrected_p_values4[1])
    print("-ICI vs. +ICI/SD: \n", stat_results[2], "\n")
    print("Corrected p-value:", corrected_p_values4[2])

    ####################################
    # log2 transform for visualization #
    ####################################

    # shift each table so its minimum score becomes >= 1 before taking log2
    constant_df = abs(df[score_col].min()) + 1
    constant_df_pd = abs(df_pd[score_col].min()) + 1
    df[positive_col] = df[score_col] + constant_df
    df_pd[positive_col] = df_pd[score_col] + constant_df_pd
    df[log2_col] = np.log2(df[positive_col])
    df_pd[log2_col] = np.log2(df_pd[positive_col])

    #########
    # PLOTS #
    #########

    # violins show cell-level scores; the significance labels come from the
    # pseudobulk-level tests above
    fig, axes = plt.subplots(1, 5, figsize=(18, 3))

    # panel 1: timepoint, all patients
    _score_violin(axes[0], df, "Condition", log2_col, ["coral", "darkviolet"],
                  "Timepoint (PD+SD)", ylabel='log2('+score+" Score)")
    ymax = df[log2_col].max()
    _significance_bracket(axes[0], 0, 1, ymax + .4, corrected_p_values1[0])

    # panel 2: timepoint, PD patients only
    _score_violin(axes[1], df_pd, "Condition", log2_col, ["coral", "darkviolet"],
                  "Timepoint (PD)")
    ymax = df_pd[log2_col].max()
    _significance_bracket(axes[1], 0, 1, ymax + .4, corrected_p_values2[0])

    # panel 3: response (PD vs. SD)
    _score_violin(axes[2], df, "response", log2_col, ["mistyrose", "lavender"],
                  "Response")
    ymax = df[log2_col].max()
    _significance_bracket(axes[2], 0, 1, ymax + .4, corrected_p_values3[0])

    # panel 4: ICI status
    # NOTE(review): the bracket positions 0/1/2 only match the tested pairs if
    # the violins are drawn in the order +ICI/PD, +ICI/SD, -ICI -- confirm the
    # category order of df.ICI_status.
    _score_violin(axes[3], df, "ICI_status", log2_col, ["red", "blue", "snow"],
                  "ICI Status")
    ymax = df[log2_col].max()
    _significance_bracket(axes[3], 0, 2, ymax + 1, corrected_p_values4[0])
    _significance_bracket(axes[3], 0, 1, ymax + .7, corrected_p_values4[1])
    _significance_bracket(axes[3], 1, 2, ymax + .4, corrected_p_values4[2])

    # panel 5: UMAP coloured by the signature score
    # show=False keeps scanpy from displaying the figure before the layout is
    # finalized and the file is written
    sc.pl.umap(
        adata,
        color=score_col,
        vmin=0,
        vmax="p99",
        sort_order=True,
        frameon=True,
        use_raw=False,
        cmap="viridis",
        ax=axes[4],
        title=score+" Score",
        show=False,
    )

    plt.tight_layout()

    # save before showing so the written figure does not depend on the backend
    fig.savefig(os.path.join(fig_dir, score+"_Score_ICI_status_Condition_Violin_UMAP.pdf"), dpi=600, format="pdf", bbox_inches="tight")

    plt.show()

## Thesis / Poster Plots

In [None]:
# figure settings for the thesis / poster panels
import os

work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"
fig_dir = os.path.join(work_dir, "figures", "TFM", "Fig2")
# scanpy `save=` outputs go to the same folder as manually saved figures
sc.settings.figdir = fig_dir
sc.set_figure_params(
    dpi=120,
    dpi_save=600,
    format='png',
    frameon=False,
    figsize=(3, 3),
)

In [None]:
# load the annotated T/NK AnnData object from the project output folder
tnk_h5ad_path = os.path.join(
    work_dir,
    "data",
    "outputdata",
    "combined",
    "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad",
)
adata = sc.read_h5ad(tnk_h5ad_path)

In [None]:
# count number of cells: display the AnnData summary to read off its dimensions
adata

In [None]:
# UMAP coloured by the coarse annotation (Annotation_1.0)
# the cell count in the title is read from the object instead of being typed in
sc.pl.umap(
        adata,
        color="Annotation_1.0",
        vmin=0,
        vmax="p99",
        sort_order=False,
        frameon=False,
        use_raw=False,
        palette="tab20c",
        title=f"T/NK Cells ({adata.n_obs})",
        legend_fontsize=8,
        legend_loc="right margin",
        save = "_TNK_Annotation_1.0.pdf"
)

In [None]:
# UMAP coloured by the fine annotation (Annotation_2.0)
# the cell count in the title is read from the object instead of being typed in
sc.pl.umap(
        adata,
        color="Annotation_2.0",
        vmin=0,
        vmax="p99",
        sort_order=False,
        frameon=False,
        use_raw=False,
        palette="tab20c",
        title=f"T/NK Cells ({adata.n_obs})",
        legend_fontsize=8,
        legend_loc="right margin",
        save = "_TNK_Annotation_2.0.pdf"
)

In [None]:
# Annotation_2.0 UMAP restricted to the cells with timepoint == "SCR"
# (saved with the `_T0` suffix)
adata_t0 = adata[adata.obs.timepoint == "SCR"]
sc.pl.umap(
        adata_t0,
        color="Annotation_2.0",
        vmin=0,
        vmax="p99",
        sort_order=False,
        frameon=False,
        use_raw=False,
        palette="tab20c",
        # report the number of cells actually plotted, not the full-object count
        title=f"T/NK Cells ({adata_t0.n_obs})",
        legend_fontsize=8,
        legend_loc="right margin",
        save = "_TNK_Annotation_2.0_T0.pdf"
)

In [None]:
# Annotation_2.0 UMAP restricted to the cells with timepoint != "SCR"
# (saved with the `_T1` suffix)
adata_t1 = adata[adata.obs.timepoint != "SCR"]
sc.pl.umap(
        adata_t1,
        color="Annotation_2.0",
        vmin=0,
        vmax="p99",
        sort_order=False,
        frameon=False,
        use_raw=False,
        palette="tab20c",
        # report the number of cells actually plotted, not the full-object count
        title=f"T/NK Cells ({adata_t1.n_obs})",
        legend_fontsize=8,
        legend_loc="right margin",
        save = "_TNK_Annotation_2.0_T1.pdf"
)

In [None]:
# marker genes grouped by T/NK population or cell state; the insertion order
# of the groups is the order in which they are listed here
marker_genes_dict = {
    "CD4 T": ["CD4"],
    "CD8 T": ["CD8A", "CD8B"],
    "Naive": ["CCR7", "SELL"],
    "Memory": ["IL7R"],
    "TRM": ["ITGAE", "ZNF683"],
    "Effector": [
        "IFNG",
        "TNF",
        "GZMK",
        "GZMB",
        "PRF1",
    ],
    "Exhaustion": [
        "HAVCR2",
        "PDCD1",
        "CTLA4",
        "LAG3",
    ],
    "Proliferation": ["MKI67", "TOP2A"],
    "Treg": ["FOXP3"],
    "MAIT": ["TRAV1-2"],
    "NK": [
        "GNLY",
        "KLRF1",
        "NCAM1",
        "FCGR3A",
    ],
    "γδ T": ["TRDC", "TRGC1"],
}

In [None]:
# display order of the Annotation_2.0 labels
order = [
    'CD4 T Naive',
    'CD4 T Central Memory',
    'CD4 T CM/EarlyActivated',
    'CD4 T ISG+',
    'CD4 T Helper-like',
    'CD4 T Follicular Helper',
    'CD4 T Proliferative',
    'CD4 T Regulatory',
    'CD8 T Naive',
    'CD8 T Effector',
    'CD8 T TRM Effector',
    'CD8 T TRM PreExhausted',
    'CD8 T TRM Exhausted',
    'CD8 T Proliferative',
    'CD8 T MAIT',
    'NKT-like',
    'CD56dim CD16hi NK',
    'CD56hi CD16lo',
    'γδ T-like',
    'Cycling γδ T-like',
]
annot_col = 'Annotation_2.0'
# re-encode the column as a categorical whose categories follow `order`;
# labels not listed in `order` become missing values
adata.obs[annot_col] = pd.Categorical(adata.obs[annot_col], categories=order)

In [None]:
# dot plot of the marker genes per Annotation_2.0 label, with expression
# standardized per gene (standard_scale='var')
dotplot_kwargs = dict(
    groupby='Annotation_2.0',
    dendrogram=False,
    use_raw=False,
    swap_axes=False,
    standard_scale='var',
    smallest_dot=40,
    color_map='Reds',
    save="Marker_Genes_Dotplot2.pdf",
    figsize=(16, 6),
)
sc.pl.dotplot(adata, marker_genes_dict, **dotplot_kwargs)

In [None]:
# one UMAP panel per selected marker gene, all in a single row,
# drawn without legend or colour bar
markers = [
    "CD4",
    "CD8B",
    "KLRF1",
    "FOXP3",
    "MKI67",
    "HAVCR2",
]
marker_umap_kwargs = dict(
    color=markers,
    vmin=0,
    vmax="p99",
    sort_order=True,
    frameon=True,
    use_raw=False,
    save="Markers.pdf",
    legend_loc=None,
    colorbar_loc=None,
    cmap="Reds",
    add_outline=False,
    ncols=6,
)
sc.pl.umap(adata, **marker_umap_kwargs)