# Signatures by sample heatmap/dot plot
- make dot plot for Figure S1
- signatures are from Table S2 of the paper, split into 3 groups

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import seaborn as sn

In [None]:
import sys; sys.path.append("../resources/")
import dotplot_utils as cody_dot # copied from cody's resources/dotplot_utils.py into my directory

In [None]:
import anndata as ad
sys.path.append("../resources/scRNA/")
import zc_function as zc

In [None]:
# Make the output directories.
import os

# makedirs creates the parent "scRNA_out" together with the nested folder, and
# exist_ok=True makes re-running the cell a no-op without a separate
# exists-then-create check.
os.makedirs("scRNA_out/FFPE_scRNA/", exist_ok=True)

## load data and signature

In [None]:
# Folder holding the h5ad files (absolute, machine-specific path).
dat_path = '/home/lucy/Dropbox (VU Basic Sciences)/immune_exclusion/h5ad_files/'
# Read the combined dataset from that folder.
dat = sc.read(f"{dat_path}outer_combined_all4_dat.h5ad")

In [None]:
dat.obs.SampleId.unique()

In [None]:
# Supplementary Table S2 from the paper; each column is later read as the
# gene list of one signature.
sigs = pd.read_csv(
    "./tableS2.csv",
    header=0,
    index_col=None,
)
sigs.head()

In [None]:
# The two signatures used for the quick dot plot test further down.
test_group = [
    'Cytotoxicity',
    'Exhaustion',
]

In [None]:
#sigs.iloc[:, 8:15 ].head()

## define dictionaries for plotting

In [None]:
# Label -> hex color lookup, copied from Cody's step3/scRNA_summaries.ipynb.
cmap_dict = {
    # Tumor type
    'SSL/HP': "#c4a4e1",
    'MSI-H': "#7a4fa3",
    'MSS': "#ffc101",
    'TA/TVA': "#fee799",
    'NL': "#1f77b4",
    # Tumor location
    "Cecum": "#62a32e",
    "Ascending": "#b0dc7d",
    "Hepatic Flexure": "#ebf6db",
    "Transverse": "#fbe6f1",
    "Descending": "#eeadd4",
    "Sigmoid": "#cf4191",
    # Global fallback for missing values
    "nan": "#ffffff",
    # Black / white for the T and F flags
    "T": "#000000",
    "F": "#ffffff",
}

In [None]:
# Distinct sample IDs found in the dataset, in order of first appearance.
patient_id_ls = list(dat.obs.SampleId.unique())
print(patient_id_ls)

In [None]:
# Sample ID -> tumor type label (the labels are keys of cmap_dict).
tumor_type_dict = {
    '9142_s2': 'MSS',
    '9142_s1': 'MSI-H',
    '10096_s1': 'MSI-H',
    '10096_s2': 'MSS',
    '10096_s3': 'MSS',
    '10096_s4': 'MSI-H',
    '10180_01_s1': 'MSI-H',
    '10180_01_s2': 'MSI-H',
    '10180_01_s3': 'MSI-H',
    '10180_01_s4': 'MSS',
    '10180_02_s1': 'MSS',
    '10180_02_s2': 'TA/TVA',
    '10180_02_s3': 'MSS',
    '10180_02_s4': 'MSS',
    '10284_s1': 'MSI-H',
    '10284_s2': 'MSS',
    '10284_s3': 'MSS',
    '10284_s4': 'MSI-H',
}

In [None]:
# Sample ID -> hex color of that sample's tumor type.
patient_color_dict = {
    sample_id: cmap_dict[tumor_type_dict[sample_id]]
    for sample_id in patient_id_ls
}

## test dotplot with 2 signatures
- pass test

In [None]:
#dat.X.sum(axis = 1) #check if raw data

In [None]:
# Store a copy of dat in its current state under dat.raw before dat is handed
# to zc.normalization in the next cell.
dat.raw = dat.copy()

In [None]:
# NOTE(review): the return value is not captured, so this presumably
# normalizes dat in place — confirm against zc_function.normalization.
zc.normalization(dat)

In [None]:
# Score only the signatures named in test_group; each score is stored under
# the signature's own name.
for sig_name in test_group:
    sig_genes = sigs[sig_name].dropna()
    sc.tl.score_genes(
        dat,
        gene_list=sig_genes,
        score_name=sig_name,
        use_raw=False,
    )

In [None]:
dat

In [None]:
#sc.pl.umap(dat, color = 'Cytotoxicity')

In [None]:
patient_color_dict

In [None]:
# Tumor type label for every observation, looked up from its SampleId.
tumor_type = [
    tumor_type_dict[sample_id]
    for sample_id in dat.obs.SampleId
]

In [None]:
# Attach the per-observation tumor type labels as the obs column "Tumor_Type".
dat.obs["Tumor_Type"] = tumor_type

In [None]:
# Trial dot plot: the two test signatures grouped by SampleId, with groups
# colored by tumor type.
# NOTE(review): size_title says "spots" although this notebook writes to
# scRNA_out — confirm the legend wording is intended.
cody_dot.cody_heatmap(
    dat,
    groupby='SampleId',
    features=test_group,
    groupby_order=None,
    groupby_colordict=patient_color_dict,
    cluster_vars=False,
    vars_dict=None,
    cmap="Greys",
    dpi=400,
    size_title="Fraction of spots\nin group (%)",
    save=None,
)
# Arguments seen in the call this was adapted from, not used here:
#   save="ST_out/plots_overview/{}_archetypes_dotplot.png".format(group_name)
#   figsize=(width, archetypes_height)
#   cluster_obs=True if groupby_order is None else False

## score all signatures

In [None]:
# NOTE(review): if the 2-signature test cells were run earlier in this session,
# dat has already been passed through zc.normalization, so this replaces the
# earlier dat.raw snapshot with the post-normalization object — confirm this
# is intended, or reload dat before running this section.
dat.raw= dat.copy()

In [None]:
# NOTE(review): zc.normalization(dat) is also called in the 2-signature test
# section; running both in one session applies it twice unless the helper is
# idempotent — TODO confirm against zc_function.
zc.normalization(dat)

In [None]:
sigs.columns

separate signatures into 3 groups: g1, g2, g3

In [None]:
# Signature group 1 (column names of sigs).
g1 = [
    'Cytotoxicity',
    'Exhaustion',
    'Cytokines',
    'Chemokines',
    'MHC',
    'Fetal',
    'Stem',
    'Metaplasia',
    'Bacterial response',
    'iCMS2_Up',
    'iCMS3_Up',
    'Proliferation',
    'Translation',
]

In [None]:
len(g1)

In [None]:
# Signature group 2 (column names of sigs).
g2 = [
    'T cell exhaustion',
    'B cell',
    'NK cell',
    'Plasma cell',
    'T reg resting',
    'T reg suppressive',
    'T reg tissue homing',
    'T reg cytokines',
    'T reg activation',
    'TH1',
    'TH2',
    'TH17',
    'TRM',
    'M1',
    'M2',
    'Costimulatory MYE',
    'Stimulatory DCs',
    'IFN stimulated MYE',
    'EMT',
    'Senescence',
    'SASP',
    'IFN stimulated EPI',
    'Stress response',
    'Fibrosis',
    'T cell',
    'Myeloid',
    'Stroma',
    'T reg',
    'T cell CD4',
    'T cell CD8',
    'cDC2',
    'cDC1',
    'Macrophage',
    'Classical monocytes',
]

In [None]:
len(g2)

In [None]:
# Signature group 3 (column names of sigs).
g3 = [
    'Cycle',
    'Stress',
    'Interferon',
    'Hypoxia',
    'Metal',
    'Mesenchymal',
    'pEMT',
    'Alveolar',
    'Basal',
    'Squamous',
    'Glandular',
    'Ciliated',
    'AC',
    'OPC',
    'NPC',
]

In [None]:
len(g3)

In [None]:
# Signature name -> genes of that signature that are present in the dataset's
# gene space; filled in by the scoring loops.
fond_dict = {}

In [None]:

# Score every group-1 signature using only the genes found in dat.var_names.
for sig_name in g1:
    listed_genes = sigs[sig_name].dropna()  # all genes listed for the signature
    found_genes = listed_genes[listed_genes.isin(dat.var_names)]  # keep genes in the dataset

    print(f"scoring {sig_name}")
    print(f"{len(found_genes)}/{len(listed_genes)} genes scored for this signature")
    fond_dict[sig_name] = found_genes  # remember which genes were found

    sc.tl.score_genes(dat, gene_list=found_genes, score_name=sig_name, use_raw=False)
    

group 2 signatures

In [None]:
# Score the first 15 group-2 signatures using only genes found in dat.var_names.
for sig_name in g2[:15]:
    listed_genes = sigs[sig_name].dropna()  # all genes listed for the signature
    found_genes = listed_genes[listed_genes.isin(dat.var_names)]

    print(f"scoring {sig_name}")
    print(f"{len(found_genes)}/{len(listed_genes)} genes scored for this signature")
    fond_dict[sig_name] = found_genes

    sc.tl.score_genes(dat, gene_list=found_genes, score_name=sig_name, use_raw=False)

In [None]:
dat

In [None]:
# Score the remaining group-2 signatures (position 15 onward); the printed
# index is the signature's position within g2.
for position, sig_name in enumerate(g2[15:], start=15):
    listed_genes = sigs[sig_name].dropna()  # all genes listed for the signature
    found_genes = listed_genes[listed_genes.isin(dat.var_names)]

    print(f"({position}) scoring {sig_name}")
    print(f"{len(found_genes)}/{len(listed_genes)} genes scored for this signature")
    fond_dict[sig_name] = found_genes

    sc.tl.score_genes(dat, gene_list=found_genes, score_name=sig_name, use_raw=False)

In [None]:
# Score every group-3 signature using only the genes found in dat.var_names.
for position, sig_name in enumerate(g3):
    listed_genes = sigs[sig_name].dropna()  # all genes listed for the signature
    found_genes = listed_genes[listed_genes.isin(dat.var_names)]

    print(f"({position}) scoring {sig_name}")
    print(f"{len(found_genes)}/{len(listed_genes)} genes scored for this signature")
    fond_dict[sig_name] = found_genes

    sc.tl.score_genes(dat, gene_list=found_genes, score_name=sig_name, use_raw=False)

## encode sample ID with patient ID names and combine replicates of each sample

In [None]:
# Sample metadata table; the first row of Sheet1 holds the column names.
FFPE_tabel = pd.read_excel(
    "./FFPE_metatable.xlsx",
    sheet_name='Sheet1',
    engine='openpyxl',
    header=0,
)

In [None]:
FFPE_tabel

In [None]:
#find replicate samples

In [None]:
FFPE_tabel[FFPE_tabel.duplicated(['Block ID'])]

In [None]:
FFPE_tabel[FFPE_tabel.duplicated(['Patient_label'])]

Only two patient IDs have duplicate entries, so look up the block IDs of the duplicates and create a new patient ID column with a suffix to distinguish different blocks.

In [None]:
# Independent copy of the patient labels; edits to it leave FFPE_tabel untouched.
new_patID = FFPE_tabel.Patient_label.copy()
new_patID.head()

In [None]:
FFPE_tabel[FFPE_tabel['Patient_label'] == 'PAT71397'] # they are from different blocks and should map to different plot IDs 

In [None]:
# Give the two PAT71397 entries distinct IDs.
# NOTE(review): row labels 9 and 10 are hard-coded and presumably are the two
# PAT71397 rows shown by the filter above — verify if the table changes.
new_patID.loc[9] = 'PAT71397_1'
new_patID.loc[10] = 'PAT71397_2'

In [None]:
FFPE_tabel[FFPE_tabel['Patient_label'] == 'SG00001'] # they are replicates, should not change

In [None]:
# Store the adjusted patient IDs as the 'plot_id' column of the metadata table.
FFPE_tabel['plot_id'] = new_patID

## make dot plot

In [None]:
# Tumor type label for every observation, looked up from its SampleId.
tumor_type = [
    tumor_type_dict[sample_id]
    for sample_id in dat.obs.SampleId
]

In [None]:
# Attach the per-observation tumor type labels as the obs column "Tumor_Type".
dat.obs["Tumor_Type"] = tumor_type

### map patient ID to tumor type colors

In [None]:
# FFPE meta table reduced to the plot_id / MSS status columns, one row per
# distinct pair. drop_duplicates() returns a new DataFrame rather than editing
# in place, so its result is assigned; otherwise FFPE_tabel2 would keep the
# repeated rows.
FFPE_tabel2 = FFPE_tabel[['plot_id', 'MSS status']].drop_duplicates()
FFPE_tabel2

make a cmap to map plot ID to MSS status color

In [None]:
# plot_id -> hex color of the row's 'MSS status' value.
patient_color_dict2 = {
    plot_id: cmap_dict[status]
    for plot_id, status in zip(FFPE_tabel2['plot_id'], FFPE_tabel2['MSS status'])
}

In [None]:
patient_color_dict2

add plot id column to dat.obs

In [None]:
# SampleId -> plot_id lookup taken from the metadata table.
sample_plot_id_dict = dict(zip(FFPE_tabel["SampleId"], FFPE_tabel["plot_id"]))

In [None]:
# plot_id for every observation, looked up from its SampleId.
dat_plotId = [
    sample_plot_id_dict[sample_id]
    for sample_id in dat.obs.SampleId
]

In [None]:
# Attach the per-observation plot IDs as the obs column 'plot_id'.
dat.obs['plot_id'] = dat_plotId

sort plot IDs to match Cody's order

In [None]:
# Reorder the metadata table in place: by MSS status first, then by plot_id.
FFPE_tabel.sort_values(
    ['MSS status', 'plot_id'],
    inplace=True,
)

In [None]:
FFPE_tabel[['plot_id','MSS status'] ]

In [None]:
print(list( FFPE_tabel['plot_id']) )

In [None]:
# Order in which the plot_id groups are passed to the dot plots as
# groupby_order. Each plot_id is listed exactly once: the metadata table has
# one row per sample, so the two SG00001 replicate rows print the same plot_id
# twice, but they form a single group when grouping by plot_id.
ordered_plot_id = [
    'PAT71397_1', 'PAT30884', 'PAT59600', 'PAT59667', 'PAT71397_2',
    'PAT71662', 'PAT73899', 'PAT74143', 'SG00003', 'SG00004',
    'PAT00222', 'PAT33430', 'PAT54273', 'PAT59460', 'PAT73458',
    'SG00001', 'SG00002',
]

### plot

In [None]:
# Dot plot of the group-1 signature scores, grouped by plot_id in the
# hand-set order, with groups colored by MSS status.
# NOTE(review): size_title says "spots" although this notebook writes to
# scRNA_out — confirm the legend wording is intended.
cody_dot.cody_heatmap(
    dat,
    groupby='plot_id',
    features=g1,
    groupby_order=ordered_plot_id,
    groupby_colordict=patient_color_dict2,
    cluster_vars=False,
    vars_dict=None,
    figsize=[8, 5],
    cmap="Greys",
    dpi=400,
    size_title="Fraction of spots\nin group (%)",
    save=None,
)
# Arguments seen in the call this was adapted from, not used here:
#   save="ST_out/plots_overview/{}_archetypes_dotplot.png".format(group_name)
#   figsize=(width, archetypes_height)
#   cluster_obs=True if groupby_order is None else False

In [None]:
# Dot plot of the group-2 signature scores (taller figure for the 34
# signatures), grouped by plot_id in the hand-set order.
# NOTE(review): size_title says "spots" although this notebook writes to
# scRNA_out — confirm the legend wording is intended.
cody_dot.cody_heatmap(
    dat,
    groupby='plot_id',
    features=g2,
    groupby_order=ordered_plot_id,
    groupby_colordict=patient_color_dict2,
    cluster_vars=False,
    vars_dict=None,
    figsize=[8, 15],
    cmap="Greys",
    dpi=400,
    size_title="Fraction of spots\nin group (%)",
    save=None,
)

In [None]:
# Dot plot of the group-3 signature scores, grouped by plot_id in the
# hand-set order, with groups colored by MSS status.
# NOTE(review): size_title says "spots" although this notebook writes to
# scRNA_out — confirm the legend wording is intended.
cody_dot.cody_heatmap(
    dat,
    groupby='plot_id',
    features=g3,
    groupby_order=ordered_plot_id,
    groupby_colordict=patient_color_dict2,
    cluster_vars=False,
    vars_dict=None,
    figsize=[8, 5],
    cmap="Greys",
    dpi=400,
    size_title="Fraction of spots\nin group (%)",
    save=None,
)

In [None]:
# obs columns to export: every signature score plus the two ID columns.
sig_and_id = [
    # group 1 signatures
    'Cytotoxicity', 'Exhaustion', 'Cytokines', 'Chemokines', 'MHC', 'Fetal',
    'Stem', 'Metaplasia', 'Bacterial response', 'iCMS2_Up', 'iCMS3_Up',
    'Proliferation', 'Translation',
    # tumor type label
    'Tumor_Type',
    # group 2 signatures
    'T cell exhaustion', 'B cell', 'NK cell', 'Plasma cell', 'T reg resting',
    'T reg suppressive', 'T reg tissue homing', 'T reg cytokines',
    'T reg activation', 'TH1', 'TH2', 'TH17', 'TRM', 'M1', 'M2',
    'Costimulatory MYE', 'Stimulatory DCs', 'IFN stimulated MYE', 'EMT',
    'Senescence', 'SASP', 'IFN stimulated EPI', 'Stress response', 'Fibrosis',
    'T cell', 'Myeloid', 'Stroma', 'T reg', 'T cell CD4', 'T cell CD8',
    'cDC2', 'cDC1', 'Macrophage', 'Classical monocytes',
    # group 3 signatures
    'Cycle', 'Stress', 'Interferon', 'Hypoxia', 'Metal', 'Mesenchymal',
    'pEMT', 'Alveolar', 'Basal', 'Squamous', 'Glandular', 'Ciliated', 'AC',
    'OPC', 'NPC',
    # plotting group ID
    'plot_id',
]

In [None]:
# Independent copy of the selected obs columns (signature scores + IDs).
dot_plot_signature_df = dat.obs.loc[:, sig_and_id].copy()

In [None]:
dot_plot_signature_df.head()

## save data

In [None]:
# Write the score table to the output folder, keeping column names and index.
dot_plot_signature_df.to_csv(
    "scRNA_out/FFPE_scRNA/dotplot_signature_score_df.csv",
    header=True,
    index=True,
)