# Overview 

### Each topic outputs a set of genes with a score. `scanpy.tl.score_genes` and the PCa cell-type signatures from H. Song were used to rank cell types within each topic. The highest-ranked genes and cell types per topic were used for further analysis.

Imports 

In [1]:
import pandas as pd 
import scanpy as sc 
import numpy as np


Load  cell pie output - topics x genes 

In [2]:
# Location of the .h5ad results file; sc.read loads it into `iac_adata`, which the
# scoring and ranking cells below operate on.
# NOTE(review): presumably one observation per topic and one variable per gene
# (per the "topics x genes" heading) — confirm against the file.
iac_path = 'Results/genes_prostate_reproduce.h5ad' 
iac_adata = sc.read(iac_path)

Load PCa cell type signatures from H.Song - https://www.nature.com/articles/s41467-021-27322-4

In [3]:
def load_cell_type_signatures(path='Data/Cell_type_markers.xlsx', sheet_name='PCa signature'):
    ''' Load the PCa cell-type marker table and merge the two tumour signatures.

    The 'ERGneg_Tumor' and 'ERGpos_Tumor' columns are replaced by a single
    'Tumor' column holding the sorted union of their (non-missing) markers.

    Parameters
    ----------
    path : str
        Excel workbook with the marker table (SET PATH TO cell_type_markers).
    sheet_name : str
        Sheet to read from the workbook.

    Returns
    -------
    pandas.DataFrame
        One column per cell type, one marker per row. Columns shorter than the
        longest one are padded with real missing values (NaN), the same way
        pandas pads the other columns, instead of the string 'NaN'.
    '''
    cell_type_signatures = pd.read_excel(path, sheet_name=sheet_name)

    # Union of the ERG-negative and ERG-positive tumour markers (np.unique also sorts).
    erg_neg = cell_type_signatures['ERGneg_Tumor'].dropna().values
    erg_pos = cell_type_signatures['ERGpos_Tumor'].dropna().values
    tumour = np.unique(np.hstack((erg_pos, erg_neg)))

    # The union can be longer than the table (up to twice its length), so size the
    # result from the data instead of relying on a hard-coded row count.
    n_rows = max(len(cell_type_signatures), len(tumour))

    # Replace ERGpos_Tumor / ERGneg_Tumor with the combined Tumor column.
    cell_type_signatures = (
        cell_type_signatures
        .drop(['ERGpos_Tumor', 'ERGneg_Tumor'], axis=1)
        .reset_index(drop=True)
        .reindex(range(n_rows))
    )
    # reindex pads the tail with NaN so the column matches the table length.
    cell_type_signatures['Tumor'] = pd.Series(tumour, dtype=object).reindex(range(n_rows))
    return cell_type_signatures


In [4]:
# Build the merged signature table (one column per cell type) and display it
# as this cell's output.
cell_type_signatures = load_cell_type_signatures()
cell_type_signatures

Unnamed: 0,BE,Club,LE,Endothelial,Fibroblast,Smooth_muscle,T-cells,Myeloid,Neurons,B-cells,Tumor
0,KRT15,MMP7,MSMB,VWF,DCN,MYH11,IL7R,IL1B,S100B,IGKC,A2M
1,KRT5,PIGR,ACPP,SELE,FBLN1,RGS5,TRBC2,HLA-DRA,NRXN1,CD79A,AGR2
2,KRT13,OLFM4,NEFH,IFI27,COL1A2,ACTA2,CCL5,HLA-DPA1,SOX2,CXCR4,ALDH1A3
3,SLC14A1,CP,ORM1,FLT1,IGF1,TAGLN,IFNG,HLA-DPB1,SOX10,CD37,ARL5B
4,DST,RARRES1,ANPEP,SPARCL1,C7,MYL9,CD8A,HLA-DRB1,,MS4A1,CACNA1D
...,...,...,...,...,...,...,...,...,...,...,...
200,,,,,,,,PLCB2,,,
201,,,,,,,,EMR2,,,
202,,,,,,,,C10orf54,,,
203,,,,,,,,SERPINA1,,,


Score each cell type within each topic using the topic-scored, ranked genes

In [5]:
def score_celltypes_per_topic(iac_adata, cell_type_signatures):
    ''' Score every cell-type signature against the data with sc.tl.score_genes.

    Parameters
    ----------
    iac_adata : object accepted by sc.tl.score_genes
        Data to score; it is modified in place (one score per cell type, stored
        under the cell type's name via `score_name`).
    cell_type_signatures : pandas.DataFrame
        One column per cell type, one marker gene per row; shorter columns are
        padded with missing values.

    Returns
    -------
    The same `iac_adata` object that was passed in.
    '''
    for cell_type in cell_type_signatures.columns.tolist():
        # Only real gene names should reach score_genes: drop the missing-value
        # padding of shorter columns ...
        markers = cell_type_signatures[cell_type].dropna()
        # ... and the literal string 'NaN', which may have been used as padding.
        markers = markers[markers != 'NaN']
        sc.tl.score_genes(iac_adata, markers.tolist(), score_name=cell_type)

    return iac_adata
    

In [6]:
# Score every cell-type signature. score_celltypes_per_topic returns the same
# object it was given, so this rebinding does not create a copy; the scores are
# read back from iac_adata.obs when ranking cell types below.
iac_adata = score_celltypes_per_topic(iac_adata, cell_type_signatures)



  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):
  for cut in np.unique(obs_cut.loc[gene_list]):


Create Excel files containing the ranked cell types and the ranked genes per topic, with their scores

In [7]:
def rank_celltype_in_topic(topic_n, adata):
    ''' Rank cell types for one topic by their score, highest first.

    Parameters
    ----------
    topic_n : int
        Positional row of `adata.obs` to rank.
    adata : object with an `obs` DataFrame
        The first `obs` column is skipped; the remaining columns are ranked.
        NOTE(review): assumes every column after the first is a cell-type
        score — confirm against the loaded file.

    Returns
    -------
    pandas.Series
        Scores indexed by column name, sorted descending and named
        'sc.tl.score_gene'.
    '''
    print('TOPIC ', topic_n)
    topic_col = adata.obs.iloc[topic_n, 1:].sort_values(ascending=False)
    # A single row is a Series, so assigning `.columns` would only set a stray
    # attribute; the label has to be applied as the Series name.
    return topic_col.rename('sc.tl.score_gene')

In [8]:
def find_top_n_markers(topic_marker_genes, topic, n_genes):
    ''' Return every gene with its score for one topic, highest score first.

    The gene column (position 0) and the column of the requested topic
    (position topic + 1) are selected, ordered by the 'Topic_<topic>' column in
    descending order and relabelled as 'Genes' / 'Topic score'.

    Note: `n_genes` is accepted but not applied; all genes are returned.
    '''
    score_column = f'Topic_{topic}'
    gene_and_score = topic_marker_genes.iloc[:, [0, topic + 1]]
    ranked = gene_and_score.sort_values(by=score_column, ascending=False)
    ranked.columns = ['Genes', 'Topic score']
    return ranked

In [9]:
import pandas as pd
topic_path = 'Results/marker_genes_prostate_reproduce.csv' # SET PATH to marker_genes_prostate_cancer_invasive_9topics_svd_normalised.csv
def load_topic_csv(topic_path):
    ''' Load the per-topic gene score table and give its columns standard names.

    Parameters
    ----------
    topic_path : str or file-like
        CSV whose first column holds gene names and whose remaining columns
        hold one score per topic.

    Returns
    -------
    pandas.DataFrame
        Columns renamed to 'Genes', 'Topic_0', 'Topic_1', ... in file order.
    '''
    topic_marker_genes = pd.read_csv(topic_path)

    # Derive the topic count from the file so tables with any number of topics
    # load, rather than only those with exactly nine.
    n_topics = topic_marker_genes.shape[1] - 1
    topic_marker_genes.columns = ['Genes'] + ['Topic_' + str(i) for i in range(n_topics)]
    return topic_marker_genes

# Module-level table of per-topic gene scores; results_output() reads it by name.
topic_marker_genes = load_topic_csv(topic_path)

In [10]:
topic_marker_genes

Unnamed: 0,Genes,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8
0,KLK3,9.693339e+01,9.693339e-15,4.061924e-01,9.693339e-15,3.918281e+00,9.693339e-15,2.183140e+01,1.785393e-01,9.693339e-15
1,KLK2,5.379239e+01,9.693339e-15,9.693339e-15,9.693339e-15,2.093482e+00,9.693339e-15,1.208155e+01,9.693339e-15,9.693339e-15
2,EEF2,2.577244e+01,2.467340e+00,9.693339e-15,2.485429e+00,1.194174e+00,4.021616e-02,4.820307e+00,2.716696e+00,1.891999e+00
3,UBC,2.103491e+01,1.468043e+01,4.269452e+00,1.903673e+01,9.693339e-15,8.818307e-01,9.693339e-15,8.257946e+00,7.315742e+00
4,ACTB,1.379774e+01,2.356023e+01,2.920128e+00,1.329138e+01,9.693339e-15,1.058018e+00,9.693339e-15,7.687906e+00,6.508517e+00
...,...,...,...,...,...,...,...,...,...,...
15930,RASGEF1C,9.693339e-15,3.628410e-03,9.693339e-15,1.958293e-02,9.693339e-15,9.693339e-15,9.693339e-15,1.604830e-02,6.232896e-02
15931,DRD2,9.693339e-15,9.693339e-15,1.911181e-03,4.618055e-03,9.693339e-15,9.693339e-15,9.693339e-15,1.261923e-03,1.012017e-02
15932,ANKK1,9.693339e-15,9.693339e-15,1.844314e-04,4.255818e-03,9.693339e-15,9.693339e-15,3.089907e-04,9.693339e-15,9.693339e-15
15933,FAM169B,9.693339e-15,9.693339e-15,7.923150e-04,9.693339e-15,9.693339e-15,9.693339e-15,9.693339e-15,2.814452e-03,9.693339e-15


In [11]:
def results_output(n_topics=None):
    ''' Creates two excel files to store results:
    1. ranked_celltypes_per_topic_PC_prostate_reprod.xlsx
    2. ranked_genes_per_topic_PC_prostate_reprod.xlsx

    Each workbook has one sheet per topic, named 'topic_<i>'. The function
    reads the module-level `iac_adata` and `topic_marker_genes`.

    Parameters
    ----------
    n_topics : int, optional
        Number of topics to export. Defaults to the number of score columns in
        `topic_marker_genes` (every column after the gene column).
    '''
    if n_topics is None:
        n_topics = topic_marker_genes.shape[1] - 1
    topics = range(n_topics)

    # Generate ranked / scored cell types and genes per topic
    celltype_rank_dict = {i: rank_celltype_in_topic(i, iac_adata) for i in topics}
    top_markers_dict = {i: find_top_n_markers(topic_marker_genes, i, n_genes=5) for i in topics}

    # Save to excel: one workbook for cell types, one for genes
    with pd.ExcelWriter('ranked_celltypes_per_topic_PC_prostate_reprod.xlsx') as writer:
        for i in topics:
            celltype_rank_dict[i].to_excel(writer, sheet_name='topic_' + str(i))

    with pd.ExcelWriter('ranked_genes_per_topic_PC_prostate_reprod.xlsx') as writer:
        for i in topics:
            top_markers_dict[i].to_excel(writer, sheet_name='topic_' + str(i))


In [12]:
# Write both Excel workbooks; rank_celltype_in_topic prints one "TOPIC n" line per topic.
results_output()

TOPIC  0
TOPIC  1
TOPIC  2
TOPIC  3
TOPIC  4
TOPIC  5
TOPIC  6
TOPIC  7
TOPIC  8
