In [2]:
############### run single-sample GSEA (ssGSEA) for cancer patient or organoid data ###############
import gseapy
import gseapy as gp
import scipy.stats as stat
import numpy as np
import time, os
import pandas as pd
from collections import defaultdict


# Execute each project helper script in this module's global namespace so the
# names it defines become available below. The order is kept as listed.
for util_path in (
    "../utilities/pathway_utilities.py",
    "../utilities/Preclinical_model_data.py",
    "../utilities/patient_expression.py",
):
    with open(util_path) as handle:
        exec(compile(handle.read(), util_path, 'exec'), globals())

# presumably geneID2ensemble is defined by one of the scripts executed above
g2en = geneID2ensemble()
## INITIALIZE
#======================
# INITIALIZE PARAMETERS
source = 'Organoid' # 'organoid', 'TCGA'
cancer_type = 'LUAD'
testing_pathway_list = ['REACTOME']

#==================
# IMPORT EXPRESSION
print ('importing expression for %s, ' %source, time.ctime())

expDic = {} # { sample ID : { gene in uniprot : exp } }
expDic_geneID = {} # { sample ID : { gene : exp } }
geneList, sampleList = [], []

# Pick the parser matching the requested data source (case-insensitive).
# An unrecognised source is reported immediately instead of silently
# producing an empty expression table.
if source.lower() == 'organoid':
    expDic_geneID = parse_organoid_transcriptome(cancer_type)
elif source.upper() == 'TCGA':
    expDic_geneID = parse_TCGA_log2_FPKM(cancer_type)
else:
    raise ValueError("unknown source %r (expected 'organoid' or 'TCGA')" % source)

# Sample IDs in sorted order, materialised as a list so the ordering sticks.
sampleList = sorted(expDic_geneID)

# Genes that will form the rows of the expression matrix. Only genes measured
# in every sample are kept, so a per-sample lookup can never miss. The order
# follows the last sample in the dict, which is the sample whose gene list was
# used before.
if expDic_geneID:
    per_sample_genes = list(expDic_geneID.values())
    shared_genes = set(per_sample_genes[0]).intersection(*per_sample_genes[1:])
    geneList = [gene for gene in per_sample_genes[-1] if gene in shared_genes]

#========================================
# IMPORT PATHWAYS FOR ENRICHMENT ANALYSIS
# Announce the step, then load the Reactome gene sets.
timestamp = time.ctime()
print('importing pathways, ', timestamp)
reactome = reactome_genes_ensemble()  # { pathway : [ gene list ] }
# Registry of available pathway databases, keyed by lower-case name.
pathwayDic = dict(reactome=reactome)

#print(reactome)


## PRINT ssGSEA RESULTS

#===============
# MAKE DIRECTORY
# Output goes to Result_GSEA/<CANCER_TYPE>/<source>.
# The final path is always computed in full, whether or not the directories
# already exist, so re-running the script targets the same folder every time.
# Missing parent directories (including Result_GSEA itself) are created.
dir_list = [cancer_type.upper(), source]
fo_directory = '/'.join(['Result_GSEA'] + dir_list)
os.makedirs(fo_directory, exist_ok=True)

#=====================
# MAKE GSEA INPUT FILE
fiList = os.listdir(fo_directory)

# gene expression
# Tab-separated matrix: header row "NAME, DESCRIPTION, <samples...>", then one
# row per gene with 'na' as description. An existing expression.txt is reused.
if 'expression.txt' not in fiList:
    expression_path = '%s/expression.txt' % fo_directory
    # Write to a temporary name and rename only after the whole matrix has
    # been written: a failure part-way through must not leave a truncated
    # expression.txt that a later run would mistake for a finished one.
    partial_path = expression_path + '.tmp'
    with open(partial_path, 'w') as fo:
        header = ['NAME', 'DESCRIPTION'] + list(sampleList)
        print('\t'.join(header), file=fo)
        for gene in geneList:
            row = [gene, 'na']
            row.extend(expDic_geneID[sample][gene] for sample in sampleList)
            print('\t'.join(map(str, row)), file=fo)
    os.replace(partial_path, expression_path)


#=======
# ssGSEA
# For every requested pathway database that has been loaded: write its gene
# sets to <fo_directory>/<name>.gmt, then run ssGSEA on expression.txt with
# results going to <fo_directory>/<name>_ssgsea_result.
for testing_pathway in testing_pathway_list:
    pathway_key = testing_pathway.lower()
    if pathway_key not in pathwayDic:
        continue
    print ('running ssGSEA for %s ... , ' %pathway_key, time.ctime())

    # gene sets for ssGSEA; pathways without any gene are left out
    gene_sets = {}
    for pw, members in pathwayDic[pathway_key].items():
        members = list(members)
        if members:
            gene_sets[pw] = members
    pw_list = gene_sets.keys()

    # GMT layout is: set name <TAB> description <TAB> gene <TAB> gene ...
    # The 'na' placeholder fills the description column; without it a GMT
    # reader treats the first gene of every pathway as the description and
    # drops it from the set.
    # NOTE(review): assumes the pathway lists hold gene IDs only (no leading
    # description entry) - confirm against reactome_genes_ensemble.
    gmt_path = '%s/%s.gmt' % (fo_directory, pathway_key)
    with open(gmt_path, 'w') as fo:
        for pw in pw_list:
            print('\t'.join([pw, 'na'] + gene_sets[pw]), file=fo)

    # ssGSEA
    ss = gp.ssgsea(
        data='%s/expression.txt' % (fo_directory),
        gene_sets=gmt_path,
        outdir='%s/%s_ssgsea_result' % (fo_directory, pathway_key),
        min_size=0,
        max_size=70000,
        sample_norm_method='rank',
        permutation_num=0,
        no_plot=True,
        scale=True,
    )


importing expression for Organoid,  Thu Mar 31 12:54:27 2022
importing pathways,  Thu Mar 31 12:55:10 2022
running ssGSEA for reactome ... ,  Thu Mar 31 12:55:12 2022
