In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import scanpy as sc
# get genes from meta-set
meta_f = '/home/xlv0877/proj_home/ExPert/results/e48fbfce53513a9f/perturb_metaset.h5ad'
meta_adata = sc.read(meta_f, backed='r')



In [3]:
targets = meta_adata.obs['perturbation'].unique()

In [4]:
from biomart import BiomartServer

def get_ensembl_to_external(dataset="hsapiens_gene_ensembl"):
    # Connect to the BioMart server
    server = BiomartServer("http://www.ensembl.org/biomart")
    # Select the dataset
    mart = server.datasets[dataset]

    # Query to get Ensembl gene IDs for a list of gene names
    response = mart.search({
        'attributes': ["ensembl_gene_id", "external_gene_name"],
    })
    
    data = response.raw.data.decode('ascii')
    d = []
    for row in data.splitlines():
        line = row.split('\t')
        d.append([line[0], line[1]])
    gene_map = pd.DataFrame(d, columns=['ensembl_id', 'gene_name'])
    return gene_map

In [5]:
# get mapping of ensembl ids to external gene names
gene_map = get_ensembl_to_external()

In [None]:
gene_map.to_csv('/home/xlv0877/proj_home/dl/resources/ensembl_to_name.csv')

In [6]:
gene_map.head()

Unnamed: 0,ensembl_id,gene_name
0,ENSG00000210049,MT-TF
1,ENSG00000211459,MT-RNR1
2,ENSG00000210077,MT-TV
3,ENSG00000210082,MT-RNR2
4,ENSG00000209082,MT-TL1


In [34]:
gene_info = gene_map[gene_map['gene_name'].isin(targets)]

In [30]:
# get pathway info from reactome

In [123]:
import os
pw_info_f = '/home/xlv0877/proj_home/dl/resources/reactome_ensmbl2pw_all.tsv'

pw_info = pd.read_csv(pw_info_f, sep='\t', header=None)
pw_info.columns = ['gene', 'stId', 'url', 'pathway', 'cat', 'species']
pw_info = pw_info[pw_info.species=='Homo sapiens']

In [72]:
pw_gene = pw_info.merge(gene_info, left_on='gene', right_on='ensembl_id')

In [75]:
# get pathway hierachy
pw_h_f = '/home/xlv0877/proj_home/dl/resources/pw_relations.tsv'
pw_hierachy = pd.read_csv(pw_h_f, sep='\t', header=None)
pw_hierachy.columns = ['top_lvl', 'lower_lvl']
pw_gene = pw_gene.merge(pw_hierachy, left_on='stId', right_on='top_lvl')

In [78]:
# represent genes and associated pathways as boolean table
pw_gene_mat = pd.crosstab(pw_gene['gene_name'], pw_gene['top_lvl'])

In [196]:
pw_gene_mat.to_csv('/home/xlv0877/proj_home/dl/resources/gene_to_pw.csv')

In [5]:
pw_gene_mat = pd.read_csv('/home/xlv0877/proj_home/dl/resources/gene_to_pw.csv', index_col=0)

In [86]:
pw_gene_mat

top_lvl,R-HSA-109581,R-HSA-109582,R-HSA-109606,R-HSA-109703,R-HSA-109704,R-HSA-110313,R-HSA-110373,R-HSA-110381,R-HSA-111458,R-HSA-111461,...,R-HSA-9843745,R-HSA-9851695,R-HSA-9855142,R-HSA-9856651,R-HSA-9860931,R-HSA-9861718,R-HSA-9865114,R-HSA-9865118,R-HSA-991365,R-HSA-9917777
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AACS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZSCAN32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZSWIM8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZW10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZWILCH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between genes
cos_sim = cosine_similarity(pw_gene_mat)

# Convert the result to a DataFrame for better readability
cos_sim_df = pd.DataFrame(cos_sim, index=pw_gene_mat.index, columns=pw_gene_mat.index)

In [88]:
cos_sim_df

gene_name,A1BG,A2M,AAAS,AACS,AAK1,AAMP,AARS2,AASDHPPT,AATF,ABCA7,...,ZNRF1,ZNRF2,ZRSR2,ZSCAN10,ZSCAN25,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.00000,0.31806,0.005290,0.000000,0.000000,0.598453,0.000000,0.000000,0.000000,0.0000,...,0.056724,0.056724,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,0.000000,0.000000
A2M,0.31806,1.00000,0.000000,0.000000,0.000000,0.185294,0.000000,0.000000,0.000000,0.5913,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,0.000000,0.000000
AAAS,0.00529,0.00000,1.000000,0.127754,0.000000,0.106209,0.151674,0.126605,0.156219,0.0000,...,0.010236,0.010236,0.125757,0.000000,0.02155,0.02155,0.0,0.151068,0.162450,0.162450
AACS,0.00000,0.00000,0.127754,1.000000,0.000000,0.000000,0.000000,0.677739,0.000000,0.0000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,0.000000,0.000000
AAK1,0.00000,0.00000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.0000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.334481,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZSCAN32,0.00000,0.00000,0.021550,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,...,0.000000,0.000000,0.000000,0.000000,1.00000,1.00000,0.0,0.000000,0.000000,0.000000
ZSWIM8,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,...,0.000000,0.000000,0.000000,0.822854,0.00000,0.00000,1.0,0.000000,0.000000,0.000000
ZW10,0.00000,0.00000,0.151068,0.000000,0.334481,0.408640,0.000000,0.000000,0.619771,0.0000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,1.000000,0.929933,0.929933
ZWILCH,0.00000,0.00000,0.162450,0.000000,0.000000,0.439430,0.000000,0.000000,0.666469,0.0000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.929933,1.000000,1.000000


In [91]:
from sklearn.cluster import AgglomerativeClustering


# Convert cosine similarity to distance (distance = 1 - similarity)
cosine_distance_matrix = 1 - cos_sim_df

agg_clust = AgglomerativeClustering(n_clusters=None, distance_threshold=0.2)
agg_clust_labels = agg_clust.fit_predict(cosine_distance_matrix)

In [109]:
gene2cluster = pd.DataFrame({'gene': cos_sim_df.index, 'pathway': agg_clust_labels})

In [113]:
gene2cluster.pathway = 'pw_' + gene2cluster.pathway.astype(str)

In [114]:
gene2cluster.pathway.value_counts()

pathway
pw_391     307
pw_496     115
pw_1112     88
pw_238      85
pw_547      82
          ... 
pw_458       1
pw_926       1
pw_1887      1
pw_1029      1
pw_1028      1
Name: count, Length: 2303, dtype: int64

In [115]:
gene2cluster.to_csv('/home/xlv0877/proj_home/dl/resources/gene_to_pw_group.csv')

In [11]:
cos_sim_df

gene_name,A1BG,A2M,AAAS,AACS,AAK1,AAMP,AARS2,AASDHPPT,AATF,ABCA7,...,ZNRF1,ZNRF2,ZRSR2,ZSCAN10,ZSCAN25,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.000000,0.447214,0.000000,0.0,0.000000,0.298142,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
A2M,0.447214,1.000000,0.000000,0.0,0.000000,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
AAAS,0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.057735,0.0,0.0,0.0,0.0,0.089443,0.100000,0.100000
AACS,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
AAK1,0.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.129099,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZSCAN32,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,1.0,1.0,0.0,0.000000,0.000000,0.000000
ZSWIM8,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.000000,0.000000,0.000000
ZW10,0.000000,0.000000,0.089443,0.0,0.129099,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.894427,0.894427
ZWILCH,0.000000,0.000000,0.100000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.894427,1.000000,1.000000


In [116]:
cos_sim_df['GFM1']['MRPL35']

1.0

In [117]:
cos_sim_df['GFM1'].sort_values(ascending=False).head(10)

gene_name
ERAL1     1.0
PTCD3     1.0
MRPL3     1.0
MRPL39    1.0
MRPL38    1.0
MRPL4     1.0
MRPL22    1.0
MRPL23    1.0
MRPL24    1.0
MRPL27    1.0
Name: GFM1, dtype: float64

In [None]:
# how do I integrate this into my model??
# as a second modality? --> try to reconstruct this based on the data?
# or adjust the data based on this? --> introduce special classification loss?
# could be something novel to make it stand out from scanvi

In [131]:
meta_adata.obs[meta_adata.obs['perturbation']=='GFM1']['dataset'].value_counts()

dataset
ReplogleWeissman2022_rpe1              1625
ReplogleWeissman2022_K562_gwps          437
ReplogleWeissman2022_K562_essential     235
NormanWeissman2019_filtered               0
FrangiehIzar2021_RNA                      0
DatlingerBock2021                         0
ShifrutMarson2018                         0
TianKampmann2019_day7neuron               0
TianKampmann2019_iPSC                     0
TianKampmann2021_CRISPRa                  0
TianKampmann2021_CRISPRi                  0
Name: count, dtype: int64

In [104]:
meta_adata.obs.groupby('dataset', observed=True)['perturbation'].apply(lambda x: x.value_counts()[:5])

dataset                                           
DatlingerBock2021                    control           4166
                                     LAT_2             1697
                                     LCK_2             1652
                                     JUND_2            1363
                                     FOS_2             1347
FrangiehIzar2021_RNA                 control          54938
                                     ACTA2             1433
                                     B2M               1353
                                     A2M               1302
                                     AEBP1             1275
NormanWeissman2019_filtered          control          11706
                                     KLF1              1932
                                     BAK1              1435
                                     CEBPE_RUNX1T1     1198
                                     ETS2              1185
ReplogleWeissman2022_K562_essential  control     

In [95]:
len(set(cos_sim_df.index).difference(set(gene_info.gene_name)))

0

In [121]:
cos_sim_df

gene_name,A1BG,A2M,AAAS,AACS,AAK1,AAMP,AARS2,AASDHPPT,AATF,ABCA7,...,ZNRF1,ZNRF2,ZRSR2,ZSCAN10,ZSCAN25,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.000000,0.436436,0.035714,0.000000,0.000000,0.436436,0.000000,0.000000,0.000000,0.000000,...,0.188982,0.188982,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
A2M,0.436436,1.000000,0.000000,0.000000,0.000000,0.166667,0.000000,0.000000,0.000000,0.166667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
AAAS,0.035714,0.000000,1.000000,0.047246,0.000000,0.054554,0.094491,0.047246,0.042258,0.000000,...,0.047246,0.047246,0.094491,0.000000,0.054554,0.054554,0.0,0.118217,0.133631,0.133631
AACS,0.000000,0.000000,0.047246,1.000000,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
AAK1,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.208514,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZSCAN32,0.000000,0.000000,0.054554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.0,0.000000,0.000000,0.000000
ZSWIM8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.316228,0.000000,0.000000,1.0,0.000000,0.000000,0.000000
ZW10,0.000000,0.000000,0.118217,0.000000,0.208514,0.060193,0.000000,0.000000,0.093250,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.884652,0.884652
ZWILCH,0.000000,0.000000,0.133631,0.000000,0.000000,0.068041,0.000000,0.000000,0.105409,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.884652,1.000000,1.000000


In [5]:
import requests

In [6]:
def get_query(gq: str):
    response = requests.get(gq)
    if response.status_code == 200:
        return response.json()
    else:
        return {}

In [7]:
# query all genes at once or at least in batches
batch_size = 500

batches = [targets[i:i+batch_size] for i in np.arange(len(targets), step=batch_size)]

In [8]:
# try this for one batch

In [None]:
gene_info = []
for batch in tqdm(batches, desc='Retrieving gene info from reactome', unit='batch'):
    gene_list = ','.join(batch)
    gene_query = reactome_query + gene_list + base_filter_query + f'&rows={batch_size}'
    qr = get_query(gene_query)
    # collect results
    batch_results = []
    for entry in qr['results']:
        batch_results.extend(entry['entries'])
    gene_info.append(pd.DataFrame(batch_results))

Retrieving gene info from reactome:  81%|████████▏ | 26/32 [08:33<01:54, 19.02s/batch]

In [37]:
gene_info = pd.concat(gene_info, axis=0)

In [None]:
# fuck this shit and download data straight away?

In [16]:
gene_info = pd.concat(gene_info, axis=0)

Unnamed: 0,dbId,stId,id,name,exactType,species,referenceName,referenceIdentifier,compartmentNames,compartmentAccession,isDisease,databaseName,referenceURL,disease,icon,summation
0,6810129,R-HSA-6810129,R-HSA-6810129,"<span class=""highlighting"" >ZNF335</span>",ReferenceGeneProduct,[Homo sapiens],ZNF335,Q9H4Z2,[nucleoplasm],[0005654],False,UniProt,http://purl.uniprot.org/uniprot/Q9H4Z2,False,False,
1,2454159,R-HSA-2454159,R-HSA-2454159,"<span class=""highlighting"" >ZNF711</span>",ReferenceGeneProduct,[Homo sapiens],ZNF711,Q9Y462,[nucleoplasm],[0005654],False,UniProt,http://purl.uniprot.org/uniprot/Q9Y462,False,False,
2,8850912,R-HSA-8850912,R-HSA-8850912,"<span class=""highlighting"" >FBXL8</span>",ReferenceGeneProduct,[Homo sapiens],FBXL8,Q96CD0,[cytosol],[0005829],False,UniProt,http://purl.uniprot.org/uniprot/Q96CD0,False,False,
3,9727481,R-HSA-9727481,R-HSA-9727481,"<span class=""highlighting"" >SS18L1</span>",ReferenceGeneProduct,[Homo sapiens],SS18L1,O75177,[nucleoplasm],[0005654],False,UniProt,http://purl.uniprot.org/uniprot/O75177,False,False,
4,383382,R-HSA-383382,R-HSA-383382,"<span class=""highlighting"" >PPFIA3</span>",ReferenceGeneProduct,[Homo sapiens],PPFIA3,O75145,[cytosol],[0005829],False,UniProt,http://purl.uniprot.org/uniprot/O75145,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,9630911,R-HSA-9630911,R-HSA-9630911,p16INK4A M53I,ReferenceGeneProduct,[Homo sapiens],CDKN2A,P42771,[cytosol],[0005829],True,UniProt,http://purl.uniprot.org/uniprot/P42771,True,False,
174,9630916,R-HSA-9630916,R-HSA-9630916,p16INK4A P114L,ReferenceGeneProduct,[Homo sapiens],CDKN2A,P42771,[cytosol],[0005829],True,UniProt,http://purl.uniprot.org/uniprot/P42771,True,False,
175,9606323,R-HSA-9606323,R-HSA-9606323,MUTYH-3 A359S,ReferenceIsoform,[Homo sapiens],MUTYH,Q9UIF7,[nucleoplasm],[0005654],True,UniProt,http://purl.uniprot.org/uniprot/Q9UIF7-3,True,False,
176,5683326,R-HSA-5683326,R-HSA-5683326,ABCD4 Y319C,ReferenceGeneProduct,[Homo sapiens],ABCD4,O14678,[lysosomal membrane],[0005765],True,UniProt,http://purl.uniprot.org/uniprot/O14678,True,False,


In [70]:
# join with pathway info
stIds = ','.join(gene_info['stId'])

In [17]:
pw_result = get_query(pw_base_query + 'R-HSA-6810129' + pw_filter)

In [18]:
pw_result

[{'dbId': 5617472,
  'displayName': 'Activation of anterior HOX genes in hindbrain development during early embryogenesis',
  'stId': 'R-HSA-5617472',
  'stIdVersion': 'R-HSA-5617472.4',
  'isInDisease': False,
  'isInferred': False,
  'name': ['Activation of anterior HOX genes in hindbrain development during early embryogenesis'],
  'releaseDate': '2015-12-15',
  'speciesName': 'Homo sapiens',
  'doi': '10.3180/r-hsa-5617472.1',
  'hasDiagram': True,
  'hasEHLD': False,
  'lastUpdatedDate': '2021-09-15',
  'schemaClass': 'Pathway',
  'className': 'Pathway'}]

In [19]:
query_results = []
gene_pathways = {}

max_pws = 10
max_depth = 0
for gene in tqdm(targets, desc='Querying pathway info from reactome api', unit='gene'):
    gene_query = reactome_query + gene + base_filter_query
    qr = get_query(gene_query)
    g_entries = qr['results'][0]['entries']
    reactome_ids = [e['stId'] for e in g_entries]
    pathways = dict()
    for ri in reactome_ids:
        pw_result = get_query(pw_base_query + ri + pw_filter)
        if len(pw_result) == 0:
            continue
        pw_id = pw_result[0]['stId']
        pathways[pw_id] = pw_result[0]['displayName']
    depth = len(pathways)
    if depth == 0:
        pathways = {'pathway': gene}
    else:
        if depth > max_depth:
            max_depth = depth
    gene_pathways[gene] = pathways
    query_results.append(qr)

Querying pathway info from reactome api:   0%|          | 10/15639 [00:08<3:37:57,  1.20gene/s]


KeyError: 'results'

In [40]:
gene_pathways

{'FUCA2': {'R-HSA-6798695': 'Neutrophil degranulation',
  'R-HSA-381426': 'Regulation of Insulin-like Growth Factor (IGF) transport and uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)'},
 'NFYA': {'R-HSA-381183': 'ATF6 (ATF6-alpha) activates chaperone genes'},
 'M6PR': {'R-HSA-428157': 'Sphingolipid metabolism',
  'R-HSA-6811442': 'Intra-Golgi and retrograde Golgi-to-ER traffic',
  'R-HSA-199992': 'trans-Golgi Network Vesicle Budding',
  'R-HSA-8856828': 'Clathrin-mediated endocytosis'},
 'HCCS': {'R-HSA-611105': 'Respiratory electron transport'},
 'FAM214B': {'pathway': 'FAM214B'},
 'WDR54': {'pathway': 'WDR54'},
 'TMEM98': {'pathway': 'TMEM98'},
 'YBX2': {'pathway': 'YBX2'},
 'TMEM132A': {'R-HSA-381426': 'Regulation of Insulin-like Growth Factor (IGF) transport and uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)'},
 'GGCT': {'R-HSA-156580': 'Phase II - Conjugation of compounds'}}

In [62]:
rows = []
cs = 'pathway_' + np.arange(max_depth).astype(str)
for gene, pathways in gene_pathways.items():
    row = list(pathways.values()) + [None] * (max_depth - len(pathways))
    rows.append(row)
pw_info = pd.DataFrame(rows, index=gene_pathways.keys(), columns=cs)

In [63]:
pw_info

Unnamed: 0,pathway_0,pathway_1,pathway_2,pathway_3
FUCA2,Neutrophil degranulation,Regulation of Insulin-like Growth Factor (IGF)...,,
NFYA,ATF6 (ATF6-alpha) activates chaperone genes,,,
M6PR,Sphingolipid metabolism,Intra-Golgi and retrograde Golgi-to-ER traffic,trans-Golgi Network Vesicle Budding,Clathrin-mediated endocytosis
HCCS,Respiratory electron transport,,,
FAM214B,FAM214B,,,
WDR54,WDR54,,,
TMEM98,TMEM98,,,
YBX2,YBX2,,,
TMEM132A,Regulation of Insulin-like Growth Factor (IGF)...,,,
GGCT,Phase II - Conjugation of compounds,,,


In [10]:
# try for a single gene
gene = 'FUCA2'
gene_query = reactome_query + gene + base_filter_query
qr = get_query(gene_query)
reactome_ids = [e['stId'] for e in qr['results'][0]['entries']]

In [11]:
reactome_ids

['R-HSA-6806227', 'R-HSA-6798772', 'R-HSA-8956727', 'R-HSA-8957005']

'Neutrophil degranulation'