# Paper next results
- validate link between arrhythmia, myopathy, and the identified genes (SLCO1B1)
- validate the drugs involved in arrhythmia
    - run PoSe-Path on the drugs found in literature for arrhythmia; check for other side effects (and check in literature)
- validate the other promising heart side effects
    - explain (jointly) related side effects (having similar mechanisms) in the heart and across organs (e.g., heart muscle, other muscle
- validate muscle (myopathy) since connected with cardiovascular disease
- move on to validating liver (since SLCO1B1 is in liver), kidney, (brain)

## Explainer paper
1. Each single side effect --> Enriched GO --> validate with literature
1. Choose two side effects suspected to share mechanisms --> get common subgraph --> validate
1. Choose >2 side effects ...
1. Choose single interesting drug --> Enriched GO --> validate with literature
1. Choose interesting drug-drug interaction --> Enriched GO --> validate with literature

In [1]:
import goatools
import pandas as pd
import numpy as np
import scipy.stats as stats
import pubchempy
import pickle

from pubchempy import Compound

# Get http://geneontology.org/ontology/go-basic.obo
from goatools.base import download_go_basic_obo
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from __future__ import print_function
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

In [2]:
pd.set_option('display.max_colwidth', -1)

# Add the Side Effect  Disease Class

In [3]:
df_se = pd.read_csv('data/Se-DoDecagon_sidefx.csv')

In [8]:
df_se.shape

(561, 3)

In [11]:
df_se.head()

Unnamed: 0,Side Effect,Side Effect Name,Disease Class
0,C0017152,gastric inflammation,gastrointestinal system disease
1,C0027858,neuroma,benign neoplasm
2,C0041466,Typhoid,bacterial infectious disease
3,C0032807,Post thrombotic syndrome,cardiovascular system disease
4,C0033860,psoriasis,integumentary system disease


In [7]:
df_se['Disease Class'].unique()

array(['gastrointestinal system disease', 'benign neoplasm',
       'bacterial infectious disease', 'cardiovascular system disease',
       'integumentary system disease', 'nervous system disease',
       'cognitive disorder', 'urinary system disease',
       'hematopoietic system disease', 'endocrine system disease',
       'musculoskeletal system disease', 'psoriatic arthritis',
       'hematopoietic system diseases', 'viral infectious disease',
       'fungal infectious disease', 'respiratory system disease',
       'cancer', 'developmental disorder of mental health',
       'acquired metabolic disease', 'parasitic infectious disease',
       'thoracic disease', 'inherited metabolic disorder',
       'reproductive system disease', 'immune system disease',
       'monogenic disease', 'substance-related disorder',
       'personality disorder', 'hypospadias', 'sleep disorder',
       'sexual disorder', 'chromosomal disease',
       'polycystic ovary syndrome', 'somatoform disorder',
 

In [9]:
with open('/home/laurence/git/posepath/PoSe-Path-3.0/data/tipexp_data.pkl','rb') as f:
    mapping = pickle.load(f)

## Get disease class for SE where available

In [40]:
df_se0 = pd.DataFrame([{'Side Effect Name':k.lower(), 'idx':v} for k,v in mapping.side_effect_name_to_idx.items()])
df_se.loc[:,'Side Effect Name'] = df_se['Side Effect Name'].str.lower()
df_ses = pd.merge(df_se0, df_se, on='Side Effect Name', how='left')
df_ses.shape

(861, 4)

In [53]:
df_ses[df_ses['Disease Class'].isnull()]

Unnamed: 0,Side Effect Name,idx,Side Effect,Disease Class
0,atelectasis,0,,
1,back ache,1,,
3,agitated,3,,
4,abnormal movements,4,,
5,acidosis,5,,
7,apnea,7,,
8,drug hypersensitivity,8,,
9,flatulence,9,,
10,pain in throat,10,,
13,bradycardia,13,,


In [59]:
df_ses.dropna()[ df_ses['Disease Class'].dropna().str.contains('cancer')].sort_values('Side Effect Name')

Unnamed: 0,Side Effect Name,idx,Side Effect,Disease Class
390,acute myeloblastic leukemia,390,C0023467,cancer
61,adenocarcinoma,61,C0001418,cancer
574,basal cell carcinoma,574,C0007117,cancer
799,bladder cancer,799,C0699885,cancer
672,brain neoplasm,672,C0006118,cancer
85,breast cancer,85,C0678222,cancer
617,cancer,617,C0006826,cancer
688,carcinoma of prostate,688,C0600139,cancer
558,carcinoma of the colon,558,C0699790,cancer
821,esophageal cancer,821,C0152018,cancer


In [49]:
df_ses.groupby('Disease Class').count().sort_values('Side Effect', ascending=False)

Unnamed: 0_level_0,Side Effect Name,idx,Side Effect
Disease Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nervous system disease,46,46,46
cardiovascular system disease,45,45,45
gastrointestinal system disease,41,41,41
musculoskeletal system disease,27,27,27
integumentary system disease,21,21,21
respiratory system disease,21,21,21
cancer,20,20,20
urinary system disease,19,19,19
hematopoietic system disease,15,15,15
acquired metabolic disease,12,12,12


In [10]:
mapping.side_effect_name_to_idx['Arrhythmia']

508

In [5]:
import os

basepath = 'results/results/out/all-all-508-3.0-0.97'

In [6]:
#with open('../data/[91.0, 88.0]-[84.0, 84.0]-[28.0, 28.0]-1.0.pkl','rb') as f:
#with open('results/out/pkl/all-all-28-1.0-0.99.pkl','rb') as f:
#with open('results2/out/pkl/100,34,88-100,34,88-28-1.0-0.5.pkl','rb') as f:
#with open('results3/out/pkl/147-all-all-3.0-0.96.pkl','rb') as f:
#with open('results3/out/pkl/all-all-508-3.0-0.97.pkl','rb') as f:  # previous
#with open('results/results/out/all-all-536-3.0-0.97/exp_info.pkl','rb') as f:
with open(os.path.join(basepath, 'exp_info.pkl'),'rb') as f:
    data = pickle.load(f)
dmap = data.copy()

In [7]:
idx1 = data['pp_idx'][0]
idx2 = data['pp_idx'][1]

In [8]:
data['pp_id'] = [[int(mapping.prot_idx_to_id[idx].replace('GeneID','')) for idx in idx1],
                 [int(mapping.prot_idx_to_id[idx].replace('GeneID','')) for idx in idx2]]

### Check the namespace directly

In [9]:
df_go = pd.read_csv(os.path.join(basepath, 'name_space.csv'), index_col=0)

In [10]:
df_go.namespace.unique()

array(['biological_process', 'cellular_component', 'molecular_function'],
      dtype=object)

In [11]:
df_go[ df_go.namespace.str.contains('molecular')].sort_values('p_fdr_bh')

Unnamed: 0,name,namespace,p_fdr_bh
5,bile acid transmembrane transporter activity,molecular_function,0.00628
6,sodium-independent organic anion transmembrane transporter activity,molecular_function,0.006849
7,ATPase-coupled transmembrane transporter activity,molecular_function,0.037601


### Example 2: Clomipramine --> side effect --> PPI
- Drug 1: 147 (Clomipramine)
- Drug 2: 24 (all-->one)
- Side effect: 376 (aortic regurgitation)

Analyze after running below

set([mapping.side_effect_idx_to_name[k] for k in data['sd_idx']])

d2 = [mapping.drug_idx_to_id[k] for k in data['durg2']]

cids = [int(d.replace('CID','')) for d in d2]
drugs = [Compound.from_cid(int(cid)) for cid in set(cids)]

drugs

drug_ids = [(d.cid, d.synonyms[0], d.iupac_name) for d in drugs]
drug_ids

Find the side effects to run PoSe-Path

In [12]:
hearts = ['arrhythmia','cardiac','heart','ventric','atri','aort']

In [13]:
ses = [(k,v) for k,v in mapping.side_effect_name_to_idx.items() 
 if any([x.lower() in k.lower() for x in hearts])]
ses

[('heart attack', 186),
 ('arteriosclerotic heart disease', 202),
 ('Extrasystoles Ventricular', 239),
 ('Atrioventricular block first degree', 259),
 ('Cardiac decompensation', 287),
 ('heart rate increased', 327),
 ('aortic regurgitation', 358),
 ('cardiac murmur', 398),
 ('cardiac enlargement', 399),
 ('cardiac disease', 448),
 ('atrial septal defect', 453),
 ('atrial ectopic beats', 486),
 ('Arrhythmia', 508),
 ('Cardiac ischemia', 516),
 ('cardiac failure', 527),
 ('atrioventricular block second degree', 528),
 ('tachycardia ventricular', 535),
 ('atrial flutter', 538),
 ('cardiac valvulopathy', 560),
 ('Atrioventricular block complete', 562),
 ('ventricular fibrillation', 563),
 ('Cardiac tamponade', 580),
 ('Supraventricular tachycardia', 594),
 ('right heart failure', 615),
 ('aortic stenosis', 642),
 ('atrioventricular block', 656),
 ('congenital heart disease', 686),
 ('aortic aneurysm', 740),
 ('left ventricular hypertrophy', 773),
 ('Bradyarrhythmia', 791)]

In [14]:
len(mapping.side_effect_idx_to_name)
name_to_idx = {v:k for k, v in mapping.side_effect_idx_to_name.items()}

In [15]:
','.join([str(x[1]) for x in ses])

'186,202,239,259,287,327,358,398,399,448,453,486,508,516,527,528,535,538,560,562,563,580,594,615,642,656,686,740,773,791'

In [16]:
renal = ['kidne','renal']
ses = [(k,v) for k,v in mapping.side_effect_name_to_idx.items() 
 if any([x.lower() in k.lower() for x in renal])]
ses

[('Chronic Kidney Disease', 143),
 ('acute kidney failure', 144),
 ('kidney failure', 256),
 ('renal cyst', 277),
 ('adrenal insufficiency', 301),
 ('disorder Renal', 333),
 ('renal tubular acidosis', 501),
 ('renal mass', 690),
 ('hepatorenal syndrome', 711),
 ('kidney pain', 730),
 ('kidney transplant', 811)]

In [17]:
muscle = ['muscl','myopath']
ses = [(k,v) for k,v in mapping.side_effect_name_to_idx.items() 
 if any([x.lower() in k.lower() for x in muscle])]
ses

[('muscle spasm', 126),
 ('muscle paresis', 137),
 ('aching muscles', 175),
 ('muscle weakness', 190),
 ('Cardiomyopathy', 283),
 ('muscle strain', 416),
 ('muscle inflammation', 488),
 ('muscle disorder', 533)]

### Muscle / Heart / transporter genes
https://ascpt.onlinelibrary.wiley.com/doi/abs/10.1038/clpt.2013.234


https://www.ncbi.nlm.nih.gov/pubmed/29463526
- potentially strong assocation between arrythmia and SLCO1B1 due to increased risk of prolonged QTc interval

### GO enrichment

In [18]:
geneid2symbol = {v.GeneID: v.Symbol for k,v in GeneID2nt_hum.items()}

In [19]:

obo_fname = download_go_basic_obo()


fin_gene2go = download_ncbi_associations()

obodag = GODag("go-basic.obo")

# Read NCBI's gene2go. Store annotations in a list of namedtuples
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

# Get namespace2association where:
#    namespace is:
#        BP: biological_process               
#        MF: molecular_function
#        CC: cellular_component
#    assocation is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))


goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(), # List of human protein-acoding genes
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # defult multipletest correction method

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-01-01) 47,337 GO Terms
HMS:0:00:02.921323 323,107 annotations READ: gene2go 
1 taxids stored: 9606
BP 17,541 annotated human genes
MF 17,384 annotated human genes
CC 18,648 annotated human genes

Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 80% 16,711 of 20,913 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 85% 17,755 of 20,913 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 80% 16,699 of 20,913 population items found in association


In [20]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
# geneids_study = data['pp_id'][0] + data['pp_id'][1] # geneid2symbol.keys()
geneids_study = set(data['pp_id'][0] + data['pp_id'][1])
goea_results_all = goeaobj.run_study(geneids_study)


Run BP Gene Ontology Analysis: current study set of 4 IDs ...
100%      4 of      4 study items found in association
100%      4 of      4 study items found in population(20913)
Calculating 12,189 uncorrected p-values using fisher_scipy_stats
  12,189 GO terms are associated with 16,711 of 20,913 population items
      18 GO terms are associated with      4 of      4 study items
  METHOD fdr_bh:
       4 GO terms found significant (< 0.05=alpha) (  4 enriched +   0 purified): statsmodels fdr_bh
       4 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 4 IDs ...
100%      4 of      4 study items found in association
100%      4 of      4 study items found in population(20913)
Calculating 1,731 uncorrected p-values using fisher_scipy_stats
   1,731 GO terms are associated with 17,755 of 20,913 population items
      12 GO terms are associated with      4 of   

In [21]:
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

In [22]:
len(goea_results_sig)

8

In [23]:
g0 = goea_results_sig[0]
g0.study_items

{8647, 10599, 28234}

In [24]:
df_go.head()

Unnamed: 0,name,namespace,p_fdr_bh
0,bile acid and bile salt transport,biological_process,0.000105
1,transmembrane transport,biological_process,0.0002
2,organic anion transport,biological_process,0.010025
3,sodium-independent organic anion transport,biological_process,0.010025
4,basolateral plasma membrane,cellular_component,0.005575


In [25]:
g0 = goea_results_sig[0]
g0.goterm

GOTerm('GO:0015721'):
  id:GO:0015721
  item_id:GO:0015721
  name:bile acid and bile salt transport
  namespace:biological_process
  _parents: 3 items
    GO:0015850
    GO:0006869
    GO:0015718
  parents: 3 items
    GO:0006869	level-05	depth-05	lipid transport [biological_process]
    GO:0015850	level-05	depth-05	organic hydroxy compound transport [biological_process]
    GO:0015718	level-07	depth-08	monocarboxylic acid transport [biological_process]
  children: 1 items
    GO:0015722	level-06	depth-10	canalicular bile acid transport [biological_process]
  level:6
  depth:9
  is_obsolete:False
  alt_ids: 0 items

In [26]:
g0.goterm.__dict__['namespace']

'biological_process'

In [27]:
import pandas as pd

In [28]:
g0.__dict__['p_fdr_bh']

0.00010470005154705395

In [29]:
g0 = goea_results_sig[0]
g0.goterm.id

'GO:0015721'

In [30]:
keys = ['name','namespace','id']
df_go1 = pd.DataFrame([
        {k:g.goterm.__dict__.get(k) for k in keys}
        for g in goea_results_sig])
df_p = pd.DataFrame([{'p_fdr_bh':g.__dict__['p_fdr_bh']} for g in goea_results_sig])
df_go = df_go1.merge(df_p, left_index=True, right_index=True)
go_genes = pd.DataFrame([{'id':g.goterm.id, 'gene':s,'symbol':geneid2symbol[s]} for g in goea_results_sig for s in g.study_items])
df_go = df_go.merge(go_genes, on='id')

In [31]:
df_go.groupby('namespace').count()

Unnamed: 0_level_0,name,id,p_fdr_bh,gene,symbol
namespace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
biological_process,11,11,11,11,11
cellular_component,3,3,3,3,3
molecular_function,6,6,6,6,6


In [32]:
import cobra
from cobra.io import load_json_model

In [33]:
model = load_json_model('/home/laurence/models/BiGG_M/json/Recon3D.json')

In [34]:
model.optimize()

Unnamed: 0,fluxes,reduced_costs
10FTHF5GLUtl,0.000000,0.0
10FTHF5GLUtm,0.000000,0.0
10FTHF6GLUtl,0.000000,0.0
10FTHF6GLUtm,0.000000,0.0
10FTHF7GLUtl,0.000000,0.0
...,...,...
CYOR_u10mi,433.920904,0.0
Htmi,0.000000,0.0
NADH2_u10mi,0.000000,0.0
CYOOm3i,0.000000,0.0


In [35]:
genes_sig = df_go.gene.unique()
genes_sig

array([28234, 10599,  8647, 10257])

In [36]:
M_gene_ids = [int(g.id.split('_')[0].strip()) for g in model.genes]

In [37]:
genes_sig_M = [g for g in genes_sig if g in M_gene_ids]
len(genes_sig_M)

4

In [38]:
df_g_rxn = pd.DataFrame([{'gene':int(g.id.split('_')[0].strip()),'rxn':r.reaction,
                          'rxn_id':r.id, 'rxn_name':r.name, 'subsystem':r.subsystem}
                         for g in model.genes for r in g.reactions])

In [39]:
df_g_rxn

Unnamed: 0,gene,rxn,rxn_id,rxn_name,subsystem
0,0,hdca24g_c <=> hdca24g_r,HDCA24Gtr,Glucuronidated Compound Transport (HDCA-24G),"Transport, extracellular"
1,8639,h2o_c + mma_c + o2_c --> fald_c + h2o2_c + nh4_c,MAOX,Methylamine:oxygen oxidoreductase (deaminating) (copper-containing),Tyrosine metabolism
2,8639,aact_c + h2o_c + o2_c --> h2o2_c + mthgxl_c + nh4_c,AACTOOR,Aminoacetone:oxygen oxidoreductase(deaminating)(flavin-containing),"Glycine, serine, alanine, and threonine metabolism"
3,8639,h2o_c + o2_c + ptrc_c --> 4abutn_c + h2o2_c + nh4_c,PTRCOX1,Putrescine:oxygen oxidoreductase (deaminating),Arginine and proline metabolism
4,8639,dopa_c + h2o_c + o2_c --> 34dhpac_c + h2o2_c + nh4_c,42A12BOOX,"4-(2-Aminoethyl)-1,2-benzenediol:oxygen oxidoreductase(deaminating)(flavin-containing)",Tyrosine metabolism
5,8639,h2o_c + mhista_c + o2_c --> 3mldz_c + h2o2_c + nh4_c,MHISOR,N-Methylhistamine:oxygen oxidoreductase (deaminating),Histidine metabolism
6,8639,13dampp_c + h2o_c + o2_c --> bamppald_c + h2o2_c + nh4_c,13DAMPPOX,"1,3-Diaminopropane:oxygen oxidoreductase (deaminating)",Beta-Alanine metabolism
7,314,dopa_c + h2o_c + o2_c --> 34dhpac_c + h2o2_c + nh4_c,42A12BOOX,"4-(2-Aminoethyl)-1,2-benzenediol:oxygen oxidoreductase(deaminating)(flavin-containing)",Tyrosine metabolism
8,314,h2o_c + o2_c + peamn_c --> h2o2_c + nh4_c + pacald_c,PEAMNO,Phenethylamine oxidase,Phenylalanine metabolism
9,314,h2o_c + mma_c + o2_c --> fald_c + h2o2_c + nh4_c,MAOX,Methylamine:oxygen oxidoreductase (deaminating) (copper-containing),Tyrosine metabolism


In [40]:
sol = model.optimize()

In [43]:
df_go.loc[:,'in_M'] = df_go.gene.isin(genes_sig_M)

In [44]:
df_go_rxn = pd.merge(df_go, df_g_rxn, on='gene', how='left')

In [45]:
df_go_mol = df_go_rxn[df_go_rxn.namespace.str.contains('molecular')].sort_values(['p_fdr_bh','namespace','name'])
cols = ['name','p_fdr_bh','symbol','rxn','rxn_id','rxn_name','subsystem']
df_go_mol[df_go_mol.in_M][cols].head()

Unnamed: 0,name,p_fdr_bh,symbol,rxn,rxn_id,rxn_name,subsystem
1101,bile acid transmembrane transporter activity,0.00628,SLCO1B3,hco3_c + lca3s_e <=> hco3_e + lca3s_c,LCA3St,Lithocholic acid 3-sulfate transport via bicarbonate countertransport,"Transport, extracellular"
1102,bile acid transmembrane transporter activity,0.00628,SLCO1B3,dca3s_e + hco3_c <=> dca3s_c + hco3_e,DCA3St,Deoxycholic acid 3-sulfate transport via bicarbonate countertransport,"Transport, extracellular"
1103,bile acid transmembrane transporter activity,0.00628,SLCO1B3,7dhcdchol_e + hco3_c <=> 7dhcdchol_c + hco3_e,7DHCDCHOLt,7-Dehydrochenodeoxycholic acid transport via bicarbonate countertransport,"Transport, extracellular"
1104,bile acid transmembrane transporter activity,0.00628,SLCO1B3,3dhchol_e + hco3_c <=> 3dhchol_c + hco3_e,3DHCHOLt,3-Dehydrocholic acid transport via bicarbonate countertransport,"Transport, extracellular"
1105,bile acid transmembrane transporter activity,0.00628,SLCO1B3,ca3s_e + hco3_c <=> ca3s_c + hco3_e,CA3St,Cholic acid 3-sulfate transport via bicarbonate countertransport,"Transport, extracellular"


In [46]:
cols2 = ['p_fdr_bh','subsystem']
df_go_mol[df_go_mol.in_M][cols2].drop_duplicates().groupby('subsystem').median().sort_values('p_fdr_bh')

Unnamed: 0_level_0,p_fdr_bh
subsystem,Unnamed: 1_level_1
Drug metabolism,0.006849
"Transport, extracellular",0.006849
Bile acid synthesis,0.037601


# Literature validation

-  "alterations in FA metabolism could play a pivotal role in production of arrhythmias":
https://www.fasebj.org/doi/abs/10.1096/fasebj.31.1_supplement.782.14


In [50]:
df_go_mol.subsystem.unique()

array(['Transport, extracellular', 'Drug metabolism',
       'Bile acid synthesis'], dtype=object)

In [51]:
df_go[df_go.name.str.contains('channel')]

Unnamed: 0,name,namespace,id,p_fdr_bh,gene,symbol,in_M


---

## Last step: show using simulation the mechanism by which FA oxidation (and others) cause the Arrhythmia side effect

Steps:
1. make heart-specific model
1. simulate healthy heart metabolism (linked to electrical conduction)
1. simulate arrhythmia
    1. by perturbing the PoSe-Path genes. Hypothesis: perturbing these genes causes arrhythmia, or that arrhythmia involves change in the reactions encoded by these genes
    1. by using known arrhythmia simulation parameters, and check consistency of known arrhythmia mechanisms with the genes found by PoSe-Path [X: DON'T already know arrhythmia parameters]
1. Go with A.
    - Validate using another metabolic marker of arrhythmia--show that it changes in the correct direction
    - Validate using omics (RNA-Seq, proteomics, metabolomics) data from arrhythmia patients


In [None]:
sol

-----

geneid2info = {v.GeneID: (v.Symbol,v.description) for k,v in GeneID2nt_hum.items()}

[geneid2info[g] for g in geneids_study if g in geneid2info]

gg = goea_results_sig[0]
gg.GO

https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb

from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj


# Plot subset starting from these significant GO terms
goid_subset = [g.GO for g in goea_results_sig] # [
#     'GO:0003723', # MF D04 RNA binding (32 genes)
#     'GO:0044822', # MF D05 poly(A) RNA binding (86 genes)
#     'GO:0003729', # MF D06 mRNA binding (11 genes)
#     'GO:0019843', # MF D05 rRNA binding (6 genes)
#     'GO:0003746', # MF D06 translation elongation factor activity (5 genes)
# ]
plot_gos("results3/out/go_enrich.png", 
    goid_subset, # Source GO ids
    obodag, 
    goea_results=goea_results_all) # Use pvals for coloring
# plot_results("nbt3102_BP.png", goea_results_sig)

This plot contains GOEA results:

- GO terms colored by P-value:
    - pval < 0.005 (light red)
    - pval < 0.01 (light orange)
    - pval < 0.05 (yellow)
    - pval > 0.05 (grey) Study terms that are not statistically significant
- GO terms with study gene counts printed. e.g., "32 genes"

plot_gos("results3/out/go_enrich_symbols.pdf", 
    goid_subset, # Source GO ids
    obodag,
    goea_results=goea_results_all, # use pvals for coloring
    # We can further configure the plot...
    id2symbol=geneid2symbol, # Print study gene Symbols, not Entrez GeneIDs
    study_items=6, # Only only 6 gene Symbols max on GO terms
    items_p_line=3, # Print 3 genes per line
    )