# Paper next results
- validate link between arrhythmia, myopathy, and the identified genes (SLCO1B1)
- validate the drugs involved in arrhythmia
    - run PoSe-Path on the drugs found in literature for arrhythmia; check for other side effects (and check in literature)
- validate the other promising heart side effects
    - explain (jointly) related side effects (having similar mechanisms) in the heart and across organs (e.g., heart muscle, other muscle
- validate muscle (myopathy) since connected with cardiovascular disease
- move on to validating liver (since SLCO1B1 is in liver), kidney, (brain)

## Explainer paper
1. Each single side effect --> Enriched GO --> validate with literature
1. Choose two side effects suspected to share mechanisms --> get common subgraph --> validate
1. Choose >2 side effects ...
1. Choose single interesting drug --> Enriched GO --> validate with literature
1. Choose interesting drug-drug interaction --> Enriched GO --> validate with literature

In [1]:
import goatools
import pandas as pd
import numpy as np
import scipy.stats as stats
import pubchempy
import pickle

from pubchempy import Compound

# Get http://geneontology.org/ontology/go-basic.obo
from goatools.base import download_go_basic_obo
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from __future__ import print_function
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

In [10]:
pd.set_option('display.max_colwidth', -1)

In [159]:
with open('/home/laurence/git/posepath/PoSe-Path-3.0/data/tipexp_data.pkl','rb') as f:
    mapping = pickle.load(f)

In [160]:
mapping.side_effect_name_to_idx['Arrhythmia']

536

In [161]:
mapping.side_effect_idx_to_name

{2: 'atelectasis',
 4: 'Back Ache',
 5: 'lung edema',
 6: 'agitated',
 7: 'abnormal movements',
 8: 'Acidosis',
 9: 'peliosis',
 11: 'Apnea',
 12: 'Drug hypersensitivity',
 13: 'flatulence',
 14: 'pain in throat',
 15: 'allergies',
 16: 'thrombocytopenia',
 17: 'bradycardia',
 18: 'lung infiltration',
 19: 'Bleeding',
 20: 'hypoglycaemia neonatal',
 22: 'hyperglycaemia',
 23: 'peritonitis',
 24: 'hypoglycaemia',
 25: 'abdominal distension',
 26: 'asystole',
 27: 'cerebral infarct',
 28: 'hypoxia',
 29: 'Difficulty breathing',
 30: 'decreased body temperature',
 31: 'adynamic ileus',
 32: 'sepsis',
 33: 'Hypomagnesaemia',
 35: 'Acute Respiratory Distress Syndrome',
 36: 'lipoma',
 38: 'appendectomy',
 39: 'flank pain',
 40: 'pneumothorax',
 41: 'appendicitis',
 42: 'Strabismus',
 43: 'Blood calcium decreased',
 44: 'respiratory failure',
 45: 'pulmonary hypertension',
 46: 'Hypercapnia',
 47: 'intestinal perforation',
 48: 'leucocytosis',
 49: 'Disorder Lung',
 50: 'hot flash',
 51: 'ar

drugs = ['CID3883','CID4594','CID4679']

In [4]:
import os

basepath = 'results/results/out/all-all-536-3.0-0.97'

In [5]:
#with open('../data/[91.0, 88.0]-[84.0, 84.0]-[28.0, 28.0]-1.0.pkl','rb') as f:
#with open('results/out/pkl/all-all-28-1.0-0.99.pkl','rb') as f:
#with open('results2/out/pkl/100,34,88-100,34,88-28-1.0-0.5.pkl','rb') as f:
#with open('results3/out/pkl/147-all-all-3.0-0.96.pkl','rb') as f:
#with open('results3/out/pkl/all-all-508-3.0-0.97.pkl','rb') as f:  # previous
#with open('results/results/out/all-all-536-3.0-0.97/exp_info.pkl','rb') as f:
with open(os.path.join(basepath, 'exp_info.pkl'),'rb') as f:
    data = pickle.load(f)
dmap = data.copy()

In [6]:
idx1 = data['pp_idx'][0]
idx2 = data['pp_idx'][1]

In [7]:
data['pp_id'] = [[int(mapping.prot_idx_to_id[idx].replace('GeneID','')) for idx in idx1],
                 [int(mapping.prot_idx_to_id[idx].replace('GeneID','')) for idx in idx2]]

### Check the namespace directly

In [8]:
df_go = pd.read_csv(os.path.join(basepath, 'name_space.csv'), index_col=0)

In [9]:
df_go.namespace.unique()

array(['biological_process', 'cellular_component', 'molecular_function'],
      dtype=object)

In [11]:
df_go[ df_go.namespace.str.contains('molecular')].sort_values('p_fdr_bh')

Unnamed: 0,name,namespace,p_fdr_bh
32,"oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, reduced flavin or flavoprotein as one donor, and incorporation of one atom of oxygen",molecular_function,0.007001
33,chloride channel activity,molecular_function,0.007055
34,steroid hydroxylase activity,molecular_function,0.007055
35,inhibitory extracellular ligand-gated ion channel activity,molecular_function,0.010183
36,benzodiazepine receptor activity,molecular_function,0.010183
37,monooxygenase activity,molecular_function,0.010183
38,dimethylallyltranstransferase activity,molecular_function,0.010183
39,geranyltranstransferase activity,molecular_function,0.010183
40,GABA-gated chloride ion channel activity,molecular_function,0.010908
41,heme binding,molecular_function,0.010941


### Example 2: Clomipramine --> side effect --> PPI
- Drug 1: 147 (Clomipramine)
- Drug 2: 24 (all-->one)
- Side effect: 376 (aortic regurgitation)

Analyze after running below

set([mapping.side_effect_idx_to_name[k] for k in data['sd_idx']])

In [12]:
data.keys()

dict_keys(['pp_idx', 'pp_weight', 'pd_idx', 'pd_weight', 'drug1', 'durg2', 'side_effect', 'probability', 'pp_id'])

In [13]:
d2 = [mapping.drug_idx_to_id[k] for k in data['durg2']]

cids = [int(d.replace('CID','')) for d in d2]
drugs = [Compound.from_cid(int(cid)) for cid in set(cids)]

drugs

drug_ids = [(d.cid, d.synonyms[0], d.iupac_name) for d in drugs]
drug_ids

Find the side effects to run PoSe-Path

In [13]:
hearts = ['arrhythmia','cardiac','heart','ventric','atri','aort']

In [14]:
ses = [(k,v) for k,v in mapping.side_effect_name_to_idx.items() 
 if any([x.lower() in k.lower() for x in hearts])]
ses

[('heart attack', 186),
 ('arteriosclerotic heart disease', 202),
 ('Extrasystoles Ventricular', 239),
 ('Atrioventricular block first degree', 259),
 ('Cardiac decompensation', 287),
 ('heart rate increased', 327),
 ('aortic regurgitation', 358),
 ('cardiac murmur', 398),
 ('cardiac enlargement', 399),
 ('cardiac disease', 448),
 ('atrial septal defect', 453),
 ('atrial ectopic beats', 486),
 ('Arrhythmia', 508),
 ('Cardiac ischemia', 516),
 ('cardiac failure', 527),
 ('atrioventricular block second degree', 528),
 ('tachycardia ventricular', 535),
 ('atrial flutter', 538),
 ('cardiac valvulopathy', 560),
 ('Atrioventricular block complete', 562),
 ('ventricular fibrillation', 563),
 ('Cardiac tamponade', 580),
 ('Supraventricular tachycardia', 594),
 ('right heart failure', 615),
 ('aortic stenosis', 642),
 ('atrioventricular block', 656),
 ('congenital heart disease', 686),
 ('aortic aneurysm', 740),
 ('left ventricular hypertrophy', 773),
 ('Bradyarrhythmia', 791)]

In [15]:
len(mapping.side_effect_idx_to_name)
name_to_idx = {v:k for k, v in mapping.side_effect_idx_to_name.items()}

In [16]:
','.join([str(x[1]) for x in ses])

'186,202,239,259,287,327,358,398,399,448,453,486,508,516,527,528,535,538,560,562,563,580,594,615,642,656,686,740,773,791'

In [17]:
renal = ['kidne','renal']
ses = [(k,v) for k,v in mapping.side_effect_name_to_idx.items() 
 if any([x.lower() in k.lower() for x in renal])]
ses

[('Chronic Kidney Disease', 143),
 ('acute kidney failure', 144),
 ('kidney failure', 256),
 ('renal cyst', 277),
 ('adrenal insufficiency', 301),
 ('disorder Renal', 333),
 ('renal tubular acidosis', 501),
 ('renal mass', 690),
 ('hepatorenal syndrome', 711),
 ('kidney pain', 730),
 ('kidney transplant', 811)]

In [57]:
muscle = ['muscl','myopath']
ses = [(k,v) for k,v in mapping.side_effect_name_to_idx.items() 
 if any([x.lower() in k.lower() for x in muscle])]
ses

[('muscle spasm', 126),
 ('muscle paresis', 137),
 ('aching muscles', 175),
 ('muscle weakness', 190),
 ('Cardiomyopathy', 283),
 ('muscle strain', 416),
 ('muscle inflammation', 488),
 ('muscle disorder', 533)]

### Muscle / Heart / transporter genes
https://ascpt.onlinelibrary.wiley.com/doi/abs/10.1038/clpt.2013.234


https://www.ncbi.nlm.nih.gov/pubmed/29463526
- potentially strong assocation between arrythmia and SLCO1B1 due to increased risk of prolonged QTc interval

### GO enrichment

In [27]:
geneid2symbol = {v.GeneID: v.Symbol for k,v in GeneID2nt_hum.items()}

In [12]:

obo_fname = download_go_basic_obo()


fin_gene2go = download_ncbi_associations()

obodag = GODag("go-basic.obo")

# Read NCBI's gene2go. Store annotations in a list of namedtuples
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

# Get namespace2association where:
#    namespace is:
#        BP: biological_process               
#        MF: molecular_function
#        CC: cellular_component
#    assocation is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))


goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(), # List of human protein-acoding genes
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # defult multipletest correction method

requests.get(http://purl.obolibrary.org/obo/go/go-basic.obo, stream=True)
  WROTE: go-basic.obo

FTP RETR ftp.ncbi.nlm.nih.gov gene/DATA gene2go.gz -> gene2go.gz
  gunzip gene2go.gz
go-basic.obo: fmt(1.2) rel(2020-01-01) 47,337 GO Terms
HMS:0:00:04.254678 323,107 annotations READ: gene2go 
1 taxids stored: 9606
MF 17,384 annotated human genes
CC 18,648 annotated human genes
BP 17,541 annotated human genes

Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 80% 16,711 of 20,913 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 85% 17,755 of 20,913 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 80% 16,699 of 20,913 population items found in association


In [15]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
# geneids_study = data['pp_id'][0] + data['pp_id'][1] # geneid2symbol.keys()
geneids_study = set(data['pp_id'][0] + data['pp_id'][1])
goea_results_all = goeaobj.run_study(geneids_study)


Run BP Gene Ontology Analysis: current study set of 94 IDs ...
 94%     87 of     93 study items found in association
 99%     93 of     94 study items found in population(20913)
Calculating 12,189 uncorrected p-values using fisher_scipy_stats
  12,189 GO terms are associated with 16,711 of 20,913 population items
     693 GO terms are associated with     87 of     94 study items
  METHOD fdr_bh:
      25 GO terms found significant (< 0.05=alpha) ( 25 enriched +   0 purified): statsmodels fdr_bh
      51 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 94 IDs ...
 98%     91 of     93 study items found in association
 99%     93 of     94 study items found in population(20913)
Calculating 1,731 uncorrected p-values using fisher_scipy_stats
   1,731 GO terms are associated with 17,755 of 20,913 population items
     158 GO terms are associated with     91 of 

In [16]:
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

In [17]:
len(goea_results_sig)

45

In [32]:
g0 = goea_results_sig[0]
g0.study_items

{2222, 2224, 3156, 4047, 4598, 9453, 51478}

In [36]:
df_go.head()

Unnamed: 0,name,namespace,p_fdr_bh
0,cholesterol biosynthetic process,biological_process,5e-06
1,steroid metabolic process,biological_process,5e-06
2,regulation of cholesterol biosynthetic process,biological_process,2.4e-05
3,xenobiotic metabolic process,biological_process,2.4e-05
4,exogenous drug catabolic process,biological_process,9.4e-05


In [37]:
g0 = goea_results_sig[0]
g0.goterm

GOTerm('GO:0006695'):
  id:GO:0006695
  item_id:GO:0006695
  name:cholesterol biosynthetic process
  namespace:biological_process
  _parents: 3 items
    GO:1902653
    GO:0016126
    GO:0008203
  parents: 3 items
    GO:0008203	level-05	depth-06	cholesterol metabolic process [biological_process]
    GO:1902653	level-05	depth-06	secondary alcohol biosynthetic process [biological_process]
    GO:0016126	level-05	depth-06	sterol biosynthetic process [biological_process]
  children: 3 items
    GO:0033490	level-07	depth-08	cholesterol biosynthetic process via lathosterol [biological_process]
    GO:0033488	level-07	depth-08	cholesterol biosynthetic process via 24,25-dihydrolanosterol [biological_process]
    GO:0033489	level-07	depth-08	cholesterol biosynthetic process via desmosterol [biological_process]
  level:6
  depth:7
  is_obsolete:False
  alt_ids: 0 items

In [38]:
g0.goterm.__dict__['namespace']

'biological_process'

In [39]:
import pandas as pd

In [40]:
g0.__dict__['p_fdr_bh']

4.712216364885412e-06

In [62]:
g0 = goea_results_sig[0]
g0.goterm.id

'GO:0006695'

In [68]:
keys = ['name','namespace','id']
df_go1 = pd.DataFrame([
        {k:g.goterm.__dict__.get(k) for k in keys}
        for g in goea_results_sig])
df_p = pd.DataFrame([{'p_fdr_bh':g.__dict__['p_fdr_bh']} for g in goea_results_sig])
df_go = df_go1.merge(df_p, left_index=True, right_index=True)
go_genes = pd.DataFrame([{'id':g.goterm.id, 'gene':s,'symbol':geneid2symbol[s]} for g in goea_results_sig for s in g.study_items])
df_go = df_go.merge(go_genes, on='id')

In [69]:
df_go.groupby('namespace').count()

Unnamed: 0_level_0,name,id,p_fdr_bh,gene,symbol
namespace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
biological_process,126,126,126,126,126
cellular_component,102,102,102,102,102
molecular_function,55,55,55,55,55


In [73]:
import cobra
from cobra.io import load_json_model

In [74]:
model = load_json_model('/home/laurence/models/BiGG_M/json/Recon3D.json')

In [76]:
model.optimize()

Unnamed: 0,fluxes,reduced_costs
10FTHF5GLUtl,0.000000,0.0
10FTHF5GLUtm,0.000000,0.0
10FTHF6GLUtl,0.000000,0.0
10FTHF6GLUtm,0.000000,0.0
10FTHF7GLUtl,0.000000,0.0
...,...,...
CYOR_u10mi,433.920904,0.0
Htmi,0.000000,0.0
NADH2_u10mi,0.000000,0.0
CYOOm3i,0.000000,0.0


In [81]:
genes_sig = df_go.gene.unique()
genes_sig

array([  9453,   2222,   4047,   2224,   3156,  51478,   4598,   8608,
        54658,   1555,   1557,   1558,   8856,   1565,   2941,  54578,
         3326,   9376,   6529,   6532,  10599,   8647,  28234,  10257,
        11160,   6554,   5243,   8399,   1071,   9970, 116150,    857,
         1536,  55107,    483,   5349,   2555,   2556,   2558,  57107,
         9992,   3757,   3269,   3786,   3274,   6623,   7363, 151056,
         2515,   7173,   3290,   6534,  10900,  93589,    924,   5578,
       134864,  83795,   5590,   1636,    356,   1006,  11120, 440435,
         3958,  10993,    818])

In [84]:
M_gene_ids = [int(g.id.split('_')[0].strip()) for g in model.genes]

In [87]:
genes_sig_M = [g for g in genes_sig if g in M_gene_ids]
len(genes_sig_M)

31

In [118]:
df_g_rxn = pd.DataFrame([{'gene':int(g.id.split('_')[0].strip()),'rxn':r.reaction,
                          'rxn_id':r.id, 'rxn_name':r.name, 'subsystem':r.subsystem}
                         for g in model.genes for r in g.reactions])

In [119]:
df_g_rxn

Unnamed: 0,gene,rxn,rxn_id,rxn_name,subsystem
0,0,hdca24g_c <=> hdca24g_r,HDCA24Gtr,Glucuronidated Compound Transport (HDCA-24G),"Transport, extracellular"
1,8639,h2o_c + o2_c + ptrc_c --> 4abutn_c + h2o2_c + nh4_c,PTRCOX1,Putrescine:oxygen oxidoreductase (deaminating),Arginine and proline metabolism
2,8639,dopa_c + h2o_c + o2_c --> 34dhpac_c + h2o2_c + nh4_c,42A12BOOX,"4-(2-Aminoethyl)-1,2-benzenediol:oxygen oxidoreductase(deaminating)(flavin-containing)",Tyrosine metabolism
3,8639,h2o_c + mhista_c + o2_c --> 3mldz_c + h2o2_c + nh4_c,MHISOR,N-Methylhistamine:oxygen oxidoreductase (deaminating),Histidine metabolism
4,8639,13dampp_c + h2o_c + o2_c --> bamppald_c + h2o2_c + nh4_c,13DAMPPOX,"1,3-Diaminopropane:oxygen oxidoreductase (deaminating)",Beta-Alanine metabolism
5,8639,h2o_c + mma_c + o2_c --> fald_c + h2o2_c + nh4_c,MAOX,Methylamine:oxygen oxidoreductase (deaminating) (copper-containing),Tyrosine metabolism
6,8639,aact_c + h2o_c + o2_c --> h2o2_c + mthgxl_c + nh4_c,AACTOOR,Aminoacetone:oxygen oxidoreductase(deaminating)(flavin-containing),"Glycine, serine, alanine, and threonine metabolism"
7,314,h2o_c + o2_c + peamn_c --> h2o2_c + nh4_c + pacald_c,PEAMNO,Phenethylamine oxidase,Phenylalanine metabolism
8,314,dopa_c + h2o_c + o2_c --> 34dhpac_c + h2o2_c + nh4_c,42A12BOOX,"4-(2-Aminoethyl)-1,2-benzenediol:oxygen oxidoreductase(deaminating)(flavin-containing)",Tyrosine metabolism
9,314,h2o_c + mma_c + o2_c --> fald_c + h2o2_c + nh4_c,MAOX,Methylamine:oxygen oxidoreductase (deaminating) (copper-containing),Tyrosine metabolism


In [120]:
sol = model.optimize()

In [121]:
sol.fluxes['35DSMVhep']

0.0

In [122]:
sol.fluxes[set(df_go_mol.rxn_id.unique())]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


NaN          NaN         
RDH1          0.000000   
HMGCOARx      50.724638  
P4502C94      0.000000   
RE2235R       0.000000   
                ...      
RE2147C       1000.000000
SRTNt6_2_r   -500.000000 
6CSMVhep      0.000000   
HMR_0942      0.000000   
UGT1A2r       0.000000   
Name: fluxes, Length: 80, dtype: float64

In [123]:
df_go.loc[:,'in_M'] = df_go.gene.isin(genes_sig_M)

In [124]:
df_go_rxn = pd.merge(df_go, df_g_rxn, on='gene', how='left')

In [125]:
df_go_mol = df_go_rxn[df_go_rxn.namespace.str.contains('molecular')].sort_values(['p_fdr_bh','namespace','name'])
cols = ['name','p_fdr_bh','symbol','rxn','rxn_id','rxn_name','subsystem']
df_go_mol[df_go_mol.in_M][cols].head()

Unnamed: 0,name,p_fdr_bh,symbol,rxn,rxn_id,rxn_name,subsystem
2772,"oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, reduced flavin or flavoprotein as one donor, and incorporation of one atom of oxygen",0.007001,CYP2D6,h_r + nadph_r + o2_r + smv_r --> 35dsmv_r + h2o_r + nadp_r,35DSMVhep,"Formation of 3,5-dihydrodoil simvastatin",Drug metabolism
2773,"oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, reduced flavin or flavoprotein as one donor, and incorporation of one atom of oxygen",0.007001,CYP2D6,6hsmv_r --> 3hsmv_r,3HSMVhep,Acid catalyzed rearrangement of 6-beta-hydroxy-simvastatin to 3-hydroxy simvastatin,Drug metabolism
2774,"oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, reduced flavin or flavoprotein as one donor, and incorporation of one atom of oxygen",0.007001,CYP2D6,h_r + nadph_r + o2_r + smv_r --> 6msmv_r + 2.0 h2o_r + nadp_r,6MSMVhep,Oxidation of simvastatin to 6-exomethylene lactone form,Drug metabolism
2775,"oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, reduced flavin or flavoprotein as one donor, and incorporation of one atom of oxygen",0.007001,CYP2D6,glz_r + h_r + nadph_r + o2_r --> 6bhglz_r + h2o_r + nadp_r,6BHGLZhr,Oxidation of gliclazide to 6-beta-OH-gliclazide in hepatocytes,Drug metabolism
2776,"oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, reduced flavin or flavoprotein as one donor, and incorporation of one atom of oxygen",0.007001,CYP2D6,estrone_r + h_r + nadph_r + o2_r --> C05300_r + h2o_r + nadp_r,RE2235R,Unspecific Monooxygenase,Androgen and estrogen synthesis and metabolism


In [136]:
cols2 = ['p_fdr_bh','subsystem']
df_go_mol[df_go_mol.in_M][cols2].drop_duplicates().groupby('subsystem').median().sort_values('p_fdr_bh')

Unnamed: 0_level_0,p_fdr_bh
subsystem,Unnamed: 1_level_1
Fatty acid oxidation,0.008619
Androgen and estrogen synthesis and metabolism,0.010183
Tryptophan metabolism,0.010183
Steroid metabolism,0.010183
N-glycan metabolism,0.010183
Limonene and pinene degradation,0.010183
Linoleate metabolism,0.010183
Cytochrome metabolism,0.010183
Arachidonic acid metabolism,0.010183
Drug metabolism,0.010562


# Literature validation

-  "alterations in FA metabolism could play a pivotal role in production of arrhythmias":
https://www.fasebj.org/doi/abs/10.1096/fasebj.31.1_supplement.782.14


In [126]:
df_go_mol[ df_go_mol.rxn_id.isin(['HMGCOARx'])]

Unnamed: 0,name,namespace,id,p_fdr_bh,gene,symbol,in_M,rxn,rxn_id,rxn_name,subsystem
3233,protein homodimerization activity,molecular_function,GO:0042803,0.032488,3156,HMGCR,True,2.0 h_x + hmgcoa_x + 2.0 nadph_x --> coa_x + mev__R_x + 2.0 nadp_x,HMGCOARx,Hydroxymethylglutaryl CoA reductase (ir),Cholesterol metabolism


In [139]:
len(model.reactions)

10600

In [140]:
len(model.metabolites)

5835

In [127]:
df_go_mol.subsystem.unique()

array(['Drug metabolism',
       'Androgen and estrogen synthesis and metabolism',
       'Arachidonic acid metabolism', 'Steroid metabolism',
       'Fatty acid oxidation', 'Linoleate metabolism',
       'Cytochrome metabolism', 'Tryptophan metabolism',
       'Limonene and pinene degradation', nan, 'N-glycan metabolism',
       'Cholesterol metabolism', 'Tyrosine metabolism',
       'Purine catabolism', 'Vitamin A metabolism', 'Miscellaneous',
       'Transport, extracellular',
       'Glycine, serine, alanine, and threonine metabolism',
       'Methionine and cysteine metabolism'], dtype=object)

In [148]:
df_go[df_go.name.str.contains('channel')]

Unnamed: 0,name,namespace,id,p_fdr_bh,gene,symbol,in_M
222,chloride channel complex,cellular_component,GO:0034707,0.01629,55107,ANO1,False
223,chloride channel complex,cellular_component,GO:0034707,0.01629,2556,GABRA3,False
224,chloride channel complex,cellular_component,GO:0034707,0.01629,2558,GABRA5,False
225,chloride channel complex,cellular_component,GO:0034707,0.01629,2555,GABRA2,False
232,chloride channel activity,molecular_function,GO:0005254,0.007055,55107,ANO1,False
233,chloride channel activity,molecular_function,GO:0005254,0.007055,5349,FXYD3,False
234,chloride channel activity,molecular_function,GO:0005254,0.007055,2555,GABRA2,False
235,chloride channel activity,molecular_function,GO:0005254,0.007055,2556,GABRA3,False
236,chloride channel activity,molecular_function,GO:0005254,0.007055,2558,GABRA5,False
244,inhibitory extracellular ligand-gated ion channel activity,molecular_function,GO:0005237,0.010183,2555,GABRA2,False


---

In [151]:
mapping.side_effect_name_to_idx['Chronic Kidney Disease']

153

## Last step: show using simulation the mechanism by which FA oxidation (and others) cause the Arrhythmia side effect

Steps:
1. make heart-specific model
1. simulate healthy heart metabolism (linked to electrical conduction)
1. simulate arrhythmia
    1. by perturbing the PoSe-Path genes. Hypothesis: perturbing these genes causes arrhythmia, or that arrhythmia involves change in the reactions encoded by these genes
    1. by using known arrhythmia simulation parameters, and check consistency of known arrhythmia mechanisms with the genes found by PoSe-Path [X: DON'T already know arrhythmia parameters]
1. Go with A.
    - Validate using another metabolic marker of arrhythmia--show that it changes in the correct direction
    - Validate using omics (RNA-Seq, proteomics, metabolomics) data from arrhythmia patients


In [138]:
sol

Unnamed: 0,fluxes,reduced_costs
10FTHF5GLUtl,0.000000,0.0
10FTHF5GLUtm,0.000000,0.0
10FTHF6GLUtl,0.000000,0.0
10FTHF6GLUtm,0.000000,0.0
10FTHF7GLUtl,0.000000,0.0
...,...,...
CYOR_u10mi,433.920904,0.0
Htmi,0.000000,0.0
NADH2_u10mi,0.000000,0.0
CYOOm3i,0.000000,0.0


-----

In [45]:
geneid2info = {v.GeneID: (v.Symbol,v.description) for k,v in GeneID2nt_hum.items()}

In [46]:
[geneid2info[g] for g in geneids_study if g in geneid2info]

[('CYBB', 'cytochrome b-245, beta polypeptide'),
 ('TPO', 'thyroid peroxidase'),
 ('PLB1', 'phospholipase B1'),
 ('ABCC4', 'ATP binding cassette subfamily C member 4'),
 ('CYP2B6', 'cytochrome P450 family 2 subfamily B member 6'),
 ('CYP2C19', 'cytochrome P450 family 2 subfamily C member 19'),
 ('CYP2C8', 'cytochrome P450 family 2 subfamily C member 8'),
 ('CYP2D6', 'cytochrome P450 family 2 subfamily D member 6'),
 ('PDHA2', 'pyruvate dehydrogenase (lipoamide) alpha 2'),
 ('CETP', 'cholesteryl ester transfer protein, plasma'),
 ('RUSC1', 'RUN and SH3 domain containing 1'),
 ('ZSWIM2', 'zinc finger SWIM-type containing 2'),
 ('SLCO1B3', 'solute carrier organic anion transporter family member 1B3'),
 ('BCAT2', 'branched chain amino-acid transaminase 2, mitochondrial'),
 ('TSHB', 'thyroid stimulating hormone beta'),
 ('HMGCR', '3-hydroxy-3-methylglutaryl-CoA reductase'),
 ('ACE', 'angiotensin I converting enzyme'),
 ('TMEM52B', 'transmembrane protein 52B'),
 ('GPR179', 'G protein-coupled

## TODO: Separate out metabolic genes

gg = goea_results_sig[0]
gg.GO

https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb

In [32]:
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj


# Plot subset starting from these significant GO terms
goid_subset = [g.GO for g in goea_results_sig] # [
#     'GO:0003723', # MF D04 RNA binding (32 genes)
#     'GO:0044822', # MF D05 poly(A) RNA binding (86 genes)
#     'GO:0003729', # MF D06 mRNA binding (11 genes)
#     'GO:0019843', # MF D05 rRNA binding (6 genes)
#     'GO:0003746', # MF D06 translation elongation factor activity (5 genes)
# ]
plot_gos("results3/out/go_enrich.png", 
    goid_subset, # Source GO ids
    obodag, 
    goea_results=goea_results_all) # Use pvals for coloring
# plot_results("nbt3102_BP.png", goea_results_sig)

    8 usr  40 GOs  WROTE: results3/out/go_enrich.png


This plot contains GOEA results:

- GO terms colored by P-value:
    - pval < 0.005 (light red)
    - pval < 0.01 (light orange)
    - pval < 0.05 (yellow)
    - pval > 0.05 (grey) Study terms that are not statistically significant
- GO terms with study gene counts printed. e.g., "32 genes"

In [34]:
plot_gos("results3/out/go_enrich_symbols.pdf", 
    goid_subset, # Source GO ids
    obodag,
    goea_results=goea_results_all, # use pvals for coloring
    # We can further configure the plot...
    id2symbol=geneid2symbol, # Print study gene Symbols, not Entrez GeneIDs
    study_items=6, # Only only 6 gene Symbols max on GO terms
    items_p_line=3, # Print 3 genes per line
    )

    8 usr  40 GOs  WROTE: results3/out/go_enrich_symbols.pdf


In [35]:
c = Compound.from_cid(5090)
c.iupac_name

'3-(4-methylsulfonylphenyl)-4-phenyl-2H-furan-5-one'

In [36]:
cids = [int(mapping.drug_idx_to_id[c].replace('CID','')) for c in data['pd_idx'][1]]
drugs = [Compound.from_cid(int(cid)) for cid in set(cids)]

In [37]:
drugs[0].cid

5732

In [38]:
drugs

[Compound(5732),
 Compound(4170),
 Compound(3002190),
 Compound(5039),
 Compound(2162),
 Compound(3325)]

In [39]:
dd = drugs[0]
dd.cid

5732

In [48]:
mapping.drug_id_to_idx['CID152946']

KeyError: 'CID152946'

In [40]:
drug_ids = [(mapping.drug_id_to_idx['CID{}'.format(d.cid)], d.cid, d.synonyms[0], d.iupac_name) for d in drugs]
drug_ids

[(104,
  5732,
  'zolpidem',
  'N,N-dimethyl-2-[6-methyl-2-(4-methylphenyl)imidazo[1,2-a]pyridin-3-yl]acetamide'),
 (155,
  4170,
  'metolazone',
  '7-chloro-2-methyl-3-(2-methylphenyl)-4-oxo-1,2-dihydroquinazoline-6-sulfonamide'),
 (172,
  3002190,
  'Telithromycin',
  '(1S,2R,5R,7R,8R,9R,11R,13R,14R)-8-[(2S,3R,4S,6R)-4-(dimethylamino)-3-hydroxy-6-methyloxan-2-yl]oxy-2-ethyl-9-methoxy-1,5,7,9,11,13-hexamethyl-15-[4-(4-pyridin-3-ylimidazol-1-yl)butyl]-3,17-dioxa-15-azabicyclo[12.3.0]heptadecane-4,6,12,16-tetrone'),
 (31,
  5039,
  'UNII-884KT10YB7',
  "1-N'-[2-[[5-[(dimethylamino)methyl]furan-2-yl]methylsulfanyl]ethyl]-1-N-methyl-2-nitroethene-1,1-diamine"),
 (34,
  2162,
  'amlodipine',
  '3-O-ethyl 5-O-methyl 2-(2-aminoethoxymethyl)-4-(2-chlorophenyl)-6-methyl-1,4-dihydropyridine-3,5-dicarboxylate'),
 (96,
  3325,
  'famotidine',
  "3-[[2-(diaminomethylideneamino)-1,3-thiazol-4-yl]methylsulfanyl]-N'-sulfamoylpropanimidamide")]

In [41]:
fps = [d.fingerprint for d in drugs]

In [42]:
geneid2symbol[5243]

'ABCB1'

# Zielinski et al. (2015) Nat Commun

In [43]:
#mapping.drug_id_to_idx['CID3559'] # haloperidol
#print(mapping.drug_id_to_idx['CID4091']) # metformin--doesn't exist
print(mapping.drug_id_to_idx['CID2801']) # clomipramine
print(mapping.drug_id_to_idx['CID5523']) # Ultram 

147
24


import binascii
bs = binascii.unhexlify(fps[0])
bs

from scipy.spatial import distance

bfps = [binascii.unhexlify(f) for f in fps]
# distance.rogerstanimoto(bfps[0],bfps[2])

bb = bfps[0]
bb.

bfps[0]

bfps[2]

# Data will be stored in this variable
import os
geneid2symbol = {}
# Get xlsx filename where data is stored
ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory
din_xlsx = "../data/nbt.3102-S4_GeneIDs.xlsx"
# Read data
if os.path.isfile(din_xlsx):  
    import xlrd
    book = xlrd.open_workbook(din_xlsx)
    pg = book.sheet_by_index(0)
    for r in range(pg.nrows):
        symbol, geneid, pval = [pg.cell_value(r, c) for c in range(pg.ncols)]
        if geneid:
            geneid2symbol[int(geneid)] = symbol
    print('{N} genes READ: {XLSX}'.format(N=len(geneid2symbol), XLSX=din_xlsx))
else:
    raise RuntimeError('FILE NOT FOUND: {XLSX}'.format(XLSX=din_xlsx))