# Paper next results
- validate link between arrhythmia, myopathy, and the identified genes (SLCO1B1)
- validate the drugs involved in arrhythmia
    - run PoSe-Path on the drugs found in literature for arrhythmia; check for other side effects (and check in literature)
- validate the other promising heart side effects
    - explain (jointly) related side effects (having similar mechanisms) in the heart and across organs (e.g., heart muscle, other muscle)
- validate muscle (myopathy) since connected with cardiovascular disease
- move on to validating liver (since SLCO1B1 is in liver), kidney, (brain)

In [1]:
import goatools
import pandas as pd
import numpy as np
import scipy.stats as stats
import pubchempy
import pickle

from pubchempy import Compound

# Get http://geneontology.org/ontology/go-basic.obo
from goatools.base import download_go_basic_obo
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from __future__ import print_function
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

# Seed list of PubChem compound identifiers ("CID" followed by the numeric id).
# NOTE: `drugs` is rebound to pubchempy Compound objects in later cells.
drugs = ['CID{}'.format(cid_number) for cid_number in (3883, 4594, 4679)]

In [2]:
# Other result pickles that can be loaded instead of the active one:
#   '../data/[91.0, 88.0]-[84.0, 84.0]-[28.0, 28.0]-1.0.pkl'
#   'results/out/pkl/all-all-28-1.0-0.99.pkl'
#   'results2/out/pkl/100,34,88-100,34,88-28-1.0-0.5.pkl'
#   'results3/out/pkl/147-all-all-3.0-0.96.pkl'
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('results3/out/pkl/all-all-508-3.0-0.97.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)
# Keep a copy of the loaded object under a second name.
dmap = data.copy()

In [3]:
# Load the pickled index/identifier mapping object.
# NOTE(review): absolute, machine-specific path; unpickle trusted files only.
with open('/home/laurence/git/posepath/PoSePath/src/tipexp_data.pkl', 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

In [4]:
# mapping.side_effect_idx_to_name

In [5]:
# First and second entries of the stored protein index lists.
idx1, idx2 = data['pp_idx'][0], data['pp_idx'][1]

In [6]:
# Translate both protein index lists to integer gene ids by stripping the
# 'GeneID' prefix from the mapped identifier.
data['pp_id'] = [
    [int(mapping.prot_idx_to_id[prot_idx].replace('GeneID', ''))
     for prot_idx in prot_indices]
    for prot_indices in (idx1, idx2)
]

In [7]:
# data

### Example 2: Clomipramine --> side effect --> PPI
- Drug 1: 147 (Clomipramine)
- Drug 2: 24 (all-->one)
- Side effect: 376 (aortic regurgitation)

Analyze after running below

In [8]:
# Distinct side-effect names referenced by the loaded results.
{mapping.side_effect_idx_to_name[se_idx] for se_idx in data['sd_idx']}

{'Arrhythmia'}

In [9]:
# Drug identifiers (e.g. 'CID...') for every index stored under 'drug2'.
d2 = [mapping.drug_idx_to_id[drug_idx] for drug_idx in data['drug2']]

In [10]:
# Numeric PubChem CIDs: drop the 'CID' prefix and convert to int.
cids = [int(drug_id.replace('CID', '')) for drug_id in d2]
# One Compound per distinct CID (the values are already ints).
drugs = [Compound.from_cid(cid) for cid in set(cids)]

In [11]:
# drugs

In [12]:
# drug_ids = [(d.cid, d.synonyms[0], d.iupac_name) for d in drugs]
# drug_ids

Find the side effects to run PoSe-Path

In [13]:
# Name fragments used to pick out heart-related side effects by substring match.
hearts = ['arrhythmia','cardiac','heart','ventric','atri','aort']

In [14]:
# (name, index) pairs whose name contains any heart keyword; the comparison
# is case-insensitive on both sides.
ses = [
    (name, idx)
    for name, idx in mapping.side_effect_name_to_idx.items()
    if any(kw.lower() in name.lower() for kw in hearts)
]
ses

[('heart attack', 186),
 ('arteriosclerotic heart disease', 202),
 ('Extrasystoles Ventricular', 239),
 ('Atrioventricular block first degree', 259),
 ('Cardiac decompensation', 287),
 ('heart rate increased', 327),
 ('aortic regurgitation', 358),
 ('cardiac murmur', 398),
 ('cardiac enlargement', 399),
 ('cardiac disease', 448),
 ('atrial septal defect', 453),
 ('atrial ectopic beats', 486),
 ('Arrhythmia', 508),
 ('Cardiac ischemia', 516),
 ('cardiac failure', 527),
 ('atrioventricular block second degree', 528),
 ('tachycardia ventricular', 535),
 ('atrial flutter', 538),
 ('cardiac valvulopathy', 560),
 ('Atrioventricular block complete', 562),
 ('ventricular fibrillation', 563),
 ('Cardiac tamponade', 580),
 ('Supraventricular tachycardia', 594),
 ('right heart failure', 615),
 ('aortic stenosis', 642),
 ('atrioventricular block', 656),
 ('congenital heart disease', 686),
 ('aortic aneurysm', 740),
 ('left ventricular hypertrophy', 773),
 ('Bradyarrhythmia', 791)]

In [15]:
# This length is computed but not shown: only a cell's last expression is displayed.
len(mapping.side_effect_idx_to_name)
# Invert the index -> name table into a name -> index lookup.
name_to_idx = {name: idx for idx, name in mapping.side_effect_idx_to_name.items()}

In [16]:
# Comma-separated list of the selected side-effect indices.
','.join(str(idx) for _name, idx in ses)

'186,202,239,259,287,327,358,398,399,448,453,486,508,516,527,528,535,538,560,562,563,580,594,615,642,656,686,740,773,791'

In [17]:
# Kidney-related side effects, selected by case-insensitive substring match.
# NOTE(review): 'renal' is also a substring of 'adrenal', so adrenal terms are
# selected as well; confirm whether that is intended.
renal = ['kidne','renal']
ses = [
    (name, idx)
    for name, idx in mapping.side_effect_name_to_idx.items()
    if any(kw.lower() in name.lower() for kw in renal)
]
ses

[('Chronic Kidney Disease', 143),
 ('acute kidney failure', 144),
 ('kidney failure', 256),
 ('renal cyst', 277),
 ('adrenal insufficiency', 301),
 ('disorder Renal', 333),
 ('renal tubular acidosis', 501),
 ('renal mass', 690),
 ('hepatorenal syndrome', 711),
 ('kidney pain', 730),
 ('kidney transplant', 811)]

In [57]:
# Muscle-related side effects, selected by case-insensitive substring match.
muscle = ['muscl','myopath']
ses = [
    (name, idx)
    for name, idx in mapping.side_effect_name_to_idx.items()
    if any(kw.lower() in name.lower() for kw in muscle)
]
ses

[('muscle spasm', 126),
 ('muscle paresis', 137),
 ('aching muscles', 175),
 ('muscle weakness', 190),
 ('Cardiomyopathy', 283),
 ('muscle strain', 416),
 ('muscle inflammation', 488),
 ('muscle disorder', 533)]

### Muscle / Heart / transporter genes
https://ascpt.onlinelibrary.wiley.com/doi/abs/10.1038/clpt.2013.234


https://www.ncbi.nlm.nih.gov/pubmed/29463526
- potentially strong association between arrhythmia and SLCO1B1 due to increased risk of prolonged QTc interval

### GO enrichment

In [18]:

# Fetch go-basic.obo; the returned file name is reused below.
obo_fname = download_go_basic_obo()

# Fetch NCBI's gene2go association file.
fin_gene2go = download_ncbi_associations()

# Load the ontology from the file reported by the download step instead of
# repeating its name as a string literal.
obodag = GODag(obo_fname)

# Read NCBI's gene2go. Store annotations in a list of namedtuples
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

# Get namespace2association where:
#    namespace is:
#        BP: biological_process
#        MF: molecular_function
#        CC: cellular_component
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

# Report how many genes carry annotations in each namespace.
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

# Enrichment study object covering every namespace in ns2assoc.
goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(), # List of human protein-coding genes
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # default multiple-test correction method

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-01-01) 47,337 GO Terms
HMS:0:00:02.977681 323,107 annotations READ: gene2go 
1 taxids stored: 9606
MF 17,384 annotated human genes
BP 17,541 annotated human genes
CC 18,648 annotated human genes

Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 80% 16,711 of 20,913 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 85% 17,755 of 20,913 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 80% 16,699 of 20,913 population items found in association


In [19]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
# Study set: the gene ids of both protein lists, concatenated.
# NOTE(review): list concatenation keeps duplicate ids -- TODO confirm whether
# the study set should be de-duplicated before running the enrichment.
geneids_study = data['pp_id'][0] + data['pp_id'][1] # geneid2symbol.keys()
goea_results_all = goeaobj.run_study(geneids_study)


Run BP Gene Ontology Analysis: current study set of 6 IDs ...
100%      4 of      4 study items found in association
 67%      4 of      6 study items found in population(20913)
Calculating 12,189 uncorrected p-values using fisher_scipy_stats
  12,189 GO terms are associated with 16,711 of 20,913 population items
      18 GO terms are associated with      4 of      4 study items
  METHOD fdr_bh:
       4 GO terms found significant (< 0.05=alpha) (  4 enriched +   0 purified): statsmodels fdr_bh
       4 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 6 IDs ...
100%      4 of      4 study items found in association
 67%      4 of      6 study items found in population(20913)
Calculating 1,731 uncorrected p-values using fisher_scipy_stats
   1,731 GO terms are associated with 17,755 of 20,913 population items
      12 GO terms are associated with      4 of   

In [20]:
# Keep only results whose FDR-corrected p-value is below 0.05.
goea_results_sig = [res for res in goea_results_all if res.p_fdr_bh < 0.05]

In [21]:
# Number of results that passed the significance filter.
len(goea_results_sig)

8

In [22]:
# Take the first entry of the significant results and display its GO term record.
g0 = goea_results_sig[0]
g0.goterm

GOTerm('GO:0015721'):
  id:GO:0015721
  item_id:GO:0015721
  name:bile acid and bile salt transport
  namespace:biological_process
  _parents: 3 items
    GO:0015718
    GO:0015850
    GO:0006869
  parents: 3 items
    GO:0015850	level-05	depth-05	organic hydroxy compound transport [biological_process]
    GO:0015718	level-07	depth-08	monocarboxylic acid transport [biological_process]
    GO:0006869	level-05	depth-05	lipid transport [biological_process]
  children: 1 items
    GO:0015722	level-06	depth-10	canalicular bile acid transport [biological_process]
  level:6
  depth:9
  is_obsolete:False
  alt_ids: 0 items

In [23]:
# Namespace of the GO term attached to the first significant result.
g0.goterm.namespace

'biological_process'

In [24]:
import pandas as pd

In [25]:
# FDR-corrected p-value of the first significant result.
g0.p_fdr_bh

0.00010470005154705395

In [26]:
# Scratch check: add a second key to an existing dict in place.
d1 = {'a': 1}
d1['b'] = 2
d1

{'a': 1, 'b': 2}

In [27]:
# Tabulate the significant GO terms: name and namespace are read from the GO
# term record, the FDR-corrected p-value from the enrichment result itself.
keys = ['name', 'namespace']
df_go1 = pd.DataFrame([
    {k: getattr(res.goterm, k, None) for k in keys}
    for res in goea_results_sig
])
df_p = pd.DataFrame([{'p_fdr_bh': res.p_fdr_bh} for res in goea_results_sig])
# Both frames are built from the same list in the same order, so they share
# the default row index and can be aligned on it.
df_go = df_go1.join(df_p)

In [28]:
# Count the significant GO terms per namespace.
df_go.groupby('namespace').count()

Unnamed: 0_level_0,name,p_fdr_bh
namespace,Unnamed: 1_level_1,Unnamed: 2_level_1
biological_process,4,4
cellular_component,1,1
molecular_function,3,3


In [45]:
# All significant terms ordered by p-value; namespace and name break ties.
df_go.sort_values(['p_fdr_bh','namespace','name'])

Unnamed: 0,name,namespace,p_fdr_bh
0,bile acid and bile salt transport,biological_process,0.000105
1,transmembrane transport,biological_process,0.0002
4,basolateral plasma membrane,cellular_component,0.005575
5,bile acid transmembrane transporter activity,molecular_function,0.00628
6,sodium-independent organic anion transmembrane...,molecular_function,0.006849
2,organic anion transport,biological_process,0.010025
3,sodium-independent organic anion transport,biological_process,0.010025
7,ATPase-coupled transmembrane transporter activity,molecular_function,0.037601


In [29]:
# Molecular-function terms only, in the same sort order as the full table.
df_go.loc[df_go['namespace'] == 'molecular_function'].sort_values(
    ['p_fdr_bh', 'namespace', 'name'])

Unnamed: 0,name,namespace,p_fdr_bh
5,bile acid transmembrane transporter activity,molecular_function,0.00628
6,sodium-independent organic anion transmembrane...,molecular_function,0.006849
7,ATPase-coupled transmembrane transporter activity,molecular_function,0.037601


In [30]:
# Gene id -> (symbol, description); only the record values are needed.
geneid2info = {nt.GeneID: (nt.Symbol, nt.description) for nt in GeneID2nt_hum.values()}

In [31]:
# Symbol and description for each study gene id that has a record.
[geneid2info[gene_id] for gene_id in geneids_study if gene_id in geneid2info]

[('ABCB11', 'ATP binding cassette subfamily B member 11'),
 ('ABCC4', 'ATP binding cassette subfamily C member 4'),
 ('ABCC4', 'ATP binding cassette subfamily C member 4'),
 ('SLCO1B3', 'solute carrier organic anion transporter family member 1B3'),
 ('SLCO1B3', 'solute carrier organic anion transporter family member 1B3'),
 ('SLCO1B1', 'solute carrier organic anion transporter family member 1B1')]

## TODO: Separate out metabolic genes

# GO id of the first entry in the significant results.
gg = goea_results_sig[0]
gg.GO

https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb

In [32]:
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj

# Source GO ids for the plot: every significant term from the enrichment.
goid_subset = [res.GO for res in goea_results_sig]
# Example ids kept for reference (not used):
#     'GO:0003723', # MF D04 RNA binding (32 genes)
#     'GO:0044822', # MF D05 poly(A) RNA binding (86 genes)
#     'GO:0003729', # MF D06 mRNA binding (11 genes)
#     'GO:0019843', # MF D05 rRNA binding (6 genes)
#     'GO:0003746', # MF D06 translation elongation factor activity (5 genes)
plot_gos(
    "results3/out/go_enrich.png",
    goid_subset,                     # source GO ids
    obodag,
    goea_results=goea_results_all,   # p-values are used for colouring
)
# plot_results("nbt3102_BP.png", goea_results_sig)

    8 usr  40 GOs  WROTE: results3/out/go_enrich.png


In [33]:
# Gene id -> gene symbol; only the record values are needed.
geneid2symbol = {nt.GeneID: nt.Symbol for nt in GeneID2nt_hum.values()}

This plot contains GOEA results:

- GO terms colored by P-value:
    - pval < 0.005 (light red)
    - pval < 0.01 (light orange)
    - pval < 0.05 (yellow)
    - pval > 0.05 (grey) Study terms that are not statistically significant
- GO terms with study gene counts printed. e.g., "32 genes"

In [34]:
# Same plot as the PNG version, written as PDF and labelled with gene symbols.
plot_gos(
    "results3/out/go_enrich_symbols.pdf",
    goid_subset,                    # source GO ids
    obodag,
    goea_results=goea_results_all,  # p-values are used for colouring
    id2symbol=geneid2symbol,        # print gene symbols, not Entrez GeneIDs
    study_items=6,                  # at most 6 gene symbols per GO term
    items_p_line=3,                 # 3 genes per line
)

    8 usr  40 GOs  WROTE: results3/out/go_enrich_symbols.pdf


In [35]:
# Look up a single compound by its PubChem CID and show its IUPAC name.
c = Compound.from_cid(5090)
c.iupac_name

'3-(4-methylsulfonylphenyl)-4-phenyl-2H-furan-5-one'

In [36]:
# Numeric PubChem CIDs for the second entry of the stored 'pd_idx' lists.
cids = [
    int(mapping.drug_idx_to_id[drug_idx].replace('CID', ''))
    for drug_idx in data['pd_idx'][1]
]
# One Compound per distinct CID (the values are already ints).
drugs = [Compound.from_cid(cid) for cid in set(cids)]

In [37]:
# CID of the first retrieved compound.
drugs[0].cid

5732

In [38]:
# Show all retrieved compounds.
drugs

[Compound(5732),
 Compound(4170),
 Compound(3002190),
 Compound(5039),
 Compound(2162),
 Compound(3325)]

In [39]:
# Keep a handle on the first compound and show its CID.
dd = drugs[0]
dd.cid

5732

In [48]:
# This CID is absent from the drug mapping, so the lookup raises KeyError.
mapping.drug_id_to_idx['CID152946']

KeyError: 'CID152946'

In [40]:
# One row per compound: (mapping index, CID, first synonym, IUPAC name).
drug_ids = [
    (
        mapping.drug_id_to_idx['CID{}'.format(compound.cid)],
        compound.cid,
        compound.synonyms[0],
        compound.iupac_name,
    )
    for compound in drugs
]
drug_ids

[(104,
  5732,
  'zolpidem',
  'N,N-dimethyl-2-[6-methyl-2-(4-methylphenyl)imidazo[1,2-a]pyridin-3-yl]acetamide'),
 (155,
  4170,
  'metolazone',
  '7-chloro-2-methyl-3-(2-methylphenyl)-4-oxo-1,2-dihydroquinazoline-6-sulfonamide'),
 (172,
  3002190,
  'Telithromycin',
  '(1S,2R,5R,7R,8R,9R,11R,13R,14R)-8-[(2S,3R,4S,6R)-4-(dimethylamino)-3-hydroxy-6-methyloxan-2-yl]oxy-2-ethyl-9-methoxy-1,5,7,9,11,13-hexamethyl-15-[4-(4-pyridin-3-ylimidazol-1-yl)butyl]-3,17-dioxa-15-azabicyclo[12.3.0]heptadecane-4,6,12,16-tetrone'),
 (31,
  5039,
  'UNII-884KT10YB7',
  "1-N'-[2-[[5-[(dimethylamino)methyl]furan-2-yl]methylsulfanyl]ethyl]-1-N-methyl-2-nitroethene-1,1-diamine"),
 (34,
  2162,
  'amlodipine',
  '3-O-ethyl 5-O-methyl 2-(2-aminoethoxymethyl)-4-(2-chlorophenyl)-6-methyl-1,4-dihydropyridine-3,5-dicarboxylate'),
 (96,
  3325,
  'famotidine',
  "3-[[2-(diaminomethylideneamino)-1,3-thiazol-4-yl]methylsulfanyl]-N'-sulfamoylpropanimidamide")]

In [41]:
# Fingerprint of every retrieved compound, in the same order as `drugs`.
fps = [compound.fingerprint for compound in drugs]

In [42]:
# Symbol lookup for a single gene id.
geneid2symbol[5243]

'ABCB1'

# Zielinski et al. (2015) Nat Commun

In [43]:
# Print the mapping index of individual drugs, looked up by PubChem CID.
#mapping.drug_id_to_idx['CID3559'] # haloperidol
#print(mapping.drug_id_to_idx['CID4091']) # metformin--doesn't exist
print(mapping.drug_id_to_idx['CID2801']) # clomipramine
print(mapping.drug_id_to_idx['CID5523']) # Ultram 

147
24


import binascii
# Decode the first fingerprint from its hexadecimal text form into raw bytes.
bs = binascii.unhexlify(fps[0])
bs

from scipy.spatial import distance

# Raw-byte form of every fingerprint, in the same order as `fps`.
bfps = [binascii.unhexlify(fp_hex) for fp_hex in fps]
# distance.rogerstanimoto(bfps[0],bfps[2])

# Display the first decoded fingerprint. The cell previously ended in a
# dangling "bb." (unfinished attribute access), which is a SyntaxError.
bb = bfps[0]
bb

# Raw bytes of the first fingerprint.
bfps[0]

# Raw bytes of the third fingerprint.
bfps[2]

# Gene id -> symbol table filled from the spreadsheet below.
# NOTE: this rebinds `geneid2symbol`, replacing any earlier contents.
import os
geneid2symbol = {}
ROOT = os.path.dirname(os.getcwd()) # parent of the current working directory
din_xlsx = "../data/nbt.3102-S4_GeneIDs.xlsx"
# Fail fast when the spreadsheet is missing.
if not os.path.isfile(din_xlsx):
    raise RuntimeError('FILE NOT FOUND: {XLSX}'.format(XLSX=din_xlsx))
# NOTE(review): recent xlrd releases reportedly read only .xls files --
# confirm the installed version can still open this .xlsx.
import xlrd
book = xlrd.open_workbook(din_xlsx)
pg = book.sheet_by_index(0)
for row in range(pg.nrows):
    # The 3-way unpacking requires exactly three cells per row.
    symbol, geneid, pval = [pg.cell_value(row, col) for col in range(pg.ncols)]
    # Rows with an empty gene id cell are skipped.
    if geneid:
        geneid2symbol[int(geneid)] = symbol
print('{N} genes READ: {XLSX}'.format(N=len(geneid2symbol), XLSX=din_xlsx))