In [1]:
import os, sys
import numpy as np
import pandas as pd
from sqlalchemy.types import NVARCHAR, Text, Integer, Float

# Load data from generated from `MS_EMR/scripts` about drugs, diagnoses co-occurence.

In [2]:
id_ingredients = pd.read_csv('../data/single_ingredients.csv').set_index('id')
print id_ingredients.shape
id_ingredients.head()

(1588, 1)


Unnamed: 0_level_0,Ingredient
id,Unnamed: 1_level_1
0,5-HYDROXYTRYPTOPHAN
1,6-AMINOCAPROIC ACID
2,ABACAVIR
3,ABATACEPT
4,ABCIXIMAB


In [3]:
id_diagnoses = pd.read_csv('../data/all_ICD9s_explained.csv').set_index('id')
print id_diagnoses.shape
id_diagnoses.head()

(14353, 2)


Unnamed: 0_level_0,ICD9,diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,813.41,Closed Colles' fracture
1,E849.8,Accidents occurring in other specified places
2,E885.9,"Fall from other slipping, tripping, or stumbling"
3,424.1,Aortic valve disorders
4,155.0,"Malignant neoplasm of liver, primary"


In [4]:
id_diagnoses.tail()

Unnamed: 0_level_0,ICD9,diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1
14580,200.46,"Mantle cell lymphoma, intrapelvic lymph nodes"
14581,749.2,Cleft palate with cleft lip
14582,948.61,Burn [any degree] involving 60-69 percent of b...
14583,979.6,Poisoning by other and unspecified viral and r...
14584,789.4,Abdominal rigidity


# Load data from RepurposeHub to map pert_id to drug names

In [5]:
repo_df = pd.read_csv('../../Repurposing_Hub_export.txt', sep='\t').set_index('Name')
print repo_df.shape
repo_df.head()

(5628, 4)


Unnamed: 0_level_0,MOA,Target,Id,Phase
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A-317491,purinergic receptor antagonist,P2RX3,"BRD-K38019854-323-01-4, BRD-K38019854-001-01-6",Preclinical
A-33903,,,BRD-A40302156-001-01-9,Phase 2
A-366,histone lysine methyltransferase inhibitor,"EHMT1, EHMT2","BRD-K06182768-001-02-3, BRD-K06182768-001-01-5...",Preclinical
A-674563,AKT inhibitor,"AKT1, PKIA, PRKACA",BRD-K78177893-001-02-4,Preclinical
A-7,calmodulin antagonist,,BRD-K03301001-003-02-5,Preclinical


In [6]:
repo_df['pert_ids'] = repo_df['Id'].map(lambda x: set(['-'.join(s.split('-')[0:2]) for s in x.split(', ')]))
repo_df.head()

Unnamed: 0_level_0,MOA,Target,Id,Phase,pert_ids
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A-317491,purinergic receptor antagonist,P2RX3,"BRD-K38019854-323-01-4, BRD-K38019854-001-01-6",Preclinical,{BRD-K38019854}
A-33903,,,BRD-A40302156-001-01-9,Phase 2,{BRD-A40302156}
A-366,histone lysine methyltransferase inhibitor,"EHMT1, EHMT2","BRD-K06182768-001-02-3, BRD-K06182768-001-01-5...",Preclinical,{BRD-K06182768}
A-674563,AKT inhibitor,"AKT1, PKIA, PRKACA",BRD-K78177893-001-02-4,Preclinical,{BRD-K78177893}
A-7,calmodulin antagonist,,BRD-K03301001-003-02-5,Preclinical,{BRD-K03301001}


In [7]:
d_pert_id_name = {}
for name, row in repo_df.iterrows():
    for pert_id in row['pert_ids']:
        d_pert_id_name[pert_id] = name

print len(d_pert_id_name)        
repo_df = repo_df.drop(['pert_ids', 'Id'], axis=1)

6172


In [8]:
# Make a dataframe from repo_df indexed by pert_id
repo_df_by_pert = []
for pert_id, name in d_pert_id_name.items():
    rec = repo_df.loc[name].to_dict()
    rec['pert_id'] = pert_id
    rec['Name'] = name
    repo_df_by_pert.append(rec)

repo_df_by_pert = pd.DataFrame(repo_df_by_pert).set_index('pert_id')
print repo_df_by_pert.shape
repo_df_by_pert.head()

(6172, 4)


Unnamed: 0_level_0,MOA,Name,Phase,Target
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BRD-K15916496,"cytochrome P450 inhibitor, imidazoline recepto...",clotrimazole,Launched,"CYP3A4, KCNN4, NR1I2, NR1I3, TRPM2, TRPM4, TRPM8"
BRD-K84459715,,zopolrestat,Phase 2,
BRD-K32289541,Ras GTPase inhibitor,EHop-016,Preclinical,"RAC1, RAC3"
BRD-A55312468,ATPase inhibitor,k-strophanthidin,Phase 2,ATP1A1
BRD-K64874225,"ACAT inhibitor, sterol regulatory element bind...",NSC-4644,Phase 2,PYGM


# Load the metadata about pert_ids from `euclid4.drug`

In [10]:
from sqlalchemy import create_engine
engine = create_engine('mysql://euclid:elements@amp.pharm.mssm.edu:3306/euclid4?charset=utf8')
euclid4_drugs_df = pd.read_sql('drug', engine, index_col='pert_id')
print euclid4_drugs_df.shape
euclid4_drugs_df.head()

(20449, 16)


Unnamed: 0_level_0,alt_name,pert_iname,LSM_id,mls_id,ncgc_id,pert_collection,pert_icollection,pert_summary,pert_url,pubchem_cid,canonical_smiles,inchi_key,inchi_string,molecular_formula,molecular_wt,structure_url
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
BRD-A00100033,,nifurtimox,LSM-1232,,,BIOA,BIOA,,http://en.wikipedia.org/wiki/Nifurtimox,6842999,CC1CS(=O)(=O)CCN1N=Cc2ccc(o2)[N+](=O)[O-],InChIKey=ARFHIAQFJWUCFH-UHFFFAOYSA-N,"InChI=1S/C10H13N3O5S/c1-8-7-19(16,17)5-4-12(8)...",C10H13N3O5S,287.292,http://data.lincscloud.org/pert_images/BRD-A00...
BRD-A00150179,,5-hydroxytryptophan,,,,BIOA,BIOA,,,589768,NC(Cc1c[nH]c2cccc(O)c12)C(O)=O,InChIKey=QSHLMQDRPXXYEE-UHFFFAOYSA-N,InChI=1S/C11H12N2O3/c12-7(11(15)16)4-6-5-13-8-...,C11H12N2O3,220.225,http://data.lincscloud.org/pert_images/BRD-A00...
BRD-A00267231,,hemado,LSM-1233,,,BIOA,BIOA,,http://www.sigmaaldrich.com/catalog/product/si...,4043357,CCCCC#Cc1nc(NC)c2ncn(C3OC(CO)C(O)C3O)c2n1,InChIKey=KOCIMZNSNPOGOP-UHFFFAOYSA-N,InChI=1S/C17H23N5O4/c1-3-4-5-6-7-11-20-15(18-2...,C17H23N5O4,361.396,http://data.lincscloud.org/pert_images/BRD-A00...
BRD-A00420644,SA-3676,SA-3676,LSM-6366,,,COMB,MLPCN,,,2853908,CCN1C2C(C(=NC2Nc3ccccc13)OC)c4ccccc4,InChIKey=ASCBUEVCEVGOFP-UHFFFAOYSA-N,InChI=1S/C19H21N3O/c1-3-22-15-12-8-7-11-14(15)...,C19H21N3O,307.389,http://data.lincscloud.org/pert_images/BRD-A00...
BRD-A00474148,,BRD-A00474148,LSM-1234,MLS002703114,NCGC00187778-01,STRD,BIOA,,,44825297,Oc1ccc(cc1)N1CCN(CC1)[S+]([O-])(=O)c1ccc2NC(=O...,InChIKey=RCGAUPRLRFZAMS-UHFFFAOYSA-N,InChI=1S/C18H19N3O4S/c22-15-3-1-14(2-4-15)20-7...,C18H19N3O4S,373.426,http://data.lincscloud.org/pert_images/BRD-A00...


In [11]:
print len(np.intersect1d(euclid4_drugs_df.index, repo_df_by_pert.index))

2121


In [12]:
shared_pert_ids = np.intersect1d(euclid4_drugs_df.index, repo_df_by_pert.index)
print len(shared_pert_ids)
pert_ids_uniq_in_repo = np.setdiff1d(repo_df_by_pert.index, euclid4_drugs_df.index)
print len(pert_ids_uniq_in_repo)

2121
4051


In [13]:
repo_df_by_pert.loc[shared_pert_ids]['Phase'].value_counts()

Launched           1118
Preclinical         630
Phase 2             129
Phase 1              81
Phase 3              79
Withdrawn            62
Phase 2/Phase 3      11
Phase 1/Phase 2      11
Name: Phase, dtype: int64

In [14]:
repo_df_by_pert.loc[pert_ids_uniq_in_repo]['Phase'].value_counts()

Launched           1667
Preclinical        1000
Phase 2             568
Phase 1             386
Phase 3             322
Phase 1/Phase 2      44
Withdrawn            36
Phase 2/Phase 3      27
Name: Phase, dtype: int64

In [15]:
drug_names_shared = set(repo_df_by_pert.loc[shared_pert_ids]['Name']) 
drug_names_uniq_in_repo = set(repo_df_by_pert.loc[pert_ids_uniq_in_repo]['Name']) 
print len(drug_names_shared), len(drug_names_uniq_in_repo)
print len(drug_names_shared & drug_names_uniq_in_repo)

2075 3718
176


In [16]:
repo_df_by_pert.query('Phase == "Launched"')['Name'].nunique()

2341

In [17]:
names_in_repo = set(map(lambda x:x.upper(), repo_df_by_pert['Name']))
names_in_euclid = set(map(lambda x:x.upper(), euclid4_drugs_df['pert_iname']))
print len(names_in_repo), len(names_in_euclid)
print len(names_in_repo & names_in_euclid)

5617 19799
2087


In [18]:
repo_df_by_pert.head()

Unnamed: 0_level_0,MOA,Name,Phase,Target
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BRD-K15916496,"cytochrome P450 inhibitor, imidazoline recepto...",clotrimazole,Launched,"CYP3A4, KCNN4, NR1I2, NR1I3, TRPM2, TRPM4, TRPM8"
BRD-K84459715,,zopolrestat,Phase 2,
BRD-K32289541,Ras GTPase inhibitor,EHop-016,Preclinical,"RAC1, RAC3"
BRD-A55312468,ATPase inhibitor,k-strophanthidin,Phase 2,ATP1A1
BRD-K64874225,"ACAT inhibitor, sterol regulatory element bind...",NSC-4644,Phase 2,PYGM


In [19]:
euclid4_drugs_df_merged = euclid4_drugs_df.merge(repo_df_by_pert[['MOA', 'Target', 'Phase']], 
                                                 left_index=True, right_index=True,
                                                 how='left')
print euclid4_drugs_df_merged.shape
euclid4_drugs_df_merged.head()

(20449, 19)


Unnamed: 0_level_0,alt_name,pert_iname,LSM_id,mls_id,ncgc_id,pert_collection,pert_icollection,pert_summary,pert_url,pubchem_cid,canonical_smiles,inchi_key,inchi_string,molecular_formula,molecular_wt,structure_url,MOA,Target,Phase
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
BRD-A00100033,,nifurtimox,LSM-1232,,,BIOA,BIOA,,http://en.wikipedia.org/wiki/Nifurtimox,6842999,CC1CS(=O)(=O)CCN1N=Cc2ccc(o2)[N+](=O)[O-],InChIKey=ARFHIAQFJWUCFH-UHFFFAOYSA-N,"InChI=1S/C10H13N3O5S/c1-8-7-19(16,17)5-4-12(8)...",C10H13N3O5S,287.292,http://data.lincscloud.org/pert_images/BRD-A00...,DNA inhibitor,,Launched
BRD-A00150179,,5-hydroxytryptophan,,,,BIOA,BIOA,,,589768,NC(Cc1c[nH]c2cccc(O)c12)C(O)=O,InChIKey=QSHLMQDRPXXYEE-UHFFFAOYSA-N,InChI=1S/C11H12N2O3/c12-7(11(15)16)4-6-5-13-8-...,C11H12N2O3,220.225,http://data.lincscloud.org/pert_images/BRD-A00...,,,
BRD-A00267231,,hemado,LSM-1233,,,BIOA,BIOA,,http://www.sigmaaldrich.com/catalog/product/si...,4043357,CCCCC#Cc1nc(NC)c2ncn(C3OC(CO)C(O)C3O)c2n1,InChIKey=KOCIMZNSNPOGOP-UHFFFAOYSA-N,InChI=1S/C17H23N5O4/c1-3-4-5-6-7-11-20-15(18-2...,C17H23N5O4,361.396,http://data.lincscloud.org/pert_images/BRD-A00...,,,
BRD-A00420644,SA-3676,SA-3676,LSM-6366,,,COMB,MLPCN,,,2853908,CCN1C2C(C(=NC2Nc3ccccc13)OC)c4ccccc4,InChIKey=ASCBUEVCEVGOFP-UHFFFAOYSA-N,InChI=1S/C19H21N3O/c1-3-22-15-12-8-7-11-14(15)...,C19H21N3O,307.389,http://data.lincscloud.org/pert_images/BRD-A00...,,,
BRD-A00474148,,BRD-A00474148,LSM-1234,MLS002703114,NCGC00187778-01,STRD,BIOA,,,44825297,Oc1ccc(cc1)N1CCN(CC1)[S+]([O-])(=O)c1ccc2NC(=O...,InChIKey=RCGAUPRLRFZAMS-UHFFFAOYSA-N,InChI=1S/C18H19N3O4S/c22-15-3-1-14(2-4-15)20-7...,C18H19N3O4S,373.426,http://data.lincscloud.org/pert_images/BRD-A00...,,,


In [20]:

euclid4_drugs_df_merged.rename(index=str, 
                               columns={'MOA': 'moa', 'Target': 'target', 'Phase': 'phase', 'Ingredient_id': 'ingredient_id'},
                               inplace=True)
# euclid4_drugs_df_merged.to_sql('drug_repurposedb', engine, 
#                                if_exists='replace',
#                                dtype={
#                                    'pert_id':NVARCHAR(32),
#                                    'alt_name':NVARCHAR(255),
#                                    'pert_iname':NVARCHAR(255),
#                                    'LSM_id':NVARCHAR(16),
#                                    'mls_id':NVARCHAR(16),
#                                    'ncgc_id':NVARCHAR(16),
#                                    'pert_collection':NVARCHAR(16),
#                                    'pert_icollection':NVARCHAR(16),
#                                    'pert_summary': Text,
#                                    'pert_url': Text,
#                                    'pubchem_cid':NVARCHAR(16),
#                                    'canonical_smiles':Text,
#                                    'inchi_key': Text,
#                                    'inchi_string':Text,
#                                    'molecular_formular':Text,
#                                    'molecular_wt': Float,
#                                    'structure_url':Text,
#                                    'moa': NVARCHAR(255),
#                                    'target': Text,
#                                    'phase': NVARCHAR(255),
#                                    'ingredient_id': Integer,
#                                })

In [21]:
names_in_ingredints = set(id_ingredients['Ingredient'])
print len(names_in_ingredints)

print len(names_in_repo & names_in_ingredints)
print len(names_in_euclid & names_in_ingredints)

1588
882
667


In [22]:
len(names_in_repo & names_in_euclid & names_in_ingredints)

637

In [23]:
# Make a synonyms dict for all pert_ids
name_pert_id_df = repo_df_by_pert['Name'].to_frame().reset_index()
name_pert_id_df['Name'] = name_pert_id_df['Name'].map(lambda x:x.upper())
print name_pert_id_df.shape
name_pert_id_df.tail()

(6172, 2)


Unnamed: 0,pert_id,Name
6167,BRD-K46142322,RS-67333
6168,BRD-K84996356,AZD3514
6169,BRD-K68810443,ADIPORON
6170,BRD-K13888115,LY2365109
6171,BRD-K92588747,2-CMDO


In [24]:
name_pert_id_df2 = euclid4_drugs_df['pert_iname'].to_frame().reset_index()
print name_pert_id_df2.shape
name_pert_id_df2 = name_pert_id_df2.loc[name_pert_id_df2['pert_iname'] != name_pert_id_df2['pert_id']]
name_pert_id_df2['pert_iname'] =  name_pert_id_df2['pert_iname'].map(lambda x:x.upper())
name_pert_id_df2.rename(index=str, columns={'pert_iname': 'Name'}, inplace=True )
print name_pert_id_df2.shape
name_pert_id_df2.head()

(20449, 2)
(4559, 2)


Unnamed: 0,pert_id,Name
0,BRD-A00100033,NIFURTIMOX
1,BRD-A00150179,5-HYDROXYTRYPTOPHAN
2,BRD-A00267231,HEMADO
3,BRD-A00420644,SA-3676
5,BRD-A00520476,OTENZEPAD


In [25]:
name_pert_id_df3 = []
for pert_id, row in euclid4_drugs_df.query('alt_name != "NULL"').iterrows():
    alt_names = row['alt_name']
    if alt_names:
        for alt_name in alt_names.upper().split('|'):
            if alt_name != '':
                name_pert_id_df3.append({'Name':alt_name.strip(), 'pert_id':pert_id})

name_pert_id_df3 = pd.DataFrame(name_pert_id_df3)
print name_pert_id_df3.shape
name_pert_id_df3 = name_pert_id_df3[['pert_id', 'Name']]
name_pert_id_df3.head()

(3289, 2)


Unnamed: 0,pert_id,Name
0,BRD-A00420644,SA-3676
1,BRD-A00520476,AF-DX 116
2,BRD-A00546892,S1285
3,BRD-A00758722,NORETHYNODREL
4,BRD-A00827783,DIPROPHYLLINE


In [26]:
name_pert_id_df3.sort_values('Name')[:10]

Unnamed: 0,pert_id,Name
906,BRD-K08486545,(+)-CYMARIN
685,BRD-A99182808,(+)-USNIC-ACID
549,BRD-A75455249,(+/-)-KAVAIN
464,BRD-A64228451,(-)-TERREIC-ACID
2009,BRD-K54028654,(D)-(+)-TREHALOSE
650,BRD-A91555231,(L)-(-)-NOREPINEPHRINE
1773,BRD-K44993696,(R)-(+)-ATENOLOL
159,BRD-A20589515,"(RS)-3,5-DHPG"
1066,BRD-K14329163,(S)-(-)-BAY K 8644
2757,BRD-K81521265,"1,3-DICYCLOHEXYLUREA"


In [27]:
names_pert_id_df_full = name_pert_id_df.append(name_pert_id_df2).append(name_pert_id_df3)
print names_pert_id_df_full.shape
names_pert_id_df_full.drop_duplicates(inplace=True)
print names_pert_id_df_full.shape


(14020, 2)
(10798, 2)


In [28]:
print names_pert_id_df_full['pert_id'].nunique()
print names_pert_id_df_full['Name'].nunique()
names_pert_id_df_full.sort_values('Name').head()

8729
8890


Unnamed: 0,pert_id,Name
1152,BRD-A76934284,(+)-3-(1-PROPYL-PIPERIDIN-3-YL)-PHENOL
906,BRD-K08486545,(+)-CYMARIN
685,BRD-A99182808,(+)-USNIC-ACID
283,BRD-A18795974,"(+/-)-7-HYDROXY-2-(N,N-DI-N-PROPYLAMINO)TETRALIN"
549,BRD-A75455249,(+/-)-KAVAIN


In [29]:
print max(map(len, names_pert_id_df_full['pert_id'].unique()))

20


In [30]:
names_pert_id_df_full.index.name = 'id'
# names_pert_id_df_full.to_sql('drug_synonyms', engine, 
#                              if_exists='replace', index=True,
#                              dtype={
#                                  'id': Integer,
#                                  'pert_id': NVARCHAR(20)}
#                             )

In [31]:
print len(set(names_pert_id_df_full['Name'])), len(set(id_ingredients['Ingredient']))
print len(set(names_pert_id_df_full['Name']) & set(id_ingredients['Ingredient']))

8890 1588
931


In [32]:
id_ingredients_merged = id_ingredients.reset_index().merge(names_pert_id_df_full, 
                                             left_on='Ingredient',
                                             right_on='Name',
                                             how='left'
                                            )
id_ingredients_merged = id_ingredients_merged.drop(['Name'], axis=1)
id_ingredients_merged.index.name = 'associtaion_id'
print id_ingredients_merged.shape
id_ingredients_merged.head(30)

(2229, 3)


Unnamed: 0_level_0,id,Ingredient,pert_id
associtaion_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,5-HYDROXYTRYPTOPHAN,BRD-A73930134
1,0,5-HYDROXYTRYPTOPHAN,BRD-A00150179
2,1,6-AMINOCAPROIC ACID,
3,2,ABACAVIR,BRD-A95032015
4,2,ABACAVIR,BRD-K17443395
5,3,ABATACEPT,
6,4,ABCIXIMAB,
7,5,ABIRATERONE,BRD-K00111504
8,5,ABIRATERONE,BRD-K55301415
9,6,ABOBOTULINUMTOXINA,


In [33]:
for col in id_ingredients_merged.columns:
    print col, id_ingredients_merged[col].nunique()

id 1588
Ingredient 1588
pert_id 1564


In [33]:
# Write to a SQL table
# id_ingredients_merged.to_sql('drug_map', engine, if_exists='replace', dtype={'pert_id': NVARCHAR(20)})

In [34]:
id_ingredients_merged = pd.read_sql_table('drug_map', engine)

In [35]:
print id_ingredients_merged[['id', 'pert_id']].drop_duplicates().shape

(2229, 2)


In [36]:
id_ingredients_merged.dropna(axis=0).count()

associtaion_id    1572
id                1572
Ingredient        1572
pert_id           1572
dtype: int64

In [37]:
for col in id_ingredients_merged.dropna(axis=0).columns:
    print col, id_ingredients_merged.dropna(axis=0)[col].nunique()

associtaion_id 1572
id 931
Ingredient 931
pert_id 1564


In [38]:
d_pert_id_ingredient_id = dict(zip(id_ingredients_merged.dropna(axis=0)['pert_id'], 
                                   id_ingredients_merged.dropna(axis=0)['id']))
print len(d_pert_id_ingredient_id)

1564


In [39]:
d_ingredient_id_pert_id = dict(zip(id_ingredients_merged.dropna(axis=0)['id'], 
                                   id_ingredients_merged.dropna(axis=0)['pert_id']))
print len(d_ingredient_id_pert_id)

931


In [40]:
euclid4_drugs_df_merged['ingredient_id'] = [d_pert_id_ingredient_id.get(pert_id, None) 
                                            for pert_id in euclid4_drugs_df_merged.index]
print euclid4_drugs_df_merged.shape

(20449, 20)


In [41]:
max(euclid4_drugs_df_merged.index.map(len))

20

In [42]:
euclid4_drugs_df_merged.count()

alt_name             20433
pert_iname           20449
LSM_id               20449
mls_id               20412
ncgc_id              20412
pert_collection      20412
pert_icollection     20412
pert_summary         20412
pert_url             20412
pubchem_cid          20445
canonical_smiles     20447
inchi_key            20412
inchi_string         20447
molecular_formula    20447
molecular_wt         20447
structure_url        20449
moa                   2090
target                1764
phase                 2121
ingredient_id         1038
dtype: int64

In [43]:
euclid4_drugs_df_merged = euclid4_drugs_df_merged.replace('NULL', np.nan)
euclid4_drugs_df_merged.count()

alt_name              2612
pert_iname           20449
LSM_id               13833
mls_id                1326
ncgc_id                503
pert_collection      20387
pert_icollection     20376
pert_summary           830
pert_url              2950
pubchem_cid          20364
canonical_smiles     20365
inchi_key            20330
inchi_string         20365
molecular_formula    20365
molecular_wt         20447
structure_url        20367
moa                   2090
target                1764
phase                 2121
ingredient_id         1038
dtype: int64

In [44]:
euclid4_drugs_df_merged = euclid4_drugs_df_merged.replace('', np.nan)
euclid4_drugs_df_merged.count()

alt_name              2612
pert_iname           20449
LSM_id               13833
mls_id                1326
ncgc_id                503
pert_collection      20387
pert_icollection     20376
pert_summary           830
pert_url              2950
pubchem_cid          20364
canonical_smiles     20365
inchi_key            20330
inchi_string         20365
molecular_formula    20365
molecular_wt         20447
structure_url        20367
moa                   2090
target                1764
phase                 2121
ingredient_id         1038
dtype: int64

In [44]:
# euclid4_drugs_df_merged.to_sql('drug_repurposedb', engine, 
#                                if_exists='replace',
#                                dtype={
#                                    'pert_id':NVARCHAR(20),
#                                    'alt_name':NVARCHAR(255),
#                                    'pert_iname':NVARCHAR(255),
#                                    'LSM_id':NVARCHAR(16),
#                                    'mls_id':NVARCHAR(16),
#                                    'ncgc_id':NVARCHAR(16),
#                                    'pert_collection':NVARCHAR(16),
#                                    'pert_icollection':NVARCHAR(16),
#                                    'pert_summary': Text,
#                                    'pert_url': Text,
#                                    'pubchem_cid':NVARCHAR(16),
#                                    'canonical_smiles':Text,
#                                    'inchi_key': Text,
#                                    'inchi_string':Text,
#                                    'molecular_formular':Text,
#                                    'molecular_wt': Float,
#                                    'structure_url':Text,
#                                    'moa': NVARCHAR(255),
#                                    'target': Text,
#                                    'phase': NVARCHAR(255),
#                                    'ingredient_id': Integer,
#                                })

# Find the most frequent co-prescribed drug and diagnosis for pert_ids

In [45]:
rx_dx_counts = np.loadtxt('../data/rx_dx_count_matrix.txt', dtype=np.int)
print rx_dx_counts.shape, rx_dx_counts.dtype

(1588, 14585) int64


In [46]:
rx_rx_counts = np.loadtxt('../data/rx_count_matrix.txt', dtype=np.int)
print rx_rx_counts.shape, rx_rx_counts.dtype

(1588, 1588) int64


In [47]:
d_id_ingredient = dict(zip(id_ingredients.index, id_ingredients['Ingredient']))
print len(d_id_ingredient)
d_id_diagnosis = dict(zip(id_diagnoses.index, id_diagnoses['diagnosis']))
print len(d_id_diagnosis)

1588
14353


In [67]:
print min(d_id_ingredient.keys()), max(d_id_ingredient.keys())
print min(d_id_diagnosis.keys()), max(d_id_diagnosis.keys())

0 1587
0 14584


# Load the total counts for Rx and Dx

In [48]:
engine_laforge = create_engine('mysql://wangz10:collab2016@la-forge.mssm.edu/proj_aging')
dx_total_counts = pd.read_sql_query('''
SELECT * FROM dx_id_counts
''', engine_laforge).set_index('id')
print dx_total_counts.shape
dx_total_counts.head()

(14353, 1)


Unnamed: 0_level_0,COUNT(`id`)
id,Unnamed: 1_level_1
0,1014
1,26607
2,17425
3,91959
4,90189


In [49]:
print (dx_total_counts['COUNT(`id`)'] > 0 ).sum()
# convert to an array
dx_total_counts_array = np.zeros(rx_dx_counts.shape[1], dtype=np.int)
dx_total_counts_array[dx_total_counts.index.values] = dx_total_counts['COUNT(`id`)']
print dx_total_counts_array.shape

14353
(14585,)


In [50]:
rx_total_counts = pd.read_sql_query('''
SELECT * FROM ingredient_id_counts
''', engine_laforge).set_index('ingredient_id')
print rx_total_counts.shape
rx_total_counts.head()

(1588, 1)


Unnamed: 0_level_0,count
ingredient_id,Unnamed: 1_level_1
0,3
1,11
2,30040
3,199
4,33


In [54]:
print (rx_total_counts['count'] > 0 ).sum()
# convert to an array
rx_total_counts_array = np.zeros(rx_rx_counts.shape[1], dtype=np.int)
rx_total_counts_array[rx_total_counts.index.values] = rx_total_counts['count']
print rx_total_counts_array.shape

1588
(1588,)


In [61]:
# Use the total counts to normalize the co-occurence matrices
non_zero_mask = dx_total_counts_array > 0
rx_dx_counts_normed = np.zeros_like(rx_dx_counts, dtype=np.float)
rx_dx_counts_normed[:, non_zero_mask] = rx_dx_counts[:, non_zero_mask] / dx_total_counts_array[non_zero_mask].astype(np.float)
print rx_dx_counts_normed.shape

(1588, 14585)


In [62]:
rx_rx_counts_normed = rx_rx_counts / rx_total_counts_array.astype(np.float)
print rx_rx_counts_normed.shape


(1588, 1588)


In [64]:
pert_id = 'BRD-K92049597'
ingredient_id = d_pert_id_ingredient_id[pert_id]
print ingredient_id
dx_counts = rx_dx_counts_normed[ingredient_id]
rx_counts = rx_rx_counts_normed[ingredient_id]
print rx_counts.shape
# print np.argmax(dx_counts)
# print d_id_diagnosis[np.argmax(dx_counts)]
print np.argmax(rx_counts)
print d_id_ingredient[np.argmax(rx_counts)]

1493
(1588,)
1227
PUMPKIN SEED OIL


In [65]:
print rx_counts[:5]
print rx_counts[np.argmax(rx_counts)], rx_counts.max()

[ 0.          0.          0.00086551  0.01005025  0.        ]
0.5 0.5


In [81]:
print rx_rx_counts[1493, 1239], rx_rx_counts[1239, 1493]
print rx_rx_counts_normed[1493, 1239], rx_rx_counts_normed[1239, 1493]
print rx_total_counts_array[1493], rx_total_counts_array[1239]
print rx_total_counts_array.sum()

163 163
0.0177096914385 0.0163097858715
9994 9204
44685719


In [82]:
from scipy.stats import fisher_exact

In [85]:
print fisher_exact([[163, 9994], [9204, 44685719]])
print fisher_exact([[163, 9204], [9994, 44685719]])

(79.184540243920367, 1.2360219040711674e-240)
(79.184540243920367, 1.2360219792577169e-240)


In [96]:
from fisher import pvalue

In [104]:
%%timeit
pvalue(163, 9994, 9204, 44685719)

100 loops, best of 3: 7.46 ms per loop


In [105]:
%%timeit
fisher_exact([[163, 9994], [9204, 44685719]])


100 loops, best of 3: 3.92 ms per loop


In [100]:
oddsratio = 163 * 44685719 / float(9994 * 9204)
print oddsratio

79.1845402439


In [56]:
### Something is wrong here
most_frequent_df = []
for pert_id, ingredient_id in d_pert_id_ingredient_id.items():
    dx_counts = rx_dx_counts_normed[ingredient_id]
    rx_counts = rx_rx_counts_normed[ingredient_id]
    rec = {
        'pert_id': pert_id,
        'most_frequent_dx': d_id_diagnosis[np.argmax(dx_counts)],
        'most_frequent_rx': d_id_ingredient[np.argmax(rx_counts)],
    }
    most_frequent_df.append(rec)

most_frequent_df = pd.DataFrame(most_frequent_df).set_index('pert_id')
print most_frequent_df.shape
most_frequent_df.head(10)

(1564, 2)


Unnamed: 0_level_0,most_frequent_dx,most_frequent_rx
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1
BRD-K84281997,"Tracheostomy complication, unspecified",VITAMIN K 1
BRD-A17883755,"Other and unspecified cord entanglement, with ...",ZOLEDRONIC ACID
BRD-K15916496,"Subacute myeloid leukemia, in relapse",SORBITAN
BRD-A31159102,"Subacute leukemia of unspecified cell type, in...",ACAI BERRY EXTRACT
BRD-K72222507,Erysipelothrix infection,PROPYLENE
BRD-K29905972,Malignant carcinoid tumor of the kidney,ZOLEDRONIC ACID
BRD-K92049597,Circumscribed labyrinthitis,PUMPKIN SEED OIL
BRD-K47029922,Arthropathy associated with Reiter's disease a...,PINDOLOL
BRD-K04956647,"Retinopathy of prematurity, stage 4",QUERCETIN
BRD-A70461345,Of bypass graft of the extremities,BRETYLIUM


In [74]:
print d_id_ingredient[1493]
print d_pert_id_ingredient_id['BRD-K92049597']

TRIAMTERENE
1493


In [57]:
engine = create_engine('mysql://euclid:elements@amp.pharm.mssm.edu:3306/euclid4?charset=utf8')
most_frequent_df.to_sql('most_frequent_dx_rx', engine, 
                        if_exists='replace', 
                        dtype={'pert_id': NVARCHAR(20)}
                       )

## Compute oddsratio and p-value matrices for `rx_rx_counts` and `rx_dx_counts`

In [87]:
print rx_rx_counts.shape, rx_dx_counts.shape
print rx_total_counts_array.shape, dx_total_counts_array.shape

(1588, 1588) (1588, 14585)
(1588,) (14585,)


In [107]:
from itertools import combinations
from scipy.stats import fisher_exact
from joblib import Parallel, delayed

In [90]:
list(combinations(range(4),2))

[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

In [108]:
def compute_fisher(a, b, c, d):
    oddsratio, p = fisher_exact([[a,b], [c,d]])
    return oddsratio, p

In [106]:
rx_rx_counts_or = np.zeros_like(rx_rx_counts, dtype=np.float)
rx_rx_counts_p = np.zeros_like(rx_rx_counts, dtype=np.float)

universe = rx_total_counts_array.sum()
count = 0
for i, j in combinations(range(rx_rx_counts.shape[0]), 2):
    co_count = rx_rx_counts[i, j]
    b = rx_total_counts_array[i]
    c = rx_total_counts_array[j]
    oddsratio, p = fisher_exact([[co_count, b], [c, universe]])
    rx_rx_counts_or[i,j] = oddsratio
    rx_rx_counts_or[j,i] = oddsratio
    
    rx_rx_counts_p[i,j] = p
    rx_rx_counts_p[j,i] = p
    count += 1
    if count % 10000 == 0:
        print count

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000


KeyboardInterrupt: 

In [109]:
universe = rx_total_counts_array.sum()

or_p_s = Parallel(n_jobs=7, verbose=10, backend='multiprocessing')(delayed(compute_fisher)\
                            (rx_rx_counts[i, j], rx_total_counts_array[i], rx_total_counts_array[j], universe)\
                           for i, j in combinations(range(rx_rx_counts.shape[0]), 2))

[Parallel(n_jobs=7)]: Batch computation too fast (0.0064s.) Setting batch_size=62.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Batch computation too fast (0.0407s.) Setting batch_size=608.
[Parallel(n_jobs=7)]: Done 262 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 820 tasks      | elapsed:    0.2s
[Parallel(n_jobs=7)]: Batch computation too slow (2.0056s.) Setting batch_size=304.
[Parallel(n_jobs=7)]: Done 5746 tasks      | elapsed:    4.6s
[Parallel(n_jobs=7)]: Batch computation too slow (2.7751s.) Setting batch_size=152.
[Parallel(n_jobs=7)]: Done 12130 tasks      | elapsed:    6.6s
[Parallel(n_jobs=7)]: Batch computation too slow (2.2442s.) Setting batch_size=76.
[Parallel(n_jobs=7)]: Done 15474 tasks      | elapsed:    8.8s
[Parallel(n_jobs=7)]: Done 17526 tasks      | elapsed:    9.9s
[Parallel(n_jobs=7)]: Done 18514 tasks      | elapsed:   10.8s
[Parallel(n_jobs=7)]: Do

[Parallel(n_jobs=7)]: Batch computation too fast (0.1080s.) Setting batch_size=192.
[Parallel(n_jobs=7)]: Batch computation too slow (2.0436s.) Setting batch_size=96.
[Parallel(n_jobs=7)]: Batch computation too slow (2.2109s.) Setting batch_size=48.
[Parallel(n_jobs=7)]: Done 195104 tasks      | elapsed:  4.0min
[Parallel(n_jobs=7)]: Batch computation too slow (2.1246s.) Setting batch_size=24.
[Parallel(n_jobs=7)]: Done 199136 tasks      | elapsed:  4.2min
[Parallel(n_jobs=7)]: Done 201080 tasks      | elapsed:  4.2min
[Parallel(n_jobs=7)]: Batch computation too slow (2.0201s.) Setting batch_size=12.
[Parallel(n_jobs=7)]: Done 202388 tasks      | elapsed:  4.3min
[Parallel(n_jobs=7)]: Done 203360 tasks      | elapsed:  4.4min
[Parallel(n_jobs=7)]: Batch computation too fast (0.1893s.) Setting batch_size=24.
[Parallel(n_jobs=7)]: Batch computation too fast (0.0613s.) Setting batch_size=156.
[Parallel(n_jobs=7)]: Batch computation too slow (2.0052s.) Setting batch_size=78.
[Parallel(n_jo

[Parallel(n_jobs=7)]: Batch computation too slow (2.0290s.) Setting batch_size=22.
[Parallel(n_jobs=7)]: Done 489540 tasks      | elapsed:  9.7min
[Parallel(n_jobs=7)]: Batch computation too fast (0.1933s.) Setting batch_size=44.
[Parallel(n_jobs=7)]: Batch computation too fast (0.1245s.) Setting batch_size=140.
[Parallel(n_jobs=7)]: Batch computation too slow (2.0783s.) Setting batch_size=70.
[Parallel(n_jobs=7)]: Done 500114 tasks      | elapsed:  9.8min
[Parallel(n_jobs=7)]: Done 509144 tasks      | elapsed: 10.0min
[Parallel(n_jobs=7)]: Done 518174 tasks      | elapsed: 10.1min
[Parallel(n_jobs=7)]: Done 527344 tasks      | elapsed: 10.2min
[Parallel(n_jobs=7)]: Done 536514 tasks      | elapsed: 10.4min
[Parallel(n_jobs=7)]: Done 545824 tasks      | elapsed: 10.6min
[Parallel(n_jobs=7)]: Done 555134 tasks      | elapsed: 10.7min
[Parallel(n_jobs=7)]: Done 564584 tasks      | elapsed: 10.9min
[Parallel(n_jobs=7)]: Done 574034 tasks      | elapsed: 11.0min
[Parallel(n_jobs=7)]: Batch

[Parallel(n_jobs=7)]: Done 801988 tasks      | elapsed: 17.0min
[Parallel(n_jobs=7)]: Batch computation too slow (2.0076s.) Setting batch_size=77.
[Parallel(n_jobs=7)]: Batch computation too slow (2.1722s.) Setting batch_size=38.
[Parallel(n_jobs=7)]: Done 816740 tasks      | elapsed: 17.2min
[Parallel(n_jobs=7)]: Batch computation too fast (0.1901s.) Setting batch_size=78.
[Parallel(n_jobs=7)]: Done 825805 tasks      | elapsed: 17.4min
[Parallel(n_jobs=7)]: Batch computation too slow (2.0992s.) Setting batch_size=39.
[Parallel(n_jobs=7)]: Batch computation too fast (0.1907s.) Setting batch_size=80.
[Parallel(n_jobs=7)]: Done 838328 tasks      | elapsed: 17.6min
[Parallel(n_jobs=7)]: Batch computation too slow (2.0115s.) Setting batch_size=40.
[Parallel(n_jobs=7)]: Batch computation too slow (6.5947s.) Setting batch_size=20.
[Parallel(n_jobs=7)]: Batch computation too slow (8.3533s.) Setting batch_size=10.
[Parallel(n_jobs=7)]: Batch computation too fast (0.1962s.) Setting batch_size=2

[Parallel(n_jobs=7)]: Done 1131252 tasks      | elapsed: 23.9min
[Parallel(n_jobs=7)]: Batch computation too slow (2.1843s.) Setting batch_size=58.
[Parallel(n_jobs=7)]: Done 1144882 tasks      | elapsed: 24.2min
[Parallel(n_jobs=7)]: Done 1156192 tasks      | elapsed: 24.3min
[Parallel(n_jobs=7)]: Done 1167618 tasks      | elapsed: 24.4min
[Parallel(n_jobs=7)]: Batch computation too fast (0.1979s.) Setting batch_size=116.
[Parallel(n_jobs=7)]: Batch computation too slow (2.2920s.) Setting batch_size=58.
[Parallel(n_jobs=7)]: Done 1184496 tasks      | elapsed: 24.7min
[Parallel(n_jobs=7)]: Batch computation too slow (2.1420s.) Setting batch_size=29.
[Parallel(n_jobs=7)]: Batch computation too fast (0.1922s.) Setting batch_size=60.
[Parallel(n_jobs=7)]: Done 1191079 tasks      | elapsed: 24.9min
[Parallel(n_jobs=7)]: Done 1202771 tasks      | elapsed: 25.1min
[Parallel(n_jobs=7)]: Done 1214831 tasks      | elapsed: 25.3min
[Parallel(n_jobs=7)]: Done 1226891 tasks      | elapsed: 25.4min

In [110]:
rx_rx_counts_or = np.zeros_like(rx_rx_counts, dtype=np.float)
rx_rx_counts_p = np.zeros_like(rx_rx_counts, dtype=np.float)

c = 0
for i, j in combinations(range(rx_rx_counts.shape[0]), 2):
    oddsratio, p = or_p_s[c]
    rx_rx_counts_or[i,j] = oddsratio
    rx_rx_counts_or[j,i] = oddsratio
    
    rx_rx_counts_p[i,j] = p
    rx_rx_counts_p[j,i] = p
    c += 1

In [111]:
# Save those mat
np.savetxt('../data/rx_rx_oddsratio.txt', rx_rx_counts_or)
np.savetxt('../data/rx_rx_Fisherp.txt', rx_rx_counts_p)

In [112]:
print len(d_id_diagnosis), min(d_id_diagnosis.keys()), max(d_id_diagnosis.keys())
print rx_dx_counts.shape

14353 0 14584
(1588, 14585)


In [113]:
print rx_dx_counts.sum()
print dx_total_counts_array.sum(), rx_total_counts_array.sum()

151239067
42825357 44685719


In [115]:
from itertools import product
print len(list(product( d_id_ingredient.keys(), d_id_diagnosis.keys())))

22792564


In [116]:
universe = dx_total_counts_array.sum() + rx_total_counts_array.sum()

or_p_s = Parallel(n_jobs=8, verbose=10, backend='multiprocessing')(delayed(compute_fisher)\
                            (rx_dx_counts[i, j], rx_total_counts_array[i], dx_total_counts_array[j], universe)\
                           for i, j in product( d_id_ingredient.keys(), d_id_diagnosis.keys() ))

[Parallel(n_jobs=8)]: Batch computation too fast (0.0077s.) Setting batch_size=52.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Batch computation too fast (0.0666s.) Setting batch_size=312.
[Parallel(n_jobs=8)]: Done 484 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1472 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 4904 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 8336 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 12392 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 16448 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 21128 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 25808 tasks      | elapsed:    3.2s
[Parallel(n_jobs=8)]: Done 31112 tasks      | elapsed:    5.1s
[Parallel(n_jobs=8)]: Done 36416 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Don

[Parallel(n_jobs=8)]: Done 971586 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done 992651 tasks      | elapsed:  3.5min
[Parallel(n_jobs=8)]: Done 1014015 tasks      | elapsed:  3.6min
[Parallel(n_jobs=8)]: Done 1035771 tasks      | elapsed:  3.7min
[Parallel(n_jobs=8)]: Done 1057527 tasks      | elapsed:  3.7min
[Parallel(n_jobs=8)]: Batch computation too slow (2.1379s.) Setting batch_size=98.
[Parallel(n_jobs=8)]: Done 1073011 tasks      | elapsed:  3.8min
[Parallel(n_jobs=8)]: Done 1084085 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1991s.) Setting batch_size=196.
[Parallel(n_jobs=8)]: Done 1104567 tasks      | elapsed:  4.0min
[Parallel(n_jobs=8)]: Batch computation too slow (2.1237s.) Setting batch_size=98.
[Parallel(n_jobs=8)]: Done 1118091 tasks      | elapsed:  4.1min
[Parallel(n_jobs=8)]: Done 1129557 tasks      | elapsed:  4.1min
[Parallel(n_jobs=8)]: Done 1141023 tasks      | elapsed:  4.2min
[Parallel(n_jobs=8)]: Batch computati

[Parallel(n_jobs=8)]: Done 3027358 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done 3054268 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1948s.) Setting batch_size=282.
[Parallel(n_jobs=8)]: Done 3090526 tasks      | elapsed: 11.4min
[Parallel(n_jobs=8)]: Batch computation too slow (2.2167s.) Setting batch_size=141.
[Parallel(n_jobs=8)]: Done 3133390 tasks      | elapsed: 11.6min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1848s.) Setting batch_size=304.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0302s.) Setting batch_size=152.
[Parallel(n_jobs=8)]: Done 3166058 tasks      | elapsed: 11.7min
[Parallel(n_jobs=8)]: Batch computation too slow (2.1468s.) Setting batch_size=76.
[Parallel(n_jobs=8)]: Done 3192962 tasks      | elapsed: 11.9min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1976s.) Setting batch_size=152.
[Parallel(n_jobs=8)]: Batch computation too slow (2.6266s.) Setting batch_size=76.
[Parallel(n_jobs=8)]: Ba

[Parallel(n_jobs=8)]: Done 6002407 tasks      | elapsed: 21.6min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0271s.) Setting batch_size=105.
[Parallel(n_jobs=8)]: Done 6053017 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1941s.) Setting batch_size=216.
[Parallel(n_jobs=8)]: Done 6106219 tasks      | elapsed: 21.9min
[Parallel(n_jobs=8)]: Done 6165187 tasks      | elapsed: 22.1min
[Parallel(n_jobs=8)]: Done 6224155 tasks      | elapsed: 22.3min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0516s.) Setting batch_size=108.
[Parallel(n_jobs=8)]: Done 6259039 tasks      | elapsed: 22.5min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1997s.) Setting batch_size=216.
[Parallel(n_jobs=8)]: Done 6312067 tasks      | elapsed: 22.7min
[Parallel(n_jobs=8)]: Done 6371899 tasks      | elapsed: 22.8min
[Parallel(n_jobs=8)]: Batch computation too slow (2.2039s.) Setting batch_size=108.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1993s.) 

[Parallel(n_jobs=8)]: Batch computation too slow (2.0615s.) Setting batch_size=84.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1979s.) Setting batch_size=168.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0595s.) Setting batch_size=84.
[Parallel(n_jobs=8)]: Done 8679668 tasks      | elapsed: 32.9min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1962s.) Setting batch_size=170.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0077s.) Setting batch_size=85.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1994s.) Setting batch_size=170.
[Parallel(n_jobs=8)]: Done 8728672 tasks      | elapsed: 33.1min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0197s.) Setting batch_size=85.
[Parallel(n_jobs=8)]: Done 8772192 tasks      | elapsed: 33.3min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1976s.) Setting batch_size=172.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0128s.) Setting batch_size=86.
[Parallel(n_jobs=8)]: Done 8817733 tasks      | elapse

[Parallel(n_jobs=8)]: Done 11086180 tasks      | elapsed: 43.2min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0513s.) Setting batch_size=68.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1950s.) Setting batch_size=138.
[Parallel(n_jobs=8)]: Done 11133001 tasks      | elapsed: 43.5min
[Parallel(n_jobs=8)]: Done 11185303 tasks      | elapsed: 43.6min
[Parallel(n_jobs=8)]: Done 11237881 tasks      | elapsed: 43.8min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0321s.) Setting batch_size=69.
[Parallel(n_jobs=8)]: Done 11287354 tasks      | elapsed: 44.1min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1995s.) Setting batch_size=138.
[Parallel(n_jobs=8)]: Done 11313850 tasks      | elapsed: 44.3min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1944s.) Setting batch_size=282.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0903s.) Setting batch_size=141.
[Parallel(n_jobs=8)]: Done 11416858 tasks      | elapsed: 44.7min
[Parallel(n_jobs=8)]: Batch computat

[Parallel(n_jobs=8)]: Batch computation too slow (2.1016s.) Setting batch_size=106.
[Parallel(n_jobs=8)]: Done 13461657 tasks      | elapsed: 53.5min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1950s.) Setting batch_size=216.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0608s.) Setting batch_size=108.
[Parallel(n_jobs=8)]: Batch computation too slow (2.3893s.) Setting batch_size=54.
[Parallel(n_jobs=8)]: Batch computation too slow (2.9517s.) Setting batch_size=27.
[Parallel(n_jobs=8)]: Done 13522743 tasks      | elapsed: 53.9min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1974s.) Setting batch_size=54.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1958s.) Setting batch_size=110.
[Parallel(n_jobs=8)]: Batch computation too slow (2.0468s.) Setting batch_size=55.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1979s.) Setting batch_size=110.
[Parallel(n_jobs=8)]: Done 13547085 tasks      | elapsed: 54.2min
[Parallel(n_jobs=8)]: Batch computation too slow (

[Parallel(n_jobs=8)]: Batch computation too slow (2.0494s.) Setting batch_size=121.
[Parallel(n_jobs=8)]: Done 15917338 tasks      | elapsed: 64.1min
[Parallel(n_jobs=8)]: Done 15972877 tasks      | elapsed: 64.2min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1920s.) Setting batch_size=252.
[Parallel(n_jobs=8)]: Done 16043985 tasks      | elapsed: 64.5min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0025s.) Setting batch_size=126.
[Parallel(n_jobs=8)]: Done 16127901 tasks      | elapsed: 64.8min
[Parallel(n_jobs=8)]: Done 16186239 tasks      | elapsed: 65.1min
[Parallel(n_jobs=8)]: Done 16244577 tasks      | elapsed: 65.2min
[Parallel(n_jobs=8)]: Done 16303167 tasks      | elapsed: 65.4min
[Parallel(n_jobs=8)]: Batch computation too slow (2.0751s.) Setting batch_size=63.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1976s.) Setting batch_size=126.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1732s.) Setting batch_size=290.
[Parallel(n_jobs=8)]: Done 16407886

[Parallel(n_jobs=8)]: Batch computation too fast (0.1938s.) Setting batch_size=150.
[Parallel(n_jobs=8)]: Done 19276739 tasks      | elapsed: 77.2min
[Parallel(n_jobs=8)]: Batch computation too slow (2.1757s.) Setting batch_size=75.
[Parallel(n_jobs=8)]: Batch computation too slow (2.8416s.) Setting batch_size=37.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1991s.) Setting batch_size=74.
[Parallel(n_jobs=8)]: Done 19325370 tasks      | elapsed: 77.5min
[Parallel(n_jobs=8)]: Batch computation too fast (0.1997s.) Setting batch_size=148.
[Parallel(n_jobs=8)]: Done 19398558 tasks      | elapsed: 77.8min
[Parallel(n_jobs=8)]: Done 19473890 tasks      | elapsed: 78.1min
[Parallel(n_jobs=8)]: Done 19549222 tasks      | elapsed: 78.4min
[Parallel(n_jobs=8)]: Done 19624850 tasks      | elapsed: 78.6min
[Parallel(n_jobs=8)]: Batch computation too slow (2.1237s.) Setting batch_size=74.
[Parallel(n_jobs=8)]: Batch computation too fast (0.1999s.) Setting batch_size=148.
[Parallel(n_jobs=8)]

[Parallel(n_jobs=8)]: Done 22792564 out of 22792564 | elapsed: 91.2min finished


In [118]:
rx_dx_counts_or = np.zeros_like(rx_dx_counts, dtype=np.float)
rx_dx_counts_p = np.zeros_like(rx_dx_counts, dtype=np.float)

c = 0
for i, j in product( d_id_ingredient.keys(), d_id_diagnosis.keys() ):
    oddsratio, p = or_p_s[c]
    rx_dx_counts_or[i,j] = oddsratio    
    rx_dx_counts_p[i,j] = p
    c += 1

In [120]:
rx_dx_counts.shape

(1588, 14585)

In [119]:
# Save those mat
np.savetxt('../data/rx_dx_oddsratio.txt', rx_dx_counts_or)
np.savetxt('../data/rx_dx_Fisherp.txt', rx_dx_counts_p)

# Insert the count matrices to the database

In [117]:
d_id_ingredient = dict(zip(id_ingredients.index, id_ingredients['Ingredient']))
print len(d_id_ingredient)
d_id_diagnosis = dict(zip(id_diagnoses.index, id_diagnoses['diagnosis']))
print len(d_id_diagnosis)

1588
14353


In [68]:
rx_names = np.array([d_id_ingredient.get(i, None) for i in range(rx_rx_counts.shape[0])])
rx_pert_ids = np.array([d_ingredient_id_pert_id.get(i, None) for i in range(rx_rx_counts.shape[0])])
ingredient_ids = np.arange(rx_rx_counts.shape[0])
# print rx_pert_ids

rx_pert_ids_mask = pd.isnull(rx_pert_ids)
rx_names_mask = pd.isnull(rx_names)

print rx_names_mask.sum(), rx_pert_ids_mask.sum()
print rx_names_mask.shape, rx_pert_ids_mask.shape

rx_rx_counts_df = pd.DataFrame(rx_rx_counts, 
                               index=ingredient_ids,
#                                columns=rx_names[~rx_names_mask]
                               columns=ingredient_ids
                              )
print rx_rx_counts_df.shape
rx_rx_counts_df.index.name='ingredient_id'
rx_rx_counts_df.head()

0 657
(1588,) (1588,)
(1588, 1588)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587
ingredient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,3,3,...,0,42,0,0,58,12,1,0,1256,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0


In [70]:
rx_rx_counts_df = pd.melt(rx_rx_counts_df.reset_index(), 
                          id_vars=['ingredient_id'], 
                          value_vars=rx_rx_counts_df.columns.tolist(),
#                           var_name='co_prescribed_drug',
                          var_name='co_prescribed_drug_id',
                          value_name='count',
                         )
print rx_rx_counts_df.shape
rx_rx_counts_df = rx_rx_counts_df.query('count > 10')
print rx_rx_counts_df.shape
rx_rx_counts_df.head()

(2521744, 3)
(296506, 3)


Unnamed: 0,ingredient_id,co_prescribed_drug_id,count
3188,12,2,13
3189,13,2,3187
3190,14,2,13
3191,15,2,15
3195,19,2,170


In [62]:
rx_rx_counts_normed_df = pd.DataFrame(rx_rx_counts_normed, 
                                      index=ingredient_ids,
#                                       columns=rx_names[~rx_names_mask]
                                      columns=ingredient_ids
                                     )
print rx_rx_counts_normed_df.shape
rx_rx_counts_normed_df.index.name='ingredient_id'
rx_rx_counts_normed_df.head()

(1588, 1588)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587
ingredient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002822,0.00155,...,0.0,0.000998,0.0,0.0,0.000763,0.000846,0.000257,0.0,0.006699,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.3e-05,0.0,0.0,0.0,5e-06,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2e-05,0.0


In [63]:
rx_rx_counts_normed_df = pd.melt(rx_rx_counts_normed_df.reset_index(), 
                          id_vars=['ingredient_id'], 
                          value_vars=rx_rx_counts_normed_df.columns.tolist(),
                          var_name='co_prescribed_drug_id',
                          value_name='normed_count',
                         )
print rx_rx_counts_normed_df.shape
print rx_rx_counts_normed_df.shape
rx_rx_counts_normed_df.head()

(2521744, 3)
(2521744, 3)


Unnamed: 0,ingredient_id,co_prescribed_drug_id,normed_count
0,0,0,0.0
1,1,0,0.0
2,2,0,0.0
3,3,0,0.0
4,4,0,0.0


In [64]:
rx_rx_counts_normed_df.loc[rx_rx_counts_df.index[:10]]
# Add the normed_count as a column of the rx_rx_counts_df
rx_rx_counts_df['normed_count'] = rx_rx_counts_normed_df.loc[rx_rx_counts_df.index]['normed_count']
print rx_rx_counts_df.shape
rx_rx_counts_df.head()

(296506, 4)


Unnamed: 0,ingredient_id,co_prescribed_drug_id,count,normed_count
3188,12,2,13,0.000433
3189,13,2,3187,0.106092
3190,14,2,13,0.000433
3191,15,2,15,0.000499
3195,19,2,170,0.005659


In [65]:
rx_rx_counts_df.dtypes

ingredient_id              int64
co_prescribed_drug_id     object
count                      int64
normed_count             float64
dtype: object

In [66]:
rx_rx_counts_df['co_prescribed_drug_id'] = rx_rx_counts_df['co_prescribed_drug_id'].astype(np.int)

In [67]:
engine = create_engine('mysql://euclid:elements@amp.pharm.mssm.edu:3306/euclid4?charset=utf8')
rx_rx_counts_df.to_sql('co_rx', engine, index=False, if_exists='replace')

In [68]:
dx_names = np.array([d_id_diagnosis.get(i, None) for i in range(rx_dx_counts.shape[1])])
# print rx_pert_ids

dx_names_mask = pd.isnull(dx_names)

diagnosis_ids = np.arange(rx_dx_counts.shape[1])[~dx_names_mask]


print dx_names_mask.sum()

rx_dx_counts_df = pd.DataFrame(rx_dx_counts[:, ~dx_names_mask], 
                               index=ingredient_ids,
#                                columns=dx_names[~dx_names_mask]
                               columns=diagnosis_ids
                              )
print rx_dx_counts_df.shape
rx_dx_counts_df.index.name='ingredient_id'
rx_dx_counts_df.head()

273
(1588, 14312)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14575,14576,14577,14578,14579,14580,14581,14582,14583,14584
ingredient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,25,24,49,94,2,2939,12,8,10,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,20,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,21,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
rx_dx_counts_df = pd.melt(rx_dx_counts_df.reset_index(), 
                          id_vars=['ingredient_id'], 
                          value_vars=rx_dx_counts_df.columns.tolist(),
                          var_name='diagnosis_id',
                          value_name='count',
                         )
print rx_dx_counts_df.shape
rx_dx_counts_df = rx_dx_counts_df.query('count > 10')
print rx_dx_counts_df.shape
rx_dx_counts_df.head()

(22727456, 3)
(733240, 3)


Unnamed: 0,ingredient_id,diagnosis_id,count
13,13,0,276
36,36,0,13
77,77,0,24
108,108,0,11
114,114,0,23


In [70]:
rx_dx_counts_normed_df = pd.DataFrame(rx_dx_counts_normed[:, ~dx_names_mask], 
                               index=ingredient_ids,
                               columns=diagnosis_ids
                              )
print rx_dx_counts_normed_df.shape
rx_dx_counts_normed_df.index.name='ingredient_id'
rx_dx_counts_normed_df.head()

(1588, 14312)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14575,14576,14577,14578,14579,14580,14581,14582,14583,14584
ingredient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000986,0.00094,0.001377,0.000533,0.001042,0.001144,0.002122,0.001526,0.000813,0.001812,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.4e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.1e-05,0.0,0.0,1.5e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
rx_dx_counts_normed_df = pd.melt(rx_dx_counts_normed_df.reset_index(), 
                          id_vars=['ingredient_id'], 
                          value_vars=rx_dx_counts_normed_df.columns.tolist(),
                          var_name='diagnosis_id',
                          value_name='normed_count',
                         )
print rx_dx_counts_normed_df.shape

print rx_dx_counts_normed_df.shape
rx_dx_counts_normed_df.head()

(22727456, 3)
(22727456, 3)


Unnamed: 0,ingredient_id,diagnosis_id,normed_count
0,0,0,0.0
1,1,0,0.0
2,2,0,0.000986
3,3,0,0.0
4,4,0,0.0


In [72]:
# Add the normed_count as a column of the rx_rx_counts_df
rx_dx_counts_df['normed_count'] = rx_dx_counts_normed_df.loc[rx_dx_counts_df.index]['normed_count']
print rx_dx_counts_df.shape
rx_dx_counts_df.head()

(733240, 4)


Unnamed: 0,ingredient_id,diagnosis_id,count,normed_count
13,13,0,276,0.272189
36,36,0,13,0.012821
77,77,0,24,0.023669
108,108,0,11,0.010848
114,114,0,23,0.022682


In [73]:
rx_dx_counts_df.dtypes

ingredient_id      int64
diagnosis_id      object
count              int64
normed_count     float64
dtype: object

In [74]:
rx_dx_counts_df['diagnosis_id'] = rx_dx_counts_df['diagnosis_id'].astype(np.int)

In [75]:
rx_dx_counts_df.to_sql('co_dx', engine, index=False, if_exists='replace', 
                      )

In [76]:
del rx_dx_counts, rx_dx_counts_df, rx_dx_counts_normed, rx_dx_counts_normed_df
del rx_rx_counts, rx_rx_counts_df, rx_rx_counts_normed, rx_rx_counts_normed_df

# Transfer the Rx age KDE data

In [77]:
rx_age_kde = pd.read_sql_table('rx_age_kde', engine_laforge)
print rx_age_kde.shape
rx_age_kde.head()

(69250, 3)


Unnamed: 0,age_years,density,ingredient_id
0,7.682409,0.012317,1.0
1,9.103074,0.012775,1.0
2,10.52374,0.013165,1.0
3,11.944405,0.013484,1.0
4,13.36507,0.013729,1.0


In [78]:
rx_age_kde.dtypes

age_years        float64
density          float64
ingredient_id    float64
dtype: object

In [79]:
rx_age_kde['ingredient_id'] = rx_age_kde['ingredient_id'].astype(np.int)

In [80]:
# rx_age_kde['pert_id'] = [d_ingredient_id_pert_id.get(iid, None) for iid in rx_age_kde['ingredient_id']]
rx_age_kde.count()

age_years        69250
density          69250
ingredient_id    69250
dtype: int64

In [81]:
# rx_age_kde = rx_age_kde.drop(['ingredient_id'], axis=1).dropna()
print rx_age_kde.shape

(69250, 3)


In [82]:
rx_age_kde.dtypes

age_years        float64
density          float64
ingredient_id      int64
dtype: object

In [83]:
# max(rx_age_kde['pert_id'].map(len))

In [84]:
rx_age_kde.to_sql('rx_age_kde', engine, 
                  index=False, 
                  if_exists='replace',
                  dtype={'ingredient_id': Integer, 'age_years': Float, 'density': Float})

# Load the metadata_df used for the L1000FWD app

In [74]:
meta_df_l1000fwd = pd.read_csv('../data/metadata-full-anno.tsv', sep='\t').set_index('sig_id')
print meta_df_l1000fwd.shape
meta_df_l1000fwd.head()

(89419, 7)


Unnamed: 0_level_0,cell,dose,pert_id,perturbation,pvalue,time,drug_class
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CPC015_MCF7_6H:BRD-A00546892:10.0,MCF7,10.0,BRD-A00546892,biperiden,0.0143,6,unannotated
CPC004_VCAP_6H:BRD-A00546892:10.0,VCAP,10.0,BRD-A00546892,biperiden,0.2056,6,unannotated
CPC015_ASC_24H:BRD-A00546892:10.0,ASC,10.0,BRD-A00546892,biperiden,0.2475,24,unannotated
CPC004_VCAP_24H:BRD-A00546892:10.0,VCAP,10.0,BRD-A00546892,biperiden,0.3039,24,unannotated
CPC015_PHH_24H:BRD-A00546892:10.0,PHH,10.0,BRD-A00546892,biperiden,0.3584,24,unannotated


In [75]:
meta_df_l1000fwd = meta_df_l1000fwd.merge(repo_df_by_pert.drop(['Name','Target'], axis=1), 
                                          left_on='pert_id', 
                                          right_index=True, how='left')
print meta_df_l1000fwd.shape
meta_df_l1000fwd.head()

(89419, 9)


Unnamed: 0_level_0,cell,dose,pert_id,perturbation,pvalue,time,drug_class,MOA,Phase
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CPC015_MCF7_6H:BRD-A00546892:10.0,MCF7,10.0,BRD-A00546892,biperiden,0.0143,6,unannotated,acetylcholine receptor antagonist,Launched
CPC004_VCAP_6H:BRD-A00546892:10.0,VCAP,10.0,BRD-A00546892,biperiden,0.2056,6,unannotated,acetylcholine receptor antagonist,Launched
CPC015_ASC_24H:BRD-A00546892:10.0,ASC,10.0,BRD-A00546892,biperiden,0.2475,24,unannotated,acetylcholine receptor antagonist,Launched
CPC004_VCAP_24H:BRD-A00546892:10.0,VCAP,10.0,BRD-A00546892,biperiden,0.3039,24,unannotated,acetylcholine receptor antagonist,Launched
CPC015_PHH_24H:BRD-A00546892:10.0,PHH,10.0,BRD-A00546892,biperiden,0.3584,24,unannotated,acetylcholine receptor antagonist,Launched


In [76]:
meta_df_l1000fwd = meta_df_l1000fwd.merge(most_frequent_df, 
                       left_on='pert_id',
                       right_index=True,
                       how='left'
                      )
print meta_df_l1000fwd.shape
meta_df_l1000fwd.head()

(89419, 11)


Unnamed: 0_level_0,cell,dose,pert_id,perturbation,pvalue,time,drug_class,MOA,Phase,most_frequent_dx,most_frequent_rx
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CPC015_MCF7_6H:BRD-A00546892:10.0,MCF7,10.0,BRD-A00546892,biperiden,0.0143,6,unannotated,acetylcholine receptor antagonist,Launched,Fitting and adjustment of other devices relate...,SELEGILINE
CPC004_VCAP_6H:BRD-A00546892:10.0,VCAP,10.0,BRD-A00546892,biperiden,0.2056,6,unannotated,acetylcholine receptor antagonist,Launched,Fitting and adjustment of other devices relate...,SELEGILINE
CPC015_ASC_24H:BRD-A00546892:10.0,ASC,10.0,BRD-A00546892,biperiden,0.2475,24,unannotated,acetylcholine receptor antagonist,Launched,Fitting and adjustment of other devices relate...,SELEGILINE
CPC004_VCAP_24H:BRD-A00546892:10.0,VCAP,10.0,BRD-A00546892,biperiden,0.3039,24,unannotated,acetylcholine receptor antagonist,Launched,Fitting and adjustment of other devices relate...,SELEGILINE
CPC015_PHH_24H:BRD-A00546892:10.0,PHH,10.0,BRD-A00546892,biperiden,0.3584,24,unannotated,acetylcholine receptor antagonist,Launched,Fitting and adjustment of other devices relate...,SELEGILINE


In [77]:
meta_df_l1000fwd.to_csv('../data/metadata-full-anno-with-EMR.tsv', sep='\t')