# DSigDB FDA Approved Drugs

Author: Moshe Silverstein   
Date: 05-07-2018 (MM-DD-YYYY)  
Data Source Home: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/  
Data Source Download: http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/download.html  

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
sns.set(color_codes=True)
np.random.seed(sum(map(ord, "distributions")))

In [3]:
from clustergrammer_widget import *
net = Network(clustergrammer_widget)

In [4]:
importlib.reload(uf)

<module 'utility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/DSigDB/utility_functions.py'>

# Versions Of Modules In Use

In [5]:
%load_ext version_information
%version_information numpy, pandas, clustergrammer_widget, seaborn 

Software,Version
Python,3.5.5 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,6.3.1
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.2
pandas,0.22.0
clustergrammer_widget,1.9.0
seaborn,0.8.1
Mon May 07 15:32:04 2018 EDT,Mon May 07 15:32:04 2018 EDT


# Path to Output Files

In [6]:
# Output folder for every file this notebook writes. It is resolved against the
# current working directory (the same base the relative 'Input/...' read uses)
# instead of a user-specific absolute path, so the notebook runs on any machine.
# The empty last component makes os.path.join append the trailing separator
# that the later `path + '<file name>'` concatenations rely on.
path = os.path.join(os.getcwd(), 'Output', '')
# Make sure the folder exists so the save cells do not fail on a fresh checkout.
os.makedirs(path, exist_ok=True)

# Load Data

In [7]:
data = pd.read_csv('Input/DSigDB_All_detailed.txt', sep='\t')

In [8]:
data.head()

Unnamed: 0,Drug,Gene,Type,Source
0,citric acid,ABHD5,IC50=3.545(uM),D1 PubChem
1,citric acid,PLIN5,IC50=3.545(uM),D1 PubChem
2,citric acid,PLIN1,IC50=3.708(uM),D1 PubChem
3,citric acid,ABHD5,IC50=5.632(uM),D1 PubChem
4,citric acid,PLIN5,IC50=5.632(uM),D1 PubChem


In [9]:
data.shape

(688782, 4)

# Get FDA Approved Drug Data

In [10]:
# Keep only the rows whose Source is a string containing 'D1' (the rows this
# section treats as FDA-approved drug data); non-string Source values, such as
# missing entries, are dropped.
# A single boolean mask replaces the former row-by-row loop, which performed
# two scalar .loc lookups for each row of the full table.
is_d1_source = data['Source'].map(lambda src: isinstance(src, str) and 'D1' in src)

data = data.loc[is_d1_source, :]

In [11]:
data.head()

Unnamed: 0,Drug,Gene,Type,Source
0,citric acid,ABHD5,IC50=3.545(uM),D1 PubChem
1,citric acid,PLIN5,IC50=3.545(uM),D1 PubChem
2,citric acid,PLIN1,IC50=3.708(uM),D1 PubChem
3,citric acid,ABHD5,IC50=5.632(uM),D1 PubChem
4,citric acid,PLIN5,IC50=5.632(uM),D1 PubChem


In [12]:
data.shape

(23882, 4)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
data.set_index('Gene', inplace=True)

In [14]:
uf.mapgenesymbols(data)

Progeres: 99%  23666 Out of 23882   

In [15]:
data.shape

# Drop Duplicates

In [16]:
data.reset_index(inplace=True)

In [17]:
data.drop_duplicates(subset=['Gene', 'Drug'], inplace=True)

In [18]:
data.shape

# Create Binary Matrix

In [19]:
binary_matrix = uf.createBinaryMatrix(data[['Gene', 'Drug']])

Progeres: 100%  1279 Out of 1279   

In [20]:
# Upper-case every column (drug) label of the binary matrix.
lst = list(map(lambda label: label.upper(), binary_matrix.columns))

binary_matrix.columns = lst

In [21]:
binary_matrix.head()

Unnamed: 0,GALANTHAMINE,CYCLIZINE,SULFANILAMIDE,ETHOTOIN,PREDNISOLONE,LAPATINIB DITOSYLATE HYDRATE,RETINOL,CARMUSTINE,EMTRICITABINE,LAPATINIB,...,DESVENLAFAXINE,METHOTREXATE,PARICALCITOL,LAMOTRIGINE,TINIDAZOLE,ADAPALENE,GUANFACINE HYDROCHLORIDE,GABAPENTIN,AMLODIPINE,TOLMETIN SODIUM
CSNK1G3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DRD5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KCNC3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
APAF1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RORC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
binary_matrix.shape

(1279, 1205)

# Save Binary Matrix

In [23]:
# Tag the file name with the current year and month (YYYY_MM).
# The matrix is written as a gzip stream, so the file carries a '.gz'
# extension; the former '.zip' name did not match the gzip content.
filename = path + 'dsigdb_fda_appvd_drugs_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [24]:
name = 'dsigdb_fda_appvd_drugs_gene_set'

In [25]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1205 Out of 1205   

# Create Attribute Library

In [26]:
name = 'dsigdb_fda_appvd_drugs_attribute_set'

In [27]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  1279 Out of 1279   

# Create Gene Similarity Matrix

In [28]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [29]:
gene_similarity_matix.head()

Unnamed: 0,CSNK1G3,DRD5,KCNC3,APAF1,RORC,KCNH1,PIP4K2C,MERTK,PTGER2,TBXAS1,...,KCNB2,NR1I3,CLCN2,NR5A1,SELE,P2RX4,DRD2,YWHAG,MELK,ALPI
,,,,,,,,,,,,,,,,,,,,,
CSNK1G3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
DRD5,0.0,1.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09434,0.0,0.0,0.0
KCNC3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
APAF1,0.0,0.030303,0.0,1.0,0.057692,0.0,0.0,0.0,0.0,0.025641,...,0.0,0.0,0.0,0.022727,0.04,0.0,0.056911,0.076923,0.0,0.041667
RORC,0.0,0.0,0.0,0.057692,1.0,0.0,0.0,0.0,0.042553,0.068182,...,0.0,0.029412,0.0,0.106383,0.0,0.0,0.014815,0.029412,0.0,0.0


# Save Gene Similarity Matrix

In [30]:
# Tag the file name with the current year and month (YYYY_MM).
# The matrix is written as a gzip stream, so the file carries a '.gz'
# extension; the former '.zip' name did not match the gzip content.
filename = path + 'dsigdb_fda_appvd_drugs_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [31]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [32]:
attribute_similarity_matix.head()

Unnamed: 0,GALANTHAMINE,CYCLIZINE,SULFANILAMIDE,ETHOTOIN,PREDNISOLONE,LAPATINIB DITOSYLATE HYDRATE,RETINOL,CARMUSTINE,EMTRICITABINE,LAPATINIB,...,DESVENLAFAXINE,METHOTREXATE,PARICALCITOL,LAMOTRIGINE,TINIDAZOLE,ADAPALENE,GUANFACINE HYDROCHLORIDE,GABAPENTIN,AMLODIPINE,TOLMETIN SODIUM
,,,,,,,,,,,,,,,,,,,,,
GALANTHAMINE,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CYCLIZINE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,...,0.0,0.022727,0.0,0.034483,0.0,0.0,0.055556,0.0,0.086957,0.0
SULFANILAMIDE,0.0,0.0,1.0,0.0,0.0,0.0,0.032258,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0
ETHOTOIN,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.625,0.0,0.0,0.0,0.0,0.0,0.0
PREDNISOLONE,0.0,0.0,0.0,0.0,1.0,0.0,0.130435,0.066667,0.0,0.041667,...,0.0,0.027027,0.0,0.0,0.0,0.125,0.090909,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [33]:
# Tag the file name with the current year and month (YYYY_MM).
# The matrix is written as a gzip stream, so the file carries a '.gz'
# extension; the former '.zip' name did not match the gzip content.
filename = path + 'dsigdb_fda_appvd_drugs_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Heatmap (clustergrammer) of Similarity Matrix

In [34]:
# net.load_df(attribute_similarity_matix.iloc[0:100,0:100].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

# Create Gene List

In [35]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  1279 Out of 1279   

In [36]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,CSNK1G3,1456
1,DRD5,1816
2,KCNC3,3748
3,APAF1,317
4,RORC,6097


In [37]:
gene_list.shape

(1279, 2)

# Save Gene List

In [38]:
# Tag the file name with the current year and month (YYYY_MM).
# The table is written as a gzip stream, so the file carries a '.gz'
# extension; the former '.zip' name did not match the gzip content.
filename = path + 'dsigdb_fda_appvd_drugs_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
# index=False: the index is only a positional counter; GeneSym and GeneID are
# ordinary columns, so nothing is lost by omitting it.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

### Create Metadata Table

In [39]:
data.head()

Unnamed: 0,Gene,Drug,Type,Source
0,ABHD5,citric acid,IC50=3.545(uM),D1 PubChem
1,PLIN5,citric acid,IC50=3.545(uM),D1 PubChem
2,PLIN1,citric acid,IC50=3.708(uM),D1 PubChem
5,ALDH1A1,citric acid,Potency=12.5893(uM),D1 PubChem
6,NFE2L2,citric acid,Potency=54.4827(uM),D1 PubChem


In [40]:
temp = data[['Drug', 'Type', 'Source']].copy()

In [41]:
# Upper-case the drug names so they can be looked up with the upper-cased
# column labels of binary_matrix.
lst = list(map(lambda drug: drug.upper(), temp['Drug']))

temp['Drug'] = lst

In [42]:
temp.set_index('Drug', inplace=True)

In [43]:
# Build a one-column metadata table (Source) with one row per drug column of
# the binary matrix.
source = []

metaData = pd.DataFrame(columns=['Source'], index=binary_matrix.columns)

for drug in metaData.index:
    # .loc yields a bare string when the drug has a single row in `temp`
    # and a Series when it has several; normalise both to an iterable.
    drug_sources = temp.loc[drug, 'Source']
    if isinstance(drug_sources, str):
        drug_sources = [drug_sources]
    # Keep the second space-separated token of each entry
    # (e.g. 'D1 PubChem' -> 'PubChem') and collapse duplicates.
    # sorted() makes the ':'-joined label reproducible: plain set iteration
    # order for strings can differ from one interpreter run to the next.
    labels = sorted({entry.split(' ')[1] for entry in drug_sources})
    source.append(':'.join(labels))

metaData['Source'] = source

metaData.head()

Unnamed: 0,Source
,
GALANTHAMINE,PubChem
CYCLIZINE,PubChem
SULFANILAMIDE,PubChem
ETHOTOIN,PubChem
PREDNISOLONE,PubChem


In [44]:
attribute_list = uf.createAttributeList(binary_matrix, metaData)

Progeres: 100%  1205 Out of 1205   

In [45]:
attribute_list.head()

Unnamed: 0_level_0,Source
Attributes,Unnamed: 1_level_1
GALANTHAMINE,PubChem
CYCLIZINE,PubChem
SULFANILAMIDE,PubChem
ETHOTOIN,PubChem
PREDNISOLONE,PubChem


In [46]:
attribute_list.shape

(1205, 1)

# Save Attribute List

In [47]:
# Tag the file name with the current year and month (YYYY_MM).
# The table is written as a gzip stream, so the file carries a '.gz'
# extension; the former '.zip' name did not match the gzip content.
filename = path + 'dsigdb_fda_appvd_drugs_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
# The attribute (drug) names live in the index of attribute_list (its only
# column is Source), so the index must be written; with index=False the saved
# file contained the Source column alone and no attribute names.
attribute_list.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [48]:
name = 'dsigdb_fda_appvd_drugs_attribute_edge_list'

In [49]:
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  1205 Out of 1205   

 The number of statisticaly relevent gene-attribute associations is: 12571
