## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import time
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from scipy import stats, integrate
from sklearn.metrics import pairwise_distances
import networkx as nx

## Load FPS and Resources Files

In [2]:
# Load every fingerprint table under 'RDKit_fps/' into a dict keyed by
# fingerprint name. The name is the filename minus its last 24 characters.
fps = []
files = []

for filename in os.listdir('RDKit_fps/'):
    fp_name = filename[:-24]
    if not fp_name:
        # Entries of 24 characters or fewer leave no name behind (presumably
        # hidden/system entries -- confirm). Skipping them here replaces the
        # former `del fpsfiles_dict['']` / `fps.remove('')` clean-up, which
        # raised when no such entry existed and left stray '' names behind
        # when more than one did.
        continue
    df = pd.read_table('RDKit_fps/' + filename)
    # Use string IDs so the index compares equal to string labels elsewhere.
    df[df.columns[0]] = df[df.columns[0]].astype(str)
    fps.append(fp_name)
    files.append(df.set_index(df.columns[0]))

fpsfiles_dict = dict(zip(fps, files))

In [51]:
# Rebind `fps` to an alphabetically ordered copy (os.listdir gives no order
# guarantee), then preview the first two fingerprint names.
fps = list(fps)
fps.sort()
fps[:2]

['AtomPair', 'Avalon']

In [4]:
# Load every resource table under 'All_bmat/' into a dict keyed by resource
# name. The name is the filename minus its last 12 characters.
names = []
files = []

for filename in os.listdir('All_bmat/'):
    resource_name = filename[:-12]
    if not resource_name:
        # Entries of 12 characters or fewer leave no name behind (presumably
        # hidden/system entries -- confirm). Skipping them here replaces the
        # former `del namefiles_dict['']` / `names.remove('')` clean-up, which
        # raised when no such entry existed and left stray '' names behind
        # when more than one did.
        continue
    df = pd.read_table('All_bmat/' + filename)
    names.append(resource_name)
    files.append(df.set_index(df.columns[0]))

namefiles_dict = dict(zip(names, files))

In [52]:
# Preview the first two resource names collected from the 'All_bmat/' filenames.
names[:2]

['DrugBank_Targets', 'PharmagKB_SE']

In [24]:
name_pcid = pd.read_csv('Input/L1000_Drugs_metadata.csv')

# Lookup from each 'pubchem_cid' value to its 'pert_iname'. When a
# 'pubchem_cid' occurs on several rows, the last row wins.
name_pcid_dict = {
    cid: drug_name
    for cid, drug_name in zip(name_pcid['pubchem_cid'], name_pcid['pert_iname'])
}
len(name_pcid_dict)

20163

## Functions for creating the network

In [6]:
def pairwise_dis(df, metric):
    '''Return the pairwise distance matrix between the rows of `df`.

    Thin wrapper around `sklearn.metrics.pairwise_distances`; `metric` is
    forwarded unchanged.
    '''
    return metrics.pairwise_distances(df, metric=metric)

In [9]:
def compute_adjcency_mat(X, metric='euclidean'):
    '''Turn a samples-by-features matrix into a similarity (adjacency) matrix.

    Pairwise distances between the rows of X are rescaled by the largest
    distance and flipped, so that the closest pairs score near 1 and the
    farthest pair scores 0. Then 1 is subtracted from every diagonal entry,
    which removes the self-similarity of 1.

    Parameters
    ----------
    X : samples-by-features matrix accepted by sklearn's `pairwise_distances`.
    metric : distance metric name forwarded to `pairwise_distances`.

    Returns
    -------
    Square numpy array with one row/column per sample.
    '''
    # Named `dist_mat` so the scipy `pdist` import is not shadowed.
    dist_mat = pairwise_distances(X, metric=metric)
    max_dist = dist_mat.max()
    if max_dist == 0:
        # Every sample coincides (or there is a single sample): dividing by
        # the maximum would give 0/0 = NaN, so treat all pairs as maximally
        # similar instead.
        adj_mat = np.ones_like(dist_mat, dtype=float)
    else:
        adj_mat = 1 - dist_mat / max_dist
    # remove 1's on the diagonal
    adj_mat -= np.eye(dist_mat.shape[0])
    return adj_mat

def create_graph_by_threshold(adj_mat, percentile):
    '''Convert an adjacency matrix to a Graph, keeping only the strongest links.

    The threshold is the given percentile of the pairwise similarities, and
    every entry below it is zeroed before the graph is built.

    Parameters
    ----------
    adj_mat : square numpy array of pairwise similarities (not modified).
    percentile : percentile in [0, 100] passed to `np.percentile`; e.g. 99.7
        keeps roughly the top 0.3% of pairs.

    Returns
    -------
    networkx Graph with one node per row of `adj_mat` (labelled 0..n-1) and
    an edge for every non-zero entry that survives the threshold.
    '''
    n_nodes = adj_mat.shape[0]
    # Strictly-upper triangle: each unordered pair once, diagonal excluded.
    # (The previous `np.tril_indices(n, 1)` also sampled the diagonal and the
    # first superdiagonal, which skewed the percentile.)
    pair_idx = np.triu_indices(n_nodes, k=1)
    adj_mat_ = adj_mat.copy()
    if pair_idx[0].size > 0:
        threshold = np.percentile(adj_mat[pair_idx], percentile)
        adj_mat_[adj_mat < threshold] = 0
    # `from_numpy_matrix` was removed in networkx 3.0; prefer its replacement
    # `from_numpy_array` and fall back only on releases that lack it.
    to_graph = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
    return to_graph(adj_mat_)

## Create a network for each fingerprint with each resource

In [49]:
count = 0  # not used in this cell; kept in case other cells read it

# Percentile cut-off applied to both similarity matrices.
percentile = 99.7 # try adjust this param

# Make sure the output directory exists before any network is written.
os.makedirs('Networks', exist_ok=True)

for name in names:
    # Dataset2 transposed so that rows are drugs (e.g. drugs by genes);
    # it does not depend on the fingerprint, so transpose it once per resource.
    resource_by_drug = namefiles_dict[name].T

    for fp in fps:
        X1 = fpsfiles_dict[fp]  # Dataset1, e.g. drugs by fingerprints
        X2 = resource_by_drug

        # Drugs present in both datasets AND in the ID -> name lookup, in a
        # fixed order so rows of X1 and X2 line up. Built in one pass: the
        # previous remove-while-iterating loop skipped the element after each
        # removal, so some unmapped drugs were never filtered out.
        # NOTE(review): X1's index is cast to str when loaded; confirm that the
        # keys of name_pcid_dict have a matching type, otherwise nothing passes.
        shared_drugs = sorted(
            drug for drug in set(X1.index) & set(X2.index)
            if drug in name_pcid_dict
        )
        if len(shared_drugs) < 2:
            # No drug pair to compare, so no network can be built.
            print('Skipping', fp, name, '- fewer than two shared drugs')
            continue
        X1 = X1.loc[shared_drugs]
        X2 = X2.loc[shared_drugs]

        # distance metric
        A1 = compute_adjcency_mat(X1, 'manhattan')
        A2 = compute_adjcency_mat(X2, 'cosine')

        # Convert to Graphs
        G1 = create_graph_by_threshold(A1, percentile)
        G2 = create_graph_by_threshold(A2, percentile)

        # Keep only edges present in both graphs, then drop isolated nodes.
        G12 = nx.intersection(G1, G2)
        nodes_wo_edges = [n for n, k in G12.degree() if k == 0]
        G12.remove_nodes_from(nodes_wo_edges)

        # Label nodes: row position -> drug ID, then drug ID -> drug name.
        G12 = nx.relabel_nodes(G12, dict(zip(range(len(shared_drugs)), shared_drugs)))
        G12 = nx.relabel_nodes(G12, name_pcid_dict)

        # write to a .gml file for visualization in Cytoscape
        filename = 'Networks/' + fp + '_' + name + '_intersection_network.gml'
        nx.write_gml(G12, filename)

## Create networks with just L1000 signature data

In [None]:
count = 0  # not used in this cell; kept in case other cells read it

# Percentile cut-off applied to both similarity matrices.
percentile = 90 # try adjust this param

# Make sure the output directory exists before any network is written.
os.makedirs('Networks', exist_ok=True)

# Dataset2 transposed so that rows are drugs (e.g. drugs by genes); it is the
# same for every fingerprint, so load and transpose it once.
l1000_by_drug = namefiles_dict['L1000_sig_new'].T

for fp in fps:

    X1 = fpsfiles_dict[fp]  # Dataset1, e.g. drugs by fingerprints
    X2 = l1000_by_drug

    # Drugs present in both datasets AND in the ID -> name lookup, in a fixed
    # order so rows of X1 and X2 line up. Built in one pass: the previous
    # remove-while-iterating loop skipped the element after each removal, so
    # some unmapped drugs were never filtered out.
    # NOTE(review): X1's index is cast to str when loaded; confirm that the
    # keys of name_pcid_dict have a matching type, otherwise nothing passes.
    shared_drugs = sorted(
        drug for drug in set(X1.index) & set(X2.index)
        if drug in name_pcid_dict
    )
    if len(shared_drugs) < 2:
        # No drug pair to compare, so no network can be built.
        print('Skipping', fp, 'L1000_sig_new', '- fewer than two shared drugs')
        continue
    X1 = X1.loc[shared_drugs]
    X2 = X2.loc[shared_drugs]

    # distance metric
    A1 = compute_adjcency_mat(X1, 'manhattan')
    A2 = compute_adjcency_mat(X2, 'cosine')

    # Convert to Graphs
    G1 = create_graph_by_threshold(A1, percentile)
    G2 = create_graph_by_threshold(A2, percentile)

    # Keep only edges present in both graphs, then drop isolated nodes.
    G12 = nx.intersection(G1, G2)
    nodes_wo_edges = [n for n, k in G12.degree() if k == 0]
    G12.remove_nodes_from(nodes_wo_edges)
    print(G12.number_of_nodes(), G12.number_of_edges())

    # label the nodes with drug names/ID: row position -> drug ID -> drug name
    G12 = nx.relabel_nodes(G12, dict(zip(range(len(shared_drugs)), shared_drugs)))
    G12 = nx.relabel_nodes(G12, name_pcid_dict)

    # write to a .gml file for visualization in Cytoscape
    filename = 'Networks/' + fp + '_' + 'L1000_sig_new' + '_intersection_network.gml'
    nx.write_gml(G12, filename)
