In [None]:
!unzip 'drive/MyDrive/DrugOrchestra/repurposing_hub.zip' -d 'drive/MyDrive/DrugOrchestra/'

Archive:  drive/MyDrive/DrugOrchestra/repurposing_hub.zip
   creating: drive/MyDrive/DrugOrchestra/repurposing_hub/
  inflating: drive/MyDrive/DrugOrchestra/__MACOSX/._repurposing_hub  
  inflating: drive/MyDrive/DrugOrchestra/repurposing_hub/data.npy  
  inflating: drive/MyDrive/DrugOrchestra/__MACOSX/repurposing_hub/._data.npy  


In [1]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Collecting torch-scatter
  Downloading https://pytorch-geometric.com/whl/torch-1.9.0%2Bcpu/torch_scatter-2.0.8-cp37-cp37m-linux_x86_64.whl (300 kB)
[K     |████████████████████████████████| 300 kB 2.1 MB/s 
[?25hCollecting torch-sparse
  Downloading https://pytorch-geometric.com/whl/torch-1.9.0%2Bcpu/torch_sparse-0.6.11-cp37-cp37m-linux_x86_64.whl (592 kB)
[K     |████████████████████████████████| 592 kB 4.1 MB/s 
[?25hCollecting torch-cluster
  Downloading https://pytorch-geometric.com/whl/torch-1.9.0%2Bcpu/torch_cluster-1.5.9-cp37-cp37m-linux_x86_64.whl (337 kB)
[K     |████████████████████████████████| 337 kB 47.5 MB/s 
[?25hCollecting torch-spline-conv
  Downloading https://pytorch-geometric.com/whl/torch-1.9.0%2Bcpu/torch_spline_conv-1.2.1-cp37-cp37m-linux_x86_64.whl (138 kB)
[K     |████████████████████████████████| 138 kB 33.4 MB/s 
[?25hCollecting torch-geometric
  Downloading torch_geometric-1.7.2

In [2]:
import numpy as np
import torch
from torch_geometric.data import Data, DataLoader
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.decomposition import PCA

In [3]:
# Load the three pre-computed feature matrices from Drive.
# NOTE(review): `pdx` is loaded here but not used by any later cell shown in
# this notebook, and its column count differs from the other two.
repur = np.load('drive/MyDrive/DrugOrchestra/repurposing_hub/data.npy')
db = np.load('drive/MyDrive/DrugOrchestra/drugbank/data.npy')
pdx = np.load('drive/MyDrive/DrugOrchestra/pdx/data.npy')
print(repur.shape, db.shape, pdx.shape, sep='\n')

(119878, 1101)
(234528, 1101)
(1634, 601)


In [5]:
# Stack the repurposing-hub rows on top of the DrugBank rows (row-wise concat).
# NOTE(review): the name `data` is rebound to a torch_geometric `Data` object
# further down, so this cell must be re-run before the slicing cell below.
data = np.concatenate([repur, db], axis=0)
print(data.shape)

(354406, 1101)


In [6]:
# Column layout used here: 0-299 -> drug features, 300-599 -> target features,
# last column -> binary interaction label. The columns in between are not read.
drugs, targets = data[:, :300], data[:, 300:600]
labels = data[:, -1]
print(drugs.shape, labels.shape, targets.shape, sep='\n')

(354406, 300)
(354406,)
(354406, 300)


In [8]:
# De-duplicate the feature rows to count the distinct drugs and targets.
unique_drugs = np.unique(drugs, axis=0)
unique_targets = np.unique(targets, axis=0)
print(unique_drugs.shape, unique_targets.shape, sep='\n')

(9976, 300)
(3284, 300)


In [9]:
def get_ids(arr, start = 0):
  """Assign consecutive integer ids to the distinct rows of `arr`.

  Rows are compared by value (each row is turned into a tuple) and ids are
  handed out in order of first appearance, beginning at `start`.

  Args:
    arr: 2-D array-like (NumPy array or any iterable of row sequences).
    start: id given to the first distinct row.

  Returns:
    A tuple `(next_id, node_to_id, id_to_node)` where `next_id` is one past
    the last id assigned (equal to `start` for empty input), `node_to_id`
    maps row tuple -> id and `id_to_node` is the inverse mapping.
  """
  node_to_id = {}
  id_to_node = {}
  next_id = start  # avoids shadowing the builtin `id`
  for row in arr:
    key = tuple(row)
    if key not in node_to_id:
      node_to_id[key] = next_id
      id_to_node[next_id] = key
      next_id += 1
  return next_id, node_to_id, id_to_node

In [10]:
# Drugs receive ids [0, count_drugs); targets continue from count_drugs so both
# node types share a single id space. Note that `count_target` is therefore the
# total number of nodes (one past the last target id), not the number of targets.
count_drugs, drug_to_id, id_to_drug = get_ids(drugs, 0)
count_target, target_to_id, id_to_target = get_ids(targets, start=count_drugs)
print(count_drugs, count_target, sep='\n')

9976
13260


In [11]:
# Node feature matrix: row k holds the feature vector of the node with id k
# (drug ids first, then target ids).
drug_rows = [id_to_drug[node_id] for node_id in range(count_drugs)]
target_rows = [id_to_target[node_id] for node_id in range(count_drugs, count_target)]
nodes = np.array(drug_rows + target_rows)
print(nodes.shape)
print(nodes)

# Earlier experiment, kept for reference (not executed):
# nodes = np.concatenate((drugs, targets))
# np.random.shuffle(nodes)
# print(nodes.shape)

(13260, 300)
[[-4.70132500e-02  5.20979580e-02 -7.85940400e-02 ...  2.22580360e-01
   6.03722260e-02 -2.80333940e-01]
 [-2.71171630e-03 -4.28348370e-02 -7.18380300e-02 ...  3.66160410e-03
  -1.33293520e-01  4.93906440e-02]
 [ 2.87913590e-02  1.10205024e-01  8.70622200e-02 ...  1.13011060e-01
  -1.54672890e-02 -7.15056600e-02]
 ...
 [ 4.89370000e+00 -1.24020000e-02 -6.44110000e-01 ... -3.81460000e-02
   8.53120000e-02 -8.35300000e-02]
 [ 4.87750000e+00 -9.08450000e-01 -5.72540000e-01 ...  3.60460000e-01
  -1.37750000e-01  1.83400000e-01]
 [ 4.88430000e+00 -3.85360000e-01  2.63690000e-01 ... -1.26360000e-02
  -2.33010000e-02  1.12220000e-01]]


In [12]:
# Class balance of the (drug, target) pairs: label == 1 counts as positive,
# every other value as negative. Vectorised instead of a Python loop over rows.
pos_edges = int(np.count_nonzero(labels == 1))
neg_edges = int(drugs.shape[0]) - pos_edges
print(pos_edges,neg_edges)

32230 322176


In [13]:
# Number of leading positive / negative edges (in file order) used for training;
# whatever remains of each class becomes the test split.
# NOTE(review): the split is positional, not shuffled, so the test edges are the
# last rows of the concatenated dataset — confirm that this is intended.
N_TRAIN_POS = 30000
N_TRAIN_NEG = 300000

# Translate every (drug, target) row into a pair of node ids, looking each id
# up once. reshape(-1, 2) keeps the array 2-D even if there are no rows.
edge_index = np.array(
    [
        [drug_to_id[tuple(drug_row)], target_to_id[tuple(target_row)]]
        for drug_row, target_row in zip(drugs, targets)
    ],
    dtype=np.int64,
).reshape(-1, 2)

# Split by label with a boolean mask; row order is preserved within each class.
is_positive = labels == 1
pos_edge_index = edge_index[is_positive]
neg_edge_index = edge_index[~is_positive]

pos_edge_index_train = pos_edge_index[:N_TRAIN_POS, :]
pos_edge_index_test = pos_edge_index[N_TRAIN_POS:, :]
neg_edge_index_train = neg_edge_index[:N_TRAIN_NEG, :]
neg_edge_index_test = neg_edge_index[N_TRAIN_NEG:, :]
print(edge_index.shape)
print(pos_edge_index_train.shape)
print(neg_edge_index_train.shape)

(354406, 2)
(30000, 2)
(300000, 2)


In [14]:
# Convert every NumPy array built above into a torch tensor.
# Each name is rebound to its tensor version, so after this cell `nodes`,
# `edge_index`, etc. refer to tensors rather than arrays.
# The edge tensors are still laid out as (num_edges, 2) at this point.
nodes = torch.tensor(nodes)
edge_index = torch.tensor(edge_index)
pos_edge_index = torch.tensor(pos_edge_index)
pos_edge_index_train = torch.tensor(pos_edge_index_train)
pos_edge_index_test = torch.tensor(pos_edge_index_test)
neg_edge_index = torch.tensor(neg_edge_index)
neg_edge_index_train = torch.tensor(neg_edge_index_train)
neg_edge_index_test = torch.tensor(neg_edge_index_test)

In [15]:
# Bundle node features and all edge lists into one PyG `Data` object.
# Every edge tensor is transposed from (num_edges, 2) to the (2, num_edges)
# layout and made contiguous before being stored.
# NOTE(review): this rebinds `data`, which earlier held the raw NumPy matrix.
data = Data(
    x=nodes,
    edge_index=edge_index.t().contiguous(),
    pos_edge_index=pos_edge_index.t().contiguous(),
    pos_edge_index_train=pos_edge_index_train.t().contiguous(),
    pos_edge_index_test=pos_edge_index_test.t().contiguous(),
    neg_edge_index=neg_edge_index.t().contiguous(),
    neg_edge_index_train=neg_edge_index_train.t().contiguous(),
    neg_edge_index_test=neg_edge_index_test.t().contiguous(),
)
print(data)

Data(edge_index=[2, 354406], neg_edge_index=[2, 322176], neg_edge_index_test=[2, 22176], neg_edge_index_train=[2, 300000], pos_edge_index=[2, 32230], pos_edge_index_test=[2, 2230], pos_edge_index_train=[2, 30000], x=[13260, 300])


In [32]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder with a small linear decoder for link prediction.

    `encode` produces one embedding per node; `decode` scores candidate edges
    by concatenating the two endpoint embeddings and passing them through
    `fc1` and `fc2`.

    Args:
        in_channels: size of the input node features. Defaults to
            `data.num_features` of the notebook-level `data` object.
        hidden_channels: output size of the first GCN layer.
        out_channels: embedding size produced by the second GCN layer.
        decoder_hidden: width of the decoder's intermediate layer.
    """

    def __init__(self, in_channels=None, hidden_channels=128, out_channels=64, decoder_hidden=10):
        super(Net, self).__init__()
        if in_channels is None:
            in_channels = data.num_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        # The decoder consumes two concatenated node embeddings per edge.
        self.fc1 = Linear(2 * out_channels, decoder_hidden)
        self.fc2 = Linear(decoder_hidden, 1)

    def encode(self, x=None, edge_index=None):
        """Return node embeddings.

        Args:
            x: node feature matrix; defaults to `data.x`.
            edge_index: message-passing edges; defaults to
                `data.pos_edge_index_train` (training positives only).
        """
        if x is None:
            x = data.x
        if edge_index is None:
            edge_index = data.pos_edge_index_train
        h = self.conv1(x, edge_index) # convolution 1
        h = h.relu()
        return self.conv2(h, edge_index) # convolution 2

    def decode(self, z, pos_edge_index, neg_edge_index): # only pos and neg edges
        """Return one logit per candidate edge, positives first then negatives.

        Args:
            z: node embeddings as returned by `encode`.
            pos_edge_index: (2, P) tensor of positive edges.
            neg_edge_index: (2, N) tensor of negative edges.

        Returns:
            1-D tensor of length P + N.
        """
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
        # Concatenate (not dot-product) the source and destination embeddings.
        pair_features = torch.cat([z[edge_index[0]], z[edge_index[1]]], dim=-1)
        # NOTE(review): there is no non-linearity between fc1 and fc2, so the two
        # layers compose into a single affine map — confirm whether an
        # activation was intended here.
        logits = self.fc2(self.fc1(pair_features))
        # Drop only the trailing size-1 dim so a single-edge batch stays 1-D and
        # still matches the shape of the label vector.
        return logits.squeeze(-1)

    def decode_all(self, z):
        """Return a (2, K) tensor of node pairs whose embedding inner product is > 0.

        NOTE(review): this uses an inner-product score, not the fc1/fc2 decoder
        used by `decode`; the two are not interchangeable.
        """
        prob_adj = z @ z.t() # get adj NxN
        return (prob_adj > 0).nonzero(as_tuple=False).t() # get predicted edge_list

# Seed torch's RNG before building the model so that weight initialisation
# (and therefore the training curve) is reproducible across kernel restarts.
torch.manual_seed(0)

# Run the model in double precision.
model = Net().double()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
print(model)

Net(
  (conv1): GCNConv(300, 128)
  (conv2): GCNConv(128, 64)
  (fc1): Linear(in_features=128, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=1, bias=True)
)


In [18]:
def get_link_labels(pos_edge_index, neg_edge_index, dtype=torch.float, device=None):
    """Build the binary target vector for a batch of candidate edges.

    The result is `[1, 1, ..., 1, 0, 0, ..., 0]`: the number of ones equals the
    number of columns of `pos_edge_index` and the number of zeros equals the
    number of columns of `neg_edge_index`.

    Args:
        pos_edge_index: (2, P) tensor of positive edges.
        neg_edge_index: (2, N) tensor of negative edges.
        dtype: dtype of the returned labels (default `torch.float`).
        device: device of the returned labels; defaults to the device of
            `pos_edge_index` so the labels live next to the edges they describe.

    Returns:
        1-D tensor of length P + N.
    """
    if device is None:
        device = pos_edge_index.device
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)
    return torch.cat([
        torch.ones(num_pos, dtype=dtype, device=device),
        torch.zeros(num_neg, dtype=dtype, device=device),
    ])

In [19]:
def train():
    """Run one full-batch optimisation step on the training edges.

    Uses the notebook-level `model`, `optimizer` and `data`. Node embeddings are
    computed with `model.encode()`, all training positives and negatives are
    scored with `model.decode(...)`, and the binary cross-entropy (on logits)
    is minimised.

    Returns:
        The loss of this step as a 0-d tensor detached from the autograd graph.
    """
    model.train()
    optimizer.zero_grad()

    z = model.encode() #encode
    link_logits = model.decode(z, data.pos_edge_index_train, data.neg_edge_index_train) # decode
    link_labels = get_link_labels(data.pos_edge_index_train, data.neg_edge_index_train)
    # Match the labels to the logits' dtype (the model runs in double precision
    # while the labels default to float32).
    link_labels = link_labels.to(link_logits.dtype)

    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    # Detach so the caller's reference does not keep the whole graph alive.
    return loss.detach()

In [20]:
@torch.no_grad()
def test():
    """Evaluate the current model on the train and test edge splits.

    Uses the notebook-level `model` and `data`. Embeddings are computed once
    (with `model.encode()`) and reused for both splits.

    Returns:
        `(train_perf, test_perf)`: ROC-AUC on the training edges and on the
        held-out test edges.
    """
    model.eval()

    # encode() takes no split-specific input, so one pass serves both splits.
    z = model.encode()

    def _roc_auc(pos_edge_index, neg_edge_index):
        # Score the given edges and compare against their 1/0 labels.
        link_logits = model.decode(z, pos_edge_index, neg_edge_index)
        link_probs = link_logits.sigmoid() # apply sigmoid
        link_labels = get_link_labels(pos_edge_index, neg_edge_index)
        return roc_auc_score(link_labels.cpu(), link_probs.cpu()) #compute roc_auc score

    train_perf = _roc_auc(data.pos_edge_index_train, data.neg_edge_index_train)
    test_perf = _roc_auc(data.pos_edge_index_test, data.neg_edge_index_test)
    return train_perf, test_perf

In [33]:
# Full-batch training: one optimisation step per epoch, followed by a ROC-AUC
# evaluation on both the training and the held-out edges.
# NOTE(review): `best_perf` is initialised but never updated in this cell.
best_perf = 0
log = 'Epoch: {:03d}, Loss: {:.4f}, Train: {:.4f}, Test: {:.4f}'
for epoch in range(1, 1001):
    train_loss = train()
    train_perf, test_perf = test()
    print(log.format(epoch, train_loss, train_perf, test_perf))

Epoch: 001, Loss: 0.5894, Train: 0.2133, Test: 0.2138
Epoch: 002, Loss: 0.4626, Train: 0.3964, Test: 0.3459
Epoch: 003, Loss: 0.3799, Train: 0.7243, Test: 0.6993
Epoch: 004, Loss: 0.3677, Train: 0.6111, Test: 0.5915
Epoch: 005, Loss: 0.3281, Train: 0.6408, Test: 0.6223
Epoch: 006, Loss: 0.3268, Train: 0.7325, Test: 0.7052
Epoch: 007, Loss: 0.3000, Train: 0.7554, Test: 0.7231
Epoch: 008, Loss: 0.2982, Train: 0.7515, Test: 0.7223
Epoch: 009, Loss: 0.2653, Train: 0.7457, Test: 0.7227
Epoch: 010, Loss: 0.2716, Train: 0.7740, Test: 0.7527
Epoch: 011, Loss: 0.2508, Train: 0.8071, Test: 0.7897
Epoch: 012, Loss: 0.2482, Train: 0.8216, Test: 0.8079
Epoch: 013, Loss: 0.2540, Train: 0.8188, Test: 0.8085
Epoch: 014, Loss: 0.2439, Train: 0.8014, Test: 0.7976
Epoch: 015, Loss: 0.2487, Train: 0.8033, Test: 0.7969
Epoch: 016, Loss: 0.2496, Train: 0.8234, Test: 0.8056
Epoch: 017, Loss: 0.2411, Train: 0.8341, Test: 0.8068
Epoch: 018, Loss: 0.2428, Train: 0.8352, Test: 0.8014
Epoch: 019, Loss: 0.2426, Tr

KeyboardInterrupt: ignored

In [34]:
# Persist only the model's parameters (its state_dict, not the module object)
# to Drive. These are whatever weights the model holds when this cell runs,
# i.e. the state at the point the training loop above stopped.
torch.save(model.state_dict(), 'drive/MyDrive/DrugOrchestra/model.pth')