In [1]:
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import pdist, squareform, cdist
import numpy as np
import scipy, torch, phate, scprep
import torch_geometric.transforms as T
from sklearn import linear_model, metrics
import networkx as nx
import pandas as pd
from torch_geometric.data import Data
import os
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import scprep
from scipy.stats import spearmanr

In [3]:
np.random.seed(1234)
torch.manual_seed(1234)

interactions = pd.read_csv('../data/omnipath_curated_interactions.csv', index_col=0)

In [5]:
G = nx.from_pandas_edgelist(interactions, source='source_genesymbol', target='target_genesymbol', create_using=nx.DiGraph)
coo = scipy.sparse.coo_matrix(scipy.sparse.coo_matrix(nx.adjacency_matrix(G)))
values = coo.data
indices = np.vstack((coo.row, coo.col))

i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = coo.shape

In [6]:
data = Data(x=torch.Tensor(np.eye(G.number_of_nodes())), 
            edge_index=torch.sparse_coo_tensor(i, v, torch.Size(shape)).to_dense().nonzero().t().contiguous())

In [37]:
embedding_files = [x for x in os.listdir('results/MagNet') if x.endswith('existence_omnipath_embedding.npy')]
embeddings = dict(zip([x.split('_embedding.npy')[0] for x in embedding_files], [np.load(f'results/MagNet/{x}') for x in embedding_files]))

In [38]:
for k,v in embeddings.items():
    print (k, v.shape)

262_existence_omnipath (8909, 16)
194_existence_omnipath (8909, 16)
104_existence_omnipath (8909, 16)
4_existence_omnipath (8909, 16)
59_existence_omnipath (8909, 16)
82_existence_omnipath (8909, 16)
108_existence_omnipath (8909, 16)
270_existence_omnipath (8909, 16)
13_existence_omnipath (8909, 16)
267_existence_omnipath (8909, 16)
284_existence_omnipath (8909, 16)
276_existence_omnipath (8909, 16)
277_existence_omnipath (8909, 16)
272_existence_omnipath (8909, 16)
220_existence_omnipath (8909, 16)
282_existence_omnipath (8909, 16)
144_existence_omnipath (8909, 16)
105_existence_omnipath (8909, 16)
88_existence_omnipath (8909, 16)
223_existence_omnipath (8909, 16)
187_existence_omnipath (8909, 16)
252_existence_omnipath (8909, 16)
265_existence_omnipath (8909, 16)
230_existence_omnipath (8909, 16)
121_existence_omnipath (8909, 16)
16_existence_omnipath (8909, 16)
53_existence_omnipath (8909, 16)
213_existence_omnipath (8909, 16)
162_existence_omnipath (8909, 16)
227_existence_omnipath

## Random Link Split

In [40]:
transform = T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=False, add_negative_train_samples=True, )
train, val, test = transform(data, )

In [41]:
aurocs_val = {}
aurocs_test = {}

In [42]:
for method, embedding in {**embeddings}.items():
    X_train = np.hstack((embedding[train.edge_label_index[0]], embedding[train.edge_label_index[1]]))
    y_train = train.edge_label.detach().cpu().numpy()

    X_val = np.hstack((embedding[val.edge_label_index[0]], embedding[val.edge_label_index[1]]))
    y_val = val.edge_label.detach().cpu().numpy()
    
    X_test = np.hstack((embedding[test.edge_label_index[0]], embedding[test.edge_label_index[1]]))
    y_test = test.edge_label.detach().cpu().numpy()
    
    clf = linear_model.RidgeClassifier(random_state=42)
    clf.fit(X_train, y_train)
    
    y_score_val = clf.decision_function(X_val)
    y_score_test = clf.decision_function(X_test)

    aurocs_val[method] = metrics.roc_auc_score(y_val, y_score_val)
    aurocs_test[method] = metrics.roc_auc_score(y_test, y_score_test)

In [None]:
aurocs_val = dict(sorted(aurocs_val.items(), key=lambda item: item[1], reverse=False) )
best_run = list(aurocs_val.keys())[-1]

In [44]:
print(best_run, aurocs_test[best_run])

78_existence_omnipath 0.9489938603696803


## Link Split based on Directedness of Edge

## First split all positive edges

In [45]:
transform = T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=False, add_negative_train_samples=False, )
train, val, test = transform(data, )

In [46]:
aurocs_val = {}
aurocs_test = {}

In [47]:
A = nx.adj_matrix(G).todense()

Use `adjacency_matrix` instead

  A = nx.adj_matrix(G).todense()


In [48]:
train_id_neg_edges = np.where(A[train.edge_label_index[1], train.edge_label_index[0]] == 0)[1]
val_id_neg_edges = np.where(A[val.edge_label_index[1], val.edge_label_index[0]] == 0)[1]

In [49]:
for method, embedding in {**embeddings}.items():
        
    X_train_pos = np.hstack((embedding[train.edge_label_index[0]], embedding[train.edge_label_index[1]]))
    X_train_neg = np.hstack((embedding[train.edge_label_index[1][train_id_neg_edges]], embedding[train.edge_label_index[0][train_id_neg_edges]]))
    X_train = np.vstack((X_train_pos, X_train_neg))
    y_train = [1]*(X_train_pos.shape[0]) + [0]*(X_train_neg.shape[0])

    X_val_pos = np.hstack((embedding[val.edge_label_index[0]], embedding[val.edge_label_index[1]]))
    X_val_neg = np.hstack((embedding[val.edge_label_index[1][val_id_neg_edges]], embedding[val.edge_label_index[0][val_id_neg_edges]]))
    X_val = np.vstack((X_val_pos, X_val_neg))
    y_val = [1]*(X_val_pos.shape[0]) + [0]*(X_val_neg.shape[0])
    
    X_test_pos = np.hstack((embedding[test.edge_label_index[0]], embedding[test.edge_label_index[1]]))
    X_test_neg = np.hstack((embedding[test.edge_label_index[1]], embedding[test.edge_label_index[0]]))
    X_test = np.vstack((X_test_pos, X_test_neg))
    y_test = [1]*(X_test.shape[0]//2) + [0]*(X_test.shape[0]//2)
    
    clf = linear_model.RidgeClassifier(random_state=42)
    clf.fit(X_train, y_train)
    
    y_score_val = clf.decision_function(X_val)
    y_score_test = clf.decision_function(X_test)
    
    aurocs_val[method] = metrics.roc_auc_score(y_val, y_score_val)
    aurocs_test[method] = metrics.roc_auc_score(y_test, y_score_test)

In [None]:
aurocs_val = dict(sorted(aurocs_val.items(), key=lambda item: item[1], reverse=False) )
best_run = list(aurocs_val.keys())[-1]

In [51]:
print(best_run, aurocs_test[best_run])

46_existence_omnipath 0.7125151182658793
