In [70]:
import omnipath as op
import pandas as pd
import networkx as nx
import numpy as np
import scipy
import torch
import torch_geometric
from torch_geometric.data import Data
import sklearn

## Get OmniPath interactions, keep curated interactions and largest connected component

In [128]:
# Download every OmniPath interaction (with gene symbols), then keep only the rows that
# have a consensus direction and whose reference count is not the literal string 'None'.
interactions = op.interactions.AllInteractions.get(genesymbols=True)
has_consensus_direction = interactions['consensus_direction']
has_references = interactions['n_references'] != 'None'
interactions = interactions[has_consensus_direction & has_references]

In [2]:
# Load the curated interaction table and turn it into a directed gene-gene graph.
curated_csv = '../cell_cell_comm/omnipath_embeddings/data/omnipath_curated_interactions.csv'
df = pd.read_csv(curated_csv, index_col=0)
G = nx.from_pandas_edgelist(
    df=df,
    source='source_genesymbol',
    target='target_genesymbol',
    create_using=nx.DiGraph,
)

# Connected components of the undirected view, ordered largest first.
Gcc = list(nx.connected_components(nx.to_undirected(G)))
Gcc.sort(key=len, reverse=True)

# Keep only the largest component.
largest_component = Gcc[0]
G = G.subgraph(largest_component)

<networkx.classes.digraph.DiGraph at 0x7f329eb631c0>

## Use the edges of a minimum spanning tree (MST) as training edges, then add randomly sampled non-MST edges until the target number of training edges is reached. The remaining edges are used for validation and testing

In [178]:
# Split the edges of G into a spanning-tree backbone (always used for training) and
# the remaining edges (shuffled here, then sliced into val / test / train later).
np.random.seed(0)

# `nx.minimum_spanning_edges` is used because `tree` is never imported in this notebook,
# so `tree.minimum_spanning_edges` raises NameError on a fresh kernel.
# data=False makes it yield plain (u, v) pairs.
mstedges = list(nx.minimum_spanning_edges(nx.to_undirected(G), data=False))

# The spanning tree is computed on the undirected view, so each edge can come back in
# either orientation. Map it to the orientation that actually exists in the directed G.
mstedges_dir = []
for u, v in mstedges:
    if G.has_edge(u, v):
        mstedges_dir.append((u, v))
    elif G.has_edge(v, u):
        mstedges_dir.append((v, u))

# Subtract the *directed* MST edges. Subtracting the undirected `mstedges` would leave any
# MST edge whose orientation was flipped in `otheredges`, so it could be duplicated in
# train or leak into val/test.
# Sorting gives a fixed starting order: set iteration order over strings varies between
# Python processes (hash randomisation), which would defeat np.random.seed(0).
otheredges = np.array(
    sorted(set(G.edges()) - set(mstedges_dir), key=lambda e: (str(e[0]), str(e[1])))
)
mask = np.arange(len(otheredges))
np.random.shuffle(mask)
all_edges = len(G.edges())

In [179]:
# Fractions of the total edge count reserved for validation and testing.
val_perc=0.05
test_perc=0.1

# Boundaries into the shuffled index array `mask`.
n_val = int(val_perc*all_edges)
n_holdout = n_val + int(test_perc*all_edges)

val = otheredges[mask[:n_val]]
test = otheredges[mask[n_val:n_holdout]]

# Training set = MST backbone plus every non-MST edge not held out above.
mst_part = np.array(mstedges_dir)
sampled_part = otheredges[mask[n_holdout:]]
train = np.vstack([mst_part, sampled_part])

In [180]:
# Interaction type of every training edge.
# A (source, target) -> type dict is built in one pass over `df`, instead of
# boolean-filtering the whole DataFrame once per edge. `setdefault` keeps the first row
# seen for each pair, which matches the previous `.iloc[0]` behaviour.
# An edge missing from `df` now raises KeyError (previously IndexError).
edge_type_lookup = {}
for src, tgt, edge_type in zip(df['source_genesymbol'], df['target_genesymbol'], df['type']):
    edge_type_lookup.setdefault((src, tgt), edge_type)

train_types = [edge_type_lookup[(src, tgt)] for src, tgt in train]

In [181]:
# Interaction type of every validation edge.
# A (source, target) -> type dict is built in one pass over `df`, instead of
# boolean-filtering the whole DataFrame once per edge. `setdefault` keeps the first row
# seen for each pair, which matches the previous `.iloc[0]` behaviour.
# An edge missing from `df` now raises KeyError (previously IndexError).
edge_type_lookup = {}
for src, tgt, edge_type in zip(df['source_genesymbol'], df['target_genesymbol'], df['type']):
    edge_type_lookup.setdefault((src, tgt), edge_type)

val_types = [edge_type_lookup[(src, tgt)] for src, tgt in val]

In [182]:
# Interaction type of every test edge.
# A (source, target) -> type dict is built in one pass over `df`, instead of
# boolean-filtering the whole DataFrame once per edge. `setdefault` keeps the first row
# seen for each pair, which matches the previous `.iloc[0]` behaviour.
# An edge missing from `df` now raises KeyError (previously IndexError).
edge_type_lookup = {}
for src, tgt, edge_type in zip(df['source_genesymbol'], df['target_genesymbol'], df['type']):
    edge_type_lookup.setdefault((src, tgt), edge_type)

test_types = [edge_type_lookup[(src, tgt)] for src, tgt in test]

In [183]:
# Append the type as a third column, then reorder the columns from
# (source, target, type) to (source, type, target).
triple_order = [0, 2, 1]

train_type_col = np.array(train_types).reshape(-1,1)
val_type_col = np.array(val_types).reshape(-1,1)
test_type_col = np.array(test_types).reshape(-1,1)

train_with_edge = np.hstack((train, train_type_col))[:, triple_order]
val_with_edge = np.hstack((val, val_type_col))[:, triple_order]
test_with_edge = np.hstack((test, test_type_col))[:, triple_order]

In [184]:
# Persist the (source, target) edge splits as one .npz archive with keys train / val / test.
edge_splits = {'train': train, 'val': val, 'test': test}
np.savez('data/omnipath_curated_interactions.npz', **edge_splits)

In [186]:
# Persist the (source, type, target) triple splits as one .npz archive with keys train / val / test.
triple_splits = {'train': train_with_edge, 'val': val_with_edge, 'test': test_with_edge}
np.savez('data/omnipath_curated_interactions_with_edges.npz', **triple_splits)

## Get other interactions, keep curated interactions and largest connected component

In [66]:
# Download every OmniPath interaction (with gene symbols), then keep only the rows that
# have a consensus direction and whose reference count is not the literal string 'None'.
interactions = op.interactions.AllInteractions.get(genesymbols=True)
has_consensus_direction = interactions['consensus_direction']
has_references = interactions['n_references'] != 'None'
interactions = interactions[has_consensus_direction & has_references]

In [4]:
# Unique (source, type, target) triples, and the directed graph over their endpoints.
triple_columns = ['source_genesymbol', 'type', 'target_genesymbol']
df = interactions[triple_columns].drop_duplicates()
G = nx.from_pandas_edgelist(
    df,
    source='source_genesymbol',
    target='target_genesymbol',
    create_using=nx.DiGraph,
)

In [254]:
# Restrict the interactions to one source database and one interaction type, build the
# directed graph, and keep its largest connected component (on the undirected view).
db = 'SIGNOR'
# Named `interaction_type` rather than `type`: assigning to `type` shadowed the builtin
# for every later cell in the session.
interaction_type = 'post_translational'
interactions_sub = interactions[interactions['type'] == interaction_type]

# `sources` is split on ';' so that `db` is matched against whole entries, not substrings.
df = interactions_sub[[db in x for x in interactions_sub['sources'].str.split(';')]]
G = nx.from_pandas_edgelist(df=df, source='source_genesymbol', target='target_genesymbol', create_using=nx.DiGraph)
Gcc = sorted(nx.connected_components(nx.to_undirected(G)), key=len, reverse=True)

# Build the largest-component subgraph once and reuse it for the report and the assignment.
G_lcc = G.subgraph(Gcc[0])
# Fraction of nodes, then of edges, retained by keeping only the largest component.
print(G_lcc.number_of_nodes() / G.number_of_nodes())
print(G_lcc.number_of_edges() / G.number_of_edges())
G = G_lcc

0.9817559863169898
0.9969014084507042


## Use the edges of a minimum spanning tree (MST) as training edges, then add randomly sampled non-MST edges until the target number of training edges is reached. The remaining edges are used for validation and testing

In [42]:
# Split the edges of G into a spanning-tree backbone (always used for training) and
# the remaining edges (shuffled here, then sliced into val / test / train later).
np.random.seed(0)

# `nx.minimum_spanning_edges` is used because `tree` is never imported in this notebook,
# so `tree.minimum_spanning_edges` raises NameError on a fresh kernel.
# data=False makes it yield plain (u, v) pairs.
mstedges = list(nx.minimum_spanning_edges(nx.to_undirected(G), data=False))

# The spanning tree is computed on the undirected view, so each edge can come back in
# either orientation. Map it to the orientation that actually exists in the directed G.
mstedges_dir = []
for u, v in mstedges:
    if G.has_edge(u, v):
        mstedges_dir.append((u, v))
    elif G.has_edge(v, u):
        mstedges_dir.append((v, u))

# Every directed edge that is not part of the MST backbone.
# Sorting gives a fixed starting order: set iteration order over strings varies between
# Python processes (hash randomisation), which would defeat np.random.seed(0).
otheredges = np.array(
    sorted(set(G.edges()) - set(mstedges_dir), key=lambda e: (str(e[0]), str(e[1])))
)
mask = np.arange(len(otheredges))
np.random.shuffle(mask)
all_edges = len(G.edges())

In [43]:
# Number of directed MST edges; all of them go into the training set.
len(mstedges_dir)

1245

In [44]:
# Total number of directed edges in G.
all_edges

2368

In [45]:
# Target training-set size: 0.85 = 1 - val_perc (0.05) - test_perc (0.1).
# Sanity check: this must be >= len(mstedges_dir); otherwise the MST backbone alone
# already exceeds the training share and the split proportions are off.
all_edges*0.85

2012.8

In [46]:
# Fractions of the total edge count reserved for validation and testing.
val_perc=0.05
test_perc=0.1

# Boundaries into the shuffled index array `mask`.
n_val = int(val_perc*all_edges)
n_holdout = n_val + int(test_perc*all_edges)

val = otheredges[mask[:n_val]]
test = otheredges[mask[n_val:n_holdout]]

# Training set = MST backbone plus every non-MST edge not held out above.
mst_part = np.array(mstedges_dir)
sampled_part = otheredges[mask[n_holdout:]]
train = np.vstack([mst_part, sampled_part])

In [47]:
# Sanity check: the number of rows should be close to all_edges*0.85.
# It is not exact because the val and test sizes are truncated with int().
train.shape # verify train shape is close to all_edges*0.85

(2014, 2)

In [48]:
# Persist the database-specific edge splits as one .npz archive with keys train / val / test.
db_edge_splits = {'train': train, 'val': val, 'test': test}
np.savez(f'data/{db}_curated_interactions.npz', **db_edge_splits)