In [None]:
import omnipath as op
import pandas as pd
import networkx as nx
import numpy as np
import scipy
import torch
import torch_geometric
from torch_geometric.data import Data
from sklearnex import patch_sklearn
import sklearn
patch_sklearn()
from run.run_ae_default_config import *
from tqdm import tqdm
import scprep, phate
import warnings; warnings.filterwarnings('ignore')

In [None]:
# Name of the dataset under evaluation; it is interpolated into the data/ and results/ paths used below.
dataset = 'Texas'

In [None]:
# Read the curated interaction splits and stack them into a single array that
# holds every pair from the train, validation and test sets.
splits = np.load(f'data/{dataset}_curated_interactions.npz')
train = splits['train']
val = splits['val']
test = splits['test']
all_data = np.vstack([train, val, test])

In [None]:
def _reversed_non_edges(pairs, known_edges):
    """Return ``[second, first]`` for each row of ``pairs`` whose reversal is not in ``known_edges``.

    The result always has shape ``(k, 2)``, including ``(0, 2)`` when nothing qualifies.
    """
    return np.array(
        [[pair[1], pair[0]] for pair in pairs if (pair[1], pair[0]) not in known_edges]
    ).reshape(-1, 2)


# Every (column 0, column 1) pair found in train/val/test, hashed once so that each
# reverse-edge check is a set lookup instead of two full scans over ``all_data``.
known_edges = {(row[0], row[1]) for row in all_data}

# Negative examples: the reversed direction of each positive pair, kept only when
# the reversed pair does not itself occur anywhere in ``all_data``.
train_neg_pairs = _reversed_non_edges(train, known_edges)
val_neg_pairs = _reversed_non_edges(val, known_edges)

In [None]:
# directed_classifier casts the embedding index to str before looking these pairs up
# with ``embedding.loc``, so both negative-pair arrays are cast to strings as well.
train_neg_pairs, val_neg_pairs = (
    neg.astype(str) for neg in (train_neg_pairs, val_neg_pairs)
)

In [None]:
def directed_classifier(embedding, train_pairs, val_pairs, train_neg=None, val_neg=None):
    """Score an embedding on edge-direction classification with a ridge classifier.

    Every node pair is represented by concatenating the embedding rows of its two
    members. A ``RidgeClassifier`` is fitted on the training positives (label 1)
    and training negatives (label 0), then scored on the validation positives
    and negatives.

    Parameters
    ----------
    embedding : pandas.DataFrame
        One row per node, indexed by node name. Labels are matched as strings.
        The caller's frame is left untouched.
    train_pairs, val_pairs : array-like
        Positive pairs; columns 0 and 1 are looked up in ``embedding``.
    train_neg, val_neg : array-like, optional
        Negative pairs. When omitted, the notebook-level ``train_neg_pairs`` and
        ``val_neg_pairs`` are used, which is the original behaviour.

    Returns
    -------
    tuple
        ``(roc_auc_score, average_precision_score)`` computed on the validation set.

    Raises
    ------
    KeyError
        Raised by ``embedding.loc`` when a pair refers to a label that is not in
        the embedding index.
    """
    # Imported explicitly: ``import sklearn`` on its own does not guarantee that the
    # ``linear_model`` and ``metrics`` submodules have been loaded.
    from sklearn.linear_model import RidgeClassifier
    from sklearn.metrics import average_precision_score, roc_auc_score

    if train_neg is None:
        train_neg = train_neg_pairs
    if val_neg is None:
        val_neg = val_neg_pairs

    # Re-index a shallow copy so the caller's DataFrame keeps its original index.
    embedding = embedding.copy(deep=False)
    embedding.index = embedding.index.astype(str)

    def pair_features(pairs):
        # Concatenate the embedding rows of both pair members side by side.
        pairs = np.asarray(pairs).astype(str)
        return np.hstack((embedding.loc[pairs[:, 0]], embedding.loc[pairs[:, 1]]))

    X_train_pos = pair_features(train_pairs)
    X_train_neg = pair_features(train_neg)
    X_train = np.vstack((X_train_pos, X_train_neg))
    y_train = [1] * X_train_pos.shape[0] + [0] * X_train_neg.shape[0]

    X_val_pos = pair_features(val_pairs)
    X_val_neg = pair_features(val_neg)
    X_val = np.vstack((X_val_pos, X_val_neg))
    y_val = [1] * X_val_pos.shape[0] + [0] * X_val_neg.shape[0]

    clf = RidgeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    y_score_val = clf.decision_function(X_val)

    return (roc_auc_score(y_val, y_score_val),
            average_precision_score(y_val, y_score_val))

### DS-AE

In [None]:
# Highest run index among the DS-AE result files of this dataset. Only names ending
# in '_results.npz' are parsed, so unrelated directory entries (for example
# '.ipynb_checkpoints') cannot break the int() conversion.
maxrun = max(
    int(name.split('_')[0])
    for name in os.listdir(f'results/DS-AE/{dataset}')
    if name.endswith('_results.npz')
)
maxrun

In [None]:
# DS-AE model selection: score every saved run with directed_classifier and keep
# the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/DS-AE/{dataset}/{i}_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        try:
            res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
        except Exception:
            # Best-effort skip of runs whose embedding table cannot be built. Unlike a
            # bare ``except`` this still lets KeyboardInterrupt stop the loop.
            continue
    auroc = directed_classifier(res, train, val)[0]
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### DS-PM

In [None]:
# Highest run index among the DS-PM result files of this dataset. Only names ending
# in '_results.npz' are parsed, so unrelated directory entries (for example
# '.ipynb_checkpoints') cannot break the int() conversion.
maxrun = max(
    int(name.split('_')[0])
    for name in os.listdir(f'results/DS-PM/{dataset}')
    if name.endswith('_results.npz')
)
maxrun

In [None]:
# DS-PM model selection: score every saved run with directed_classifier and keep
# the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in range(maxrun+1):
    try:
        npz = np.load(f'results/DS-PM/{dataset}/{i}_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### Node2Vec

In [None]:
# Largest run index among the Node2Vec result files saved for this dataset.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/Node2Vec/')
    if fname.endswith(f'_{dataset}_results.npz')
)
maxrun

In [None]:
# Node2Vec model selection: score every saved run with directed_classifier and keep
# the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in range(maxrun+1):
    try:
        npz = np.load(f'results/Node2Vec/{i}_{dataset}_results.npz', allow_pickle=True)
    except FileNotFoundError:
        # maxrun is only the largest index found on disk; indices below it may have
        # no result file, so skip them as the other model-selection cells do.
        continue
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    auroc = directed_classifier(res, train, val)[0]
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### MagNet

In [None]:
# Highest run index among the MagNet result files of this dataset. The suffix filter
# matches the '{i}_{dataset}_results.npz' names that the evaluation loop loads, so
# .npz files belonging to other datasets or purposes are not parsed.
maxrun = max(
    int(name.split('_')[0])
    for name in os.listdir(f'results/MagNet/')
    if name.endswith(f'_{dataset}_results.npz')
)
maxrun

In [None]:
# Validation AUROC of every MagNet run that has a result file, keyed by run index.
performances = {}
for i in range(maxrun+1):
    try:
        npz = np.load(f'results/MagNet/{i}_{dataset}_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    performances[i] = directed_classifier(res, train, val)[0]

In [None]:
# Print MagNet runs from best to worst validation AUROC, leaving out every run
# whose stored config has weight_decay == 0.001.
for k, v in sorted(performances.items(), key=lambda item: item[1], reverse=True):
    with np.load(f'results/MagNet/{k}_{dataset}_results.npz', allow_pickle=True) as npz:
        # Indexing with an empty tuple unwraps the stored config object, which is
        # then used like a mapping.
        config = npz['config'][()]
    if config['weight_decay'] != 0.001:
        print(k, v)

### UDS-AE

In [None]:
# Largest run index among the .npz files in this dataset's UDS-AE results folder.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/UDS-AE/{dataset}')
    if fname.endswith('npz')
)
maxrun

In [None]:
# UDS-AE model selection: score every saved run with directed_classifier and keep
# the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in range(maxrun+1):
    try:
        npz = np.load(f'results/UDS-AE/{dataset}/{i}_{dataset}_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    auroc = directed_classifier(res, train, val)[0]
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### TransE

In [None]:
# Largest run index among the TransE 'noedgeatt' result files saved for this dataset.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/TransE/')
    if fname.endswith(f'_{dataset}_noedgeatt_results.npz')
)
maxrun

In [None]:
# TransE model selection: score every saved run with directed_classifier and keep
# the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in range(maxrun+1):
    try:
        npz = np.load(f'results/TransE/{i}_{dataset}_noedgeatt_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### HGCN undirected

In [None]:
# Largest run index among the undirected HGCN result files saved for this dataset.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/HGCN/')
    if fname.endswith(f'_{dataset}_undirected_results.npz')
)
maxrun

In [None]:
# Undirected HGCN model selection: score every saved run with directed_classifier
# and keep the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/HGCN/{i}_{dataset}_undirected_results.npz', allow_pickle=True)
    except Exception:
        # Best-effort skip of missing or unreadable result files. Unlike a bare
        # ``except`` this still lets KeyboardInterrupt stop the loop.
        continue
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### HGCN directed

In [None]:
# Largest run index among the directed HGCN result files saved for this dataset.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/HGCN/')
    if fname.endswith(f'_{dataset}_directed_results.npz')
)
maxrun

In [None]:
# Directed HGCN model selection: score every saved run with directed_classifier
# and keep the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/HGCN/{i}_{dataset}_directed_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except Exception:
        # Best-effort skip of runs that cannot be scored. Unlike a bare ``except``
        # this still lets KeyboardInterrupt stop the loop.
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### PM undirected

In [None]:
# Largest run index among the undirected result files in results/Shallow/ for this dataset.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/Shallow/')
    if fname.endswith(f'_{dataset}_undirected_results.npz')
)
maxrun

In [None]:
# Undirected PM model selection (results/Shallow/): score every saved run with
# directed_classifier and keep the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/Shallow/{i}_{dataset}_undirected_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### PM directed

In [None]:
# Largest run index among the directed result files in results/Shallow/ for this dataset.
maxrun = max(
    int(fname.split('_')[0])
    for fname in os.listdir(f'results/Shallow/')
    if fname.endswith(f'_{dataset}_directed_results.npz')
)
maxrun

In [None]:
# Directed PM model selection (results/Shallow/): score every saved run with
# directed_classifier and keep the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/Shallow/{i}_{dataset}_directed_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

### GAE

In [None]:
# Highest run index among the GAE result files of this dataset. Only names ending
# in '_results.npz' are parsed, so unrelated directory entries (for example
# '.ipynb_checkpoints') cannot break the int() conversion.
maxrun = max(
    int(name.split('_')[0])
    for name in os.listdir(f'results/GAE/{dataset}')
    if name.endswith('_results.npz')
)
maxrun

In [None]:
# Undirected GAE model selection: score every saved run with directed_classifier
# and keep the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/GAE/{dataset}/{i}_{dataset}_undirected_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance

In [None]:
# Directed GAE model selection: score every saved run with directed_classifier
# and keep the run index with the highest validation AUROC.
best_performance = -1
best_config = None  # reset here so a value left over from another cell is never reported
for i in tqdm(range(maxrun+1)):
    try:
        npz = np.load(f'results/GAE/{dataset}/{i}_{dataset}_directed_results.npz', allow_pickle=True)
    except FileNotFoundError:
        continue  # no result file for this index
    with npz:  # close the archive once its arrays have been read
        config = npz['config']
        res = pd.DataFrame(data=npz['embedding'], index=npz['names'])
    try:
        auroc = directed_classifier(res, train, val)[0]
    except ValueError:
        # Skip runs whose embedding is rejected during classification
        # (presumably non-finite values; confirm against the training logs).
        continue
    if auroc > best_performance:
        best_performance = auroc
        best_config = i
best_config, best_performance