In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD, PCA
import sklearn
from tqdm import tqdm
from collections import defaultdict

In [None]:
def directed_classifier(embedding, train_pairs, test_pairs,
                        train_neg_pairs=None, test_neg_pairs=None):
    """Fit a ridge classifier on node-pair embeddings and score it on test pairs.

    Each pair is represented by the embedding row of its first node
    concatenated with the embedding row of its second node. Rows built from
    ``train_pairs`` / ``test_pairs`` are labelled 1; rows built from the
    negative pairs are labelled 0.

    Parameters
    ----------
    embedding : pandas.DataFrame
        One row per node, indexed by node id. Ids are compared as strings;
        the caller's frame is left untouched.
    train_pairs, test_pairs : array-like
        Positive pairs; column 0 and column 1 hold the two node ids.
    train_neg_pairs, test_neg_pairs : array-like, optional
        Negative pairs in the same layout. When omitted, the module-level
        variables of the same names are used, which keeps existing
        three-argument calls working.

    Returns
    -------
    tuple of float
        ``(AUROC, average precision)`` of the decision scores on the test set.

    Raises
    ------
    NameError
        If negative pairs are neither passed nor defined at module level.
    KeyError
        If a node id in any pair is missing from ``embedding.index``.
    """
    # `import sklearn` alone does not guarantee that these submodules are
    # loaded, so import exactly what is used.
    from sklearn.linear_model import RidgeClassifier
    from sklearn.metrics import average_precision_score, roc_auc_score

    def _negatives(explicit, global_name):
        # Prefer the explicit argument; otherwise fall back to the notebook
        # global that earlier versions of this function always read.
        if explicit is not None:
            return explicit
        try:
            return globals()[global_name]
        except KeyError:
            raise NameError(
                f"name '{global_name}' is not defined; pass it explicitly"
            ) from None

    train_neg_pairs = _negatives(train_neg_pairs, 'train_neg_pairs')
    test_neg_pairs = _negatives(test_neg_pairs, 'test_neg_pairs')

    # Re-index a new object instead of overwriting the caller's index.
    embedding = embedding.set_axis(embedding.index.astype(str), axis=0)

    def _pair_features(pairs):
        # Look up both endpoints by string id and place them side by side.
        pairs = np.asarray(pairs).astype(str)
        return np.hstack((embedding.loc[pairs[:, 0]], embedding.loc[pairs[:, 1]]))

    def _design_matrix(pos_pairs, neg_pairs):
        # Positives first, then negatives; labels follow the same order.
        X_pos = _pair_features(pos_pairs)
        X_neg = _pair_features(neg_pairs)
        X = np.vstack((X_pos, X_neg))
        y = [1] * X_pos.shape[0] + [0] * X_neg.shape[0]
        return X, y

    X_train, y_train = _design_matrix(train_pairs, train_neg_pairs)
    X_test, y_test = _design_matrix(test_pairs, test_neg_pairs)

    clf = RidgeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    y_score_test = clf.decision_function(X_test)

    return (roc_auc_score(y_test, y_score_test),
            average_precision_score(y_test, y_score_test))

## Directed Scattering without AE (best q, J)

In [None]:
# For every dataset: load the Directed Scattering embedding stored for the
# (q, J) recorded in best_val.npz, reduce it with PCA, and report the AUROC of
# the ridge classifier on real edges vs. their reversed counterparts.
for dataset in ['Texas', 'Cornell', 'omnipath', 'SIGNOR', 'iPTMnet']:
    # NOTE(review): allow_pickle=True unpickles the stored config object; only
    # load result files that come from a trusted run.
    with np.load(f'results/DS-PM/{dataset}/best_val.npz', allow_pickle=True) as best_val:
        config = best_val['config'][()]
    best_q, best_J = config['q'], config['J']
    print (dataset, best_q, best_J)
    df = pd.read_csv(f'results/Directed_Scattering/Directed_Scattering_J{best_J}_q{best_q}_{dataset}_train_val_embedding.csv', compression='gzip', index_col=0)

    # PCA cannot return more components than min(n_samples, n_features), so
    # clamp by the number of rows as well as by the number of columns.
    dim = min(df.shape[0], df.shape[1], 128)
    # Fixed seed so that the randomized SVD solver (when PCA selects it)
    # yields the same projection on every run.
    df_pc = pd.DataFrame(PCA(n_components=dim, random_state=0).fit_transform(df), index=df.index)

    with np.load(f'data/{dataset}_curated_interactions.npz') as splits:
        train, val, test = splits['train'], splits['val'], splits['test']

    all_data = np.vstack((train, val, test))

    # Validation edges are folded into the training set for the final fit.
    train = np.vstack((train, val))

    # Every (first, second) pair present in any split, for O(1) membership
    # tests instead of rescanning `all_data` for each edge.
    known_edges = set(zip(all_data[:, 0].tolist(), all_data[:, 1].tolist()))

    # A negative example is a known edge reversed, kept only when the reversed
    # pair is not itself a known edge.
    train_neg_pairs = [
        [second, first]
        for first, second in zip(train[:, 0].tolist(), train[:, 1].tolist())
        if (second, first) not in known_edges
    ]
    test_neg_pairs = [
        [second, first]
        for first, second in zip(test[:, 0].tolist(), test[:, 1].tolist())
        if (second, first) not in known_edges
    ]

    # reshape keeps a (0, 2) array when no negative pair survives the filter.
    train_neg_pairs = np.array(train_neg_pairs).reshape(-1, 2).astype(str)
    test_neg_pairs = np.array(test_neg_pairs).reshape(-1, 2).astype(str)

    # directed_classifier picks up train_neg_pairs / test_neg_pairs from the
    # notebook namespace, so they must be assigned before this call.
    auroc = directed_classifier(df_pc, train, test)[0]
    print (dataset, auroc)

## Generate the commands to run DS-PM 5 times for each (q, J) setting

In [None]:
# Number of repeated runs per (dataset, q, J) setting; the cells below loop
# over range(num_runs) and use the run index in file names and seeds.
num_runs = 5

In [None]:
# Print one `test.py` command line per (dataset, q, J, run) combination,
# reusing the lr / c / weight-decay values stored in each dataset's
# best_val.npz config.
for dataset in ['Texas', 'Cornell', 'omnipath', 'SIGNOR', 'iPTMnet']:
    best_val = np.load(f'results/DS-PM/{dataset}/best_val.npz', allow_pickle=True)
    config = best_val['config'][()]
    ablation_grid = [
        (q, J, i)
        for q in [0.0, 0.1, 0.2]
        for J in [5, 10, 15]
        for i in range(num_runs)
    ]
    for q, J, i in ablation_grid:
        command = f"python test.py --model DS-PM --lr {config['lr']} --c {config['c']} --act linear --weight-decay {config['weight_decay']} --save-as best_test_ablated_q_{q}_J_{J}_{i} --q {q} --dataset {dataset} --J {J} --seed {1234+i}"
        print (command)

## Evaluate

In [None]:
# Score every ablation run (dataset x q x J x run index) with the ridge
# classifier and collect the AUROC values in one long-format table.
results = []
for dataset in ['Texas', 'Cornell', 'SIGNOR', 'iPTMnet', 'omnipath']:
    with np.load(f'data/{dataset}_curated_interactions.npz') as splits:
        train, val, test = splits['train'], splits['val'], splits['test']

    all_data = np.vstack((train, val, test))

    # Validation edges are folded into the training set for the final fit.
    train = np.vstack((train, val))

    # Every (first, second) pair present in any split, for O(1) membership
    # tests instead of rescanning `all_data` for each edge.
    known_edges = set(zip(all_data[:, 0].tolist(), all_data[:, 1].tolist()))

    # A negative example is a known edge reversed, kept only when the reversed
    # pair is not itself a known edge.
    train_neg_pairs = [
        [second, first]
        for first, second in zip(train[:, 0].tolist(), train[:, 1].tolist())
        if (second, first) not in known_edges
    ]
    test_neg_pairs = [
        [second, first]
        for first, second in zip(test[:, 0].tolist(), test[:, 1].tolist())
        if (second, first) not in known_edges
    ]

    # reshape keeps a (0, 2) array when no negative pair survives the filter.
    train_neg_pairs = np.array(train_neg_pairs).reshape(-1, 2).astype(str)
    test_neg_pairs = np.array(test_neg_pairs).reshape(-1, 2).astype(str)

    for q in [0.0, 0.1, 0.2]:
        for J in [5, 10, 15]:
            for i in range(num_runs):
                # NOTE(review): allow_pickle=True unpickles stored objects;
                # only load result files that come from a trusted run.
                # The context manager closes the archive after each run.
                with np.load(f'results/DS-PM/{dataset}/best_test_ablated_q_{q}_J_{J}_{i}_results.npz', allow_pickle=True) as run_output:
                    run_embedding = pd.DataFrame(data=run_output['embedding'], index=run_output['names'])

                # directed_classifier picks up train_neg_pairs / test_neg_pairs
                # from the notebook namespace assigned above.
                auroc = directed_classifier(run_embedding, train, test)[0]

                results.append([dataset, q, J, i, auroc])

# Naming the columns at construction also yields a well-formed (empty) frame
# when no run was evaluated.
results = pd.DataFrame(results, columns=['Dataset', 'q', 'J', 'run', 'AUROC'])

## Get table

In [None]:
# Average AUROC over runs for each (q, J) setting. The resulting frame has one
# column per dataset and one row per grid point, with rows ordered q-major:
# (0.0, 5), (0.0, 10), ..., (0.2, 15).
dataset_names = ['Texas', 'Cornell', 'omnipath', 'SIGNOR', 'iPTMnet']
mean_auroc = {}
for dataset in dataset_names:
    res = results[results['Dataset'] == dataset]
    mean_auroc[dataset] = [
        res.loc[(res['q'] == q) & (res['J'] == J), 'AUROC'].mean()
        for q in [0.0, 0.1, 0.2]
        for J in [5, 10, 15]
    ]

# A dict of equal-length lists maps straight to dataset-named columns.
auroc_per_dataset = pd.DataFrame(mean_auroc)

In [None]:
# Emit the mean-AUROC table as LaTeX rows: one row per (q, J) setting and one
# column per dataset.
dataset_names = ['Texas', 'Cornell', 'omnipath', 'SIGNOR', 'iPTMnet']
print ('Method', '&', ' & '.join(dataset_names), '\\\\')
# Raw string: '\h' is not a valid escape sequence in a normal string literal.
print (r'\hline\hline')

# Row labels of `auroc_per_dataset` are 0..8 in the same q-major order as this
# grid, so the enumerate index doubles as the row label.
grid = [(q, J) for q in [0.0, 0.1, 0.2] for J in [5, 10, 15]]
for row, (q, J) in enumerate(grid):
    # Select cells by dataset label so they always line up with the header.
    cells = ['%.3f' % x for x in auroc_per_dataset.loc[row, dataset_names].values]
    print (f'q={q} J={J}', '&', ' & '.join(cells), "\\\\")