# Analyzing results from ULTRA prediction

## Load data

In [56]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

# Locations and file names used by the rest of the notebook.
datadir = "../data"
outdir = "../outputs"
dataset = "hetionet"
res_name = "ultra_50g-Hetionet.csv"
savename = "new_links_v0.csv"

# The split files are tab-separated and read without a header row, so the
# column names are supplied explicitly.
triple_columns = ["source", "metaedge", "target"]

train_df = pd.read_csv(
    f"{datadir}/{dataset}/train.txt", sep="\t", names=triple_columns
)
val_df = pd.read_csv(
    f"{datadir}/{dataset}/valid.txt", sep="\t", names=triple_columns
)
test_df = pd.read_csv(
    f"{datadir}/{dataset}/test.txt", sep="\t", names=triple_columns
)

# Prediction results, read with pandas' default header handling.
pred_df = pd.read_csv(f"{outdir}/{res_name}")

In [50]:
def print_data_stats(df):
    """Print the edge, node and node-type counts of an edge list.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with string columns ``"source"`` and ``"target"``. The node
        type of a value is the text before its first ``"::"``.

    Returns
    -------
    None
        The three counts are written to stdout.

    Notes
    -----
    ``df`` is only read, never modified, so callers do not need to pass a
    copy.
    """
    print("Edge number: {}".format(len(df)))

    # A node is any distinct identifier seen as a source or as a target.
    nodes = set(df["source"]) | set(df["target"])
    print("Node number: {}".format(len(nodes)))

    # Only the prefix before the first "::" is needed, so split once and keep
    # the result in local Series instead of adding columns to ``df``.
    source_types = df["source"].str.split("::", n=1).str[0]
    target_types = df["target"].str.split("::", n=1).str[0]
    node_types = set(source_types) | set(target_types)
    print("Node type number: {}".format(len(node_types)))


In [51]:
# Report the same statistics for every split, separated by a dashed rule.
split_reports = [
    ("TRAIN:", train_df),
    ("\nVALID:", val_df),
    ("\nTEST:", test_df),
]
for position, (header, split_df) in enumerate(split_reports):
    if position > 0:
        # The rule goes between sections only, never after the last one.
        print("-"*20)
    print(header)
    # Pass a copy so the loaded frames are left untouched.
    print_data_stats(split_df.copy())

TRAIN:
Edge number: 2025177
Node number: 44907
Node type number: 11
--------------------

VALID:
Edge number: 112510
Node number: 29250
Node type number: 11
--------------------

TEST:
Edge number: 112510
Node number: 29203
Node type number: 11


## New links

In [54]:
# Each prediction row yields two candidate triples:
# (source, metaedge, target_pred) and (source_pred, metaedge, target).
# rename() returns a new frame here; renaming a column subset in place is the
# chained-assignment pattern that triggers SettingWithCopyWarning.
new_links_q1 = pred_df[["source", "metaedge", "target_pred"]].rename(
    columns={"target_pred": "target"}
)
new_links_q2 = pred_df[["source_pred", "metaedge", "target"]].rename(
    columns={"source_pred": "source"}
)

new_links = pd.concat([new_links_q1, new_links_q2], ignore_index=True)
print("Before removing duplications: {}".format(len(new_links)))
new_links = new_links.drop_duplicates(keep="first").reset_index(drop=True)
print("Number of links recontructed from ULTRA: {}".format(len(new_links)))

# Remove links that are already present in the test split.
# NOTE(review): only test_df is checked here; triples that appear in train_df
# or val_df are NOT removed. Confirm whether "new" should exclude those too.
# NOTE(review): the key joins the fields with "-", so it is only unambiguous
# if identifiers never contain "-" themselves. TODO confirm for this dataset.
new_links["key"] = (
    new_links["source"] + "-" + new_links["metaedge"] + "-" + new_links["target"]
)
df = test_df.assign(
    key=test_df["source"] + "-" + test_df["metaedge"] + "-" + test_df["target"]
)

overlaps = list(set(new_links["key"].tolist()) & set(df["key"].tolist()))
print("Overlap: {}".format(len(overlaps)))
new_links = new_links[~new_links["key"].isin(overlaps)]
print("New links: {}".format(len(new_links)))

Before removing duplications: 225020
Number of links recontructed from ULTRA: 70737
Overlap: 19641
New links: 51096


In [55]:
# Write only the triple columns (the helper "key" column is left out) and
# omit the DataFrame index from the file.
export_columns = ["source", "metaedge", "target"]
new_links[export_columns].to_csv(f"{datadir}/{savename}", index=False)