In [258]:
import sys
sys.path.append("../HiRES-2.0-BiModal/")
from EntityPairItem import EntityPairItem
from data import RelationGraph
from dgl import heterograph

import dgl
import numpy as np
import pickle
import joblib
import json
import torch
import pandas as pd
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [42]:
def loading_result(path, classes=(1, 2, 3)):
    """Tune per-class thresholds on the test split, then score the annotated split.

    Two result files are read: ``path + "test/result.pth"`` and
    ``path + "pred/result.pth"``. ``path`` is joined by plain string
    concatenation, so it may end with a separator (``".../17/"``) or with a
    file-name prefix (``".../graph-"``).

    For every score column, the threshold with the highest F1 on the test
    split's precision-recall curve is selected. Those thresholds binarise the
    scores of the annotated ("pred") split. Per-class accuracy, element-wise
    accuracy over all classes, and a classification report are printed.

    Args:
        path: String prefix of the two result files.
        classes: Label values, in column order, used to one-hot encode the
            gold labels. Labels outside this collection become all-zero rows.
            One score column per class is expected; note that
            ``label_binarize`` collapses a two-class problem into a single
            column, so binary setups are not supported by this helper.

    Returns:
        ``(y_pred, y_true)`` for the annotated split: the thresholded
        predictions (``int8``) and the binarised gold labels.
    """
    classes = list(classes)

    # NOTE(review): torch.load unpickles arbitrary Python objects; only point
    # this helper at result files produced by trusted runs.
    res = torch.load(path+"test/result.pth")
    score = np.vstack(res[0])
    y_true = label_binarize(np.array(res[2]), classes=classes)

    best_threshold = []
    for i in range(score.shape[1]):
        precision, recall, threshold = precision_recall_curve(y_true[:, i], score[:, i])
        f1 = 2*precision*recall/(precision + recall + 1e-10)
        # precision/recall end with one extra point (precision=1, recall=0)
        # that has no matching threshold; leave it out so the argmax is always
        # a valid index into `threshold`.
        best_threshold.append(threshold[np.argmax(f1[:-1])])

    print("annotated set:")
    pres = torch.load(path+"pred/result.pth")
    pscore = np.vstack(pres[0])
    y_true = label_binarize(np.array(pres[2]), classes=classes)
    # Broadcasts the per-class thresholds across all rows of the score matrix.
    y_pred = np.array(pscore >= np.array(best_threshold), dtype=np.int8)
    for i in range(y_true.shape[1]):
        print(i, accuracy_score(y_true[:,i], y_pred[:,i]))
    print(accuracy_score(y_true.ravel(), y_pred.ravel()))
    print(classification_report(y_true, y_pred, digits=3))
    return y_pred, y_true

In [43]:
# Results of the run stored under HiRES-2.0-Text (the text model used as the
# "without Graph" side of the comparisons further down).
text_pred, text_true = loading_result(
    path="../HiRES-2.0-Text/result/proposal-bce-tucker/17/",
)

annotated set:
0 0.8376753507014028
1 0.8717434869739479
2 0.9238476953907816
0.8777555110220441
              precision    recall  f1-score   support

           0      0.812     0.800     0.806       210
           1      0.913     0.660     0.766       159
           2      0.904     0.770     0.832       122

   micro avg      0.862     0.747     0.800       491
   macro avg      0.876     0.744     0.801       491
weighted avg      0.867     0.747     0.800       491
 samples avg      0.735     0.735     0.735       491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
# Results of the run stored under HiRES-2.0-Graph (the graph model used as the
# "without Text" side of the comparisons further down).
graph_pred, graph_true = loading_result(
    path="../HiRES-2.0-Graph/result/HAN_without_stys_TuckER/40/",
)

annotated set:
0 0.87374749498998
1 0.8937875751503006
2 0.9158316633266533
0.8944555778223113
              precision    recall  f1-score   support

           0      0.813     0.910     0.858       210
           1      0.957     0.698     0.807       159
           2      0.857     0.787     0.821       122

   micro avg      0.860     0.811     0.834       491
   macro avg      0.876     0.798     0.829       491
weighted avg      0.870     0.811     0.832       491
 samples avg      0.785     0.798     0.789       491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
# Bimodal run, graph outputs. The path ends in the file-name prefix "graph-",
# so the files read are ".../10/graph-test/result.pth" and
# ".../10/graph-pred/result.pth".
bimodal_graph_pred, bimodal_graph_true = loading_result(
    path="../HiRES-2.0-BiModal/result/proposal-bce-tucker-biW-MinLogit-distill/10/graph-",
)

annotated set:
0 0.87374749498998
1 0.8997995991983968
2 0.9218436873747495
0.8984635938543755
              precision    recall  f1-score   support

           0      0.813     0.910     0.858       210
           1      0.936     0.736     0.824       159
           2      0.903     0.762     0.827       122

   micro avg      0.866     0.817     0.841       491
   macro avg      0.884     0.803     0.836       491
weighted avg      0.875     0.817     0.839       491
 samples avg      0.796     0.804     0.798       491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
# Bimodal run, text outputs. The path ends in the file-name prefix "text-",
# so the files read are ".../15/text-test/result.pth" and
# ".../15/text-pred/result.pth".
bimodal_text_pred, bimodal_text_true = loading_result(
    path="../HiRES-2.0-BiModal/result/proposal-bce-tucker-biW-MinLogit-distill/15/text-",
)

annotated set:
0 0.843687374749499
1 0.8917835671342685
2 0.9238476953907816
0.8864395457581831
              precision    recall  f1-score   support

           0      0.797     0.843     0.819       210
           1      0.957     0.692     0.803       159
           2      0.896     0.779     0.833       122

   micro avg      0.862     0.778     0.818       491
   macro avg      0.883     0.771     0.819       491
weighted avg      0.873     0.778     0.818       491
 samples avg      0.766     0.766     0.766       491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [107]:
# All four runs must have been scored against the same gold label matrix;
# the row-by-row comparisons later in the notebook rely on it.
assert all(
    (text_true == other_true).all()
    for other_true in (graph_true, bimodal_text_true, bimodal_graph_true)
)
true = text_true

# Loading analysis data

In [124]:
# Entity-pair items behind the annotated set. Later cells index `test_data`
# with the row number of a prediction.
# NOTE(review): this assumes `test_data` has the same order (and length) as
# the prediction arrays - confirm.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
with open("../data/D2/EntityPairItems-disodiso-gold-500-token-term.jl", "rb") as items_file:
    test_data = joblib.load(items_file)

In [173]:
# Relation data in the form accepted by dgl.heterograph.
# NOTE(review): presumably a dict keyed by (src type, relation, dst type) - confirm
# against RelationGraph.transform.
diso_graph = RelationGraph("../data/D2/", 'DISO', 'DISO').transform()

# Embedding table; in this cell only its row count is used, to size the graph.
h = torch.from_numpy(np.load("../data/D2/embedtable.npy")).float()
h.requires_grad_(False)

# Mapping applied to CUIs before they are used as node indices.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open("../data/D2/cui2idx.pkl", "rb") as cui_file:
    cui2idx = pickle.load(cui_file)

g = heterograph(diso_graph, num_nodes_dict={'DISO': h.shape[0]})

In [195]:
# Meta-paths (sequences of edge types) checked for every entity pair.
meta_paths = [
    # one hop
    ['DDx'],
    ['May Be Caused By'],
    ['May Cause'],
    # two hops, mixed relations
    ['DDx', 'May Cause'],
    ['DDx', 'May Be Caused By'],
    ['May Cause', 'DDx'],
    ['May Be Caused By', 'DDx'],
    # two hops, same relation twice
    ['DDx', 'DDx'],
    ['May Cause', 'May Cause'],
    ['May Be Caused By', 'May Be Caused By'],
]

In [196]:
# One reachability graph per entry of `meta_paths`, kept in the same order so
# that g_metas[k] corresponds to meta_paths[k].
g_metas = []
for meta_path in meta_paths:
    g_metas.append(dgl.metapath_reachable_graph(g, meta_path))

In [228]:
def check_meta_reachable(idx0, idx1):
    """Return the meta-paths whose reachability graph has an edge idx0 -> idx1.

    Every graph in ``g_metas`` is checked for an edge whose source is ``idx0``
    and whose destination is ``idx1``. For each graph that has one, the entry
    of ``meta_paths`` at the same position is added to the result.

    Args:
        idx0: Source node index.
        idx1: Destination node index.

    Returns:
        List of matching ``meta_paths`` entries, in ``meta_paths`` order.
    """
    reachable = []
    for position, g_meta in enumerate(g_metas):
        src, dst = g_meta.edges(order="eid")
        # An edge connects the pair when one edge id matches on both ends.
        if bool(((src == idx0) & (dst == idx1)).any()):
            reachable.append(meta_paths[position])
    return reachable

In [244]:
def get_degree(idx):
    """Return the out-degree of node ``idx`` for each edge type of ``g``.

    The result has one entry per edge type, in ``g.etypes`` order.
    """
    degrees = []
    for etype in g.etypes:
        degrees.append(g.out_degrees(idx, etype))
    return degrees

# GSB analysis: text model with vs. without Graph

In [338]:
# Bucket every sample by how the text model and the bimodal text output compare:
#   G  - only the bimodal text output matches the gold labels
#   B  - only the text model matches the gold labels
#   GS - identical predictions, both correct
#   BS - identical predictions, both wrong
# Samples where the two differ and both are wrong land in no bucket.
G, GS, BS, B = [], [], [], []
for idx, (text, bi_text, true_label) in enumerate(zip(text_pred, bimodal_text_pred, true)):
    text_correct = (text == true_label).all()
    bimodal_correct = (bi_text == true_label).all()
    if (text == bi_text).all():
        (GS if text_correct else BS).append(idx)
    if bimodal_correct and not text_correct:
        G.append(idx)
    if text_correct and not bimodal_correct:
        B.append(idx)
print(len(G), len(GS), len(BS), len(B))

28 354 85 14


In [339]:
# Case-study table for the text comparison: one row per sample in G and then
# in B, tagged with its case type.
# NOTE: G and B must be the lists built by the text-vs-bimodal-text cell; both
# names are reassigned further down the notebook.
case_data = {
    column: []
    for column in (
        "cui1", "cui2", "sentences", "sent_num", "cui1_degree",
        "cui2_degree", "meta_path_reachable", "label", "case_type",
    )
}
for case_indices, case_type in ((G, "REMOD_G(text)"), (B, "REMOD_B(text)")):
    for i in case_indices:
        item = test_data[i]
        cui1, cui2 = item.cui1, item.cui2
        node1, node2 = cui2idx[cui1], cui2idx[cui2]
        case_data["cui1"].append(cui1)
        case_data["cui2"].append(cui2)
        case_data["sentences"].append(item.sentences)
        case_data["sent_num"].append(len(item.sentences))
        # Degrees and meta-paths are stored as JSON strings so each fits in one cell.
        case_data["cui1_degree"].append(json.dumps(get_degree(node1)))
        case_data["cui2_degree"].append(json.dumps(get_degree(node2)))
        case_data["meta_path_reachable"].append(json.dumps(check_meta_reachable(node1, node2)))
        case_data["label"].append(int(item.label))
        case_data["case_type"].append(case_type)
df_case_data_text = pd.DataFrame(case_data)

# GSB analysis: graph model with vs. without Text

In [340]:
# Bucket every sample by how the graph model and the bimodal graph output compare:
#   G  - only the bimodal graph output matches the gold labels
#   B  - only the graph model matches the gold labels
#   GS - identical predictions, both correct
#   BS - identical predictions, both wrong
# Samples where the two differ and both are wrong land in no bucket.
G, GS, BS, B = [], [], [], []
for idx, (graph, bi_graph, true_label) in enumerate(zip(graph_pred, bimodal_graph_pred, true)):
    graph_correct = (graph == true_label).all()
    bimodal_correct = (bi_graph == true_label).all()
    if (graph == bi_graph).all():
        (GS if graph_correct else BS).append(idx)
    if bimodal_correct and not graph_correct:
        G.append(idx)
    if graph_correct and not bimodal_correct:
        B.append(idx)
print(len(G), len(GS), len(BS), len(B))

19 375 80 11


In [341]:
# Case-study table for the graph comparison: one row per sample in G and then
# in B, tagged with its case type.
# NOTE: G and B must be the lists built by the graph-vs-bimodal-graph cell;
# the same names are also used by the text comparison earlier on.
case_data = {
    column: []
    for column in (
        "cui1", "cui2", "sentences", "sent_num", "cui1_degree",
        "cui2_degree", "meta_path_reachable", "label", "case_type",
    )
}
for case_indices, case_type in ((G, "REMOD_G(graph)"), (B, "REMOD_B(graph)")):
    for i in case_indices:
        item = test_data[i]
        cui1, cui2 = item.cui1, item.cui2
        node1, node2 = cui2idx[cui1], cui2idx[cui2]
        case_data["cui1"].append(cui1)
        case_data["cui2"].append(cui2)
        case_data["sentences"].append(item.sentences)
        case_data["sent_num"].append(len(item.sentences))
        # Degrees and meta-paths are stored as JSON strings so each fits in one cell.
        case_data["cui1_degree"].append(json.dumps(get_degree(node1)))
        case_data["cui2_degree"].append(json.dumps(get_degree(node2)))
        case_data["meta_path_reachable"].append(json.dumps(check_meta_reachable(node1, node2)))
        case_data["label"].append(int(item.label))
        case_data["case_type"].append(case_type)
df_case_data_graph = pd.DataFrame(case_data)

In [342]:
# Stack the text and graph case tables and write them out. concat keeps each
# frame's own row labels, so the index column in the CSV restarts for the
# second frame.
pd.concat(
    objs=[df_case_data_text, df_case_data_graph],
).to_csv(path_or_buf="REMOD_case_study_compare_with_single_modality.csv")

In [343]:
# Compare the two outputs of the bimodal runs against each other:
#   text_G  - only the bimodal text output matches the gold labels
#   graph_G - only the bimodal graph output matches the gold labels
#   GS / BS - identical predictions, both correct / both wrong
text_G, GS, BS, graph_G = [], [], [], []
for idx, (bi_text, bi_graph, true_label) in enumerate(zip(bimodal_text_pred, bimodal_graph_pred, true)):
    text_correct = (bi_text == true_label).all()
    graph_correct = (bi_graph == true_label).all()
    if (bi_text == bi_graph).all():
        (GS if text_correct else BS).append(idx)
    if graph_correct and not text_correct:
        graph_G.append(idx)
    if text_correct and not graph_correct:
        text_G.append(idx)
print(len(text_G), len(GS), len(BS), len(graph_G))

34 348 49 46


In [344]:
# Case-study table for bimodal text vs. bimodal graph: one row per sample in
# text_G and then in graph_G, tagged with its case type.
# NOTE: text_G and graph_G must be the lists built from the bimodal
# predictions; both names are reassigned by the baseline comparison below.
case_data = {
    column: []
    for column in (
        "cui1", "cui2", "sentences", "sent_num", "cui1_degree",
        "cui2_degree", "meta_path_reachable", "label", "case_type",
    )
}
for case_indices, case_type in ((text_G, "text_G(REMOD)"), (graph_G, "graph_G(REMOD)")):
    for i in case_indices:
        item = test_data[i]
        cui1, cui2 = item.cui1, item.cui2
        node1, node2 = cui2idx[cui1], cui2idx[cui2]
        case_data["cui1"].append(cui1)
        case_data["cui2"].append(cui2)
        case_data["sentences"].append(item.sentences)
        case_data["sent_num"].append(len(item.sentences))
        # Degrees and meta-paths are stored as JSON strings so each fits in one cell.
        case_data["cui1_degree"].append(json.dumps(get_degree(node1)))
        case_data["cui2_degree"].append(json.dumps(get_degree(node2)))
        case_data["meta_path_reachable"].append(json.dumps(check_meta_reachable(node1, node2)))
        case_data["label"].append(int(item.label))
        case_data["case_type"].append(case_type)
df_case_data_remod = pd.DataFrame(case_data)

In [345]:
# Compare the two single-modality models against each other:
#   text_G  - only the text model matches the gold labels
#   graph_G - only the graph model matches the gold labels
#   GS / BS - identical predictions, both correct / both wrong
text_G, GS, BS, graph_G = [], [], [], []
for idx, (text, graph, true_label) in enumerate(zip(text_pred, graph_pred, true)):
    text_correct = (text == true_label).all()
    graph_correct = (graph == true_label).all()
    if (text == graph).all():
        (GS if text_correct else BS).append(idx)
    if graph_correct and not text_correct:
        graph_G.append(idx)
    if text_correct and not graph_correct:
        text_G.append(idx)
print(len(text_G), len(GS), len(BS), len(graph_G))

31 337 59 49


In [346]:
# Case-study table for text model vs. graph model: one row per sample in
# text_G and then in graph_G, tagged with its case type.
# NOTE: text_G and graph_G must be the lists built from the single-modality
# predictions; the same names are also used by the bimodal comparison above.
case_data = {
    column: []
    for column in (
        "cui1", "cui2", "sentences", "sent_num", "cui1_degree",
        "cui2_degree", "meta_path_reachable", "label", "case_type",
    )
}
for case_indices, case_type in ((text_G, "text_G(Baseline)"), (graph_G, "graph_G(Baseline)")):
    for i in case_indices:
        item = test_data[i]
        cui1, cui2 = item.cui1, item.cui2
        node1, node2 = cui2idx[cui1], cui2idx[cui2]
        case_data["cui1"].append(cui1)
        case_data["cui2"].append(cui2)
        case_data["sentences"].append(item.sentences)
        case_data["sent_num"].append(len(item.sentences))
        # Degrees and meta-paths are stored as JSON strings so each fits in one cell.
        case_data["cui1_degree"].append(json.dumps(get_degree(node1)))
        case_data["cui2_degree"].append(json.dumps(get_degree(node2)))
        case_data["meta_path_reachable"].append(json.dumps(check_meta_reachable(node1, node2)))
        case_data["label"].append(int(item.label))
        case_data["case_type"].append(case_type)
df_case_data_baseline = pd.DataFrame(case_data)

In [347]:
# Stack the bimodal and baseline case tables and write them out. concat keeps
# each frame's own row labels, so the index column in the CSV restarts for the
# second frame.
pd.concat(
    objs=[df_case_data_remod, df_case_data_baseline],
).to_csv(path_or_buf="REMOD_case_study_compare_between_text_and_graph.csv")