# 使用 Masked LM 生成 Candidate Set 並 Retrieve Triplet from KG

In [1]:
"""
    This function will save
        (1) adjacency matrics (each in the form of a (R*N, N) coo sparse matrix)
        (2) concepts ids
        (3) qmask that specifices whether a node is a question concept
        (4) amask that specifices whether a node is a answer concept
        (5) cid2score that maps a concept id to its relevance score given the QA context
    to the output path in python pickle format

    grounded_path: str
    cpnet_graph_path: str
    cpnet_vocab_path: str
    output_path: str
    num_processes: int
"""

'\n    This function will save\n        (1) adjacency matrics (each in the form of a (R*N, N) coo sparse matrix)\n        (2) concepts ids\n        (3) qmask that specifices whether a node is a question concept\n        (4) amask that specifices whether a node is a answer concept\n        (5) cid2score that maps a concept id to its relevance score given the QA context\n    to the output path in python pickle format\n\n    grounded_path: str\n    cpnet_graph_path: str\n    cpnet_vocab_path: str\n    output_path: str\n    num_processes: int\n'

In [2]:
import torch
import networkx as nx
import itertools
import json
from tqdm import tqdm
from conceptnet import merged_relations
import numpy as np
from scipy import sparse
import pickle
from scipy.sparse import csr_matrix, coo_matrix
from multiprocessing import Pool
from collections import OrderedDict

# from .maths import *

In [3]:
# Input locations used by the rest of the notebook.
# NOTE(review): the directory below is spelled 'statememt' — confirm it matches the on-disk name.
# JSON file with the statement records (read with json.load).
statement_path = '/user_data/dggnn/data/MCQ/statememt/train.statement.json'
# JSON-lines file with the grounded concepts (one JSON object per line).
grounded_path = '/user_data/dggnn/data/MCQ/grounded/train.grounded.jsonl'
# Pickled networkx graph of ConceptNet (read by load_cpnet).
cpnet_graph_path = '/user_data/qagnn/data/cpnet/conceptnet.en.pruned.graph'
# Concept vocabulary, one concept per line (read by load_resources).
cpnet_vocab_path = '/user_data/qagnn/data/cpnet/concept.txt'

In [4]:
# NOTE(review): 'generate_graph' is not defined in the visible part of this
# notebook — confirm whether this export list is still needed.
__all__ = ['generate_graph']

# Vocabulary lookup tables; they stay None until load_resources() fills them.
concept2id = None
id2concept = None
relation2id = None
id2relation = None

# ConceptNet graphs; cpnet and cpnet_simple stay None until load_cpnet() fills them.
# NOTE(review): cpnet_all is never assigned in the visible notebook — possibly a leftover.
cpnet = None
cpnet_all = None
cpnet_simple = None

In [5]:
any(x is None for x in [concept2id, id2concept, relation2id, id2relation])

True

### 載入 relation 與 concept token 的字典， relation2id 為 relation 的 id ; concept2id 為 concept 的 id

In [6]:
def load_resources(cpnet_vocab_path):
    """Populate the module-level concept and relation lookup tables.

    cpnet_vocab_path: path to a UTF-8 text file holding one concept per line.
        A concept's id is the zero-based index of its line; surrounding
        whitespace is stripped from every line.

    Side effects: rebinds the globals id2concept, concept2id, id2relation and
    relation2id. Relations are taken from ``merged_relations``; a relation's
    id is its position in that sequence.
    """
    global concept2id, id2concept, relation2id, id2relation

    with open(cpnet_vocab_path, "r", encoding="utf8") as vocab_file:
        concepts = []
        for raw_line in vocab_file:
            concepts.append(raw_line.strip())
    id2concept = concepts
    # If a concept appears more than once, the last occurrence wins.
    concept2id = dict(zip(id2concept, range(len(id2concept))))

    id2relation = merged_relations
    relation2id = dict((name, idx) for idx, name in enumerate(id2relation))

In [7]:
# Load the concept graph
def load_cpnet(cpnet_graph_path):
    """Load the ConceptNet graph and build an undirected, weight-merged view.

    cpnet_graph_path: path to a pickled networkx graph.

    Side effects: rebinds two module-level globals.
      * cpnet: the graph exactly as unpickled from cpnet_graph_path.
      * cpnet_simple: an nx.Graph with a single edge per connected node pair;
        its 'weight' is the sum of the weights of every cpnet edge between
        that pair (an edge without a 'weight' attribute counts as 1.0).
    """
    global cpnet, cpnet_simple

    # nx.read_gpickle was removed in networkx 3.0; on newer versions fall back
    # to the standard pickle module, which is what read_gpickle wrapped.
    if hasattr(nx, 'read_gpickle'):
        cpnet = nx.read_gpickle(cpnet_graph_path)
    else:
        # SECURITY: unpickling executes arbitrary code — only load trusted files.
        # NOTE: unlike read_gpickle, this path does not decompress .gz/.bz2 files.
        with open(cpnet_graph_path, 'rb') as fin:
            cpnet = pickle.load(fin)

    cpnet_simple = nx.Graph()
    for u, v, data in cpnet.edges(data=True):
        w = data.get('weight', 1.0)
        if cpnet_simple.has_edge(u, v):
            # Parallel or reverse edges collapse into one undirected edge.
            cpnet_simple[u][v]['weight'] += w
        else:
            cpnet_simple.add_edge(u, v, weight=w)

In [8]:
print(f'generating adj data for {grounded_path}...')

# Load concept2id / id2concept / relation2id / id2relation and the ConceptNet
# graphs only when they are not populated yet, so re-running this cell is cheap.
# No `global` statement is used: at module level it has no effect, and it turns
# into a SyntaxError when the notebook is exported to a script, because these
# names are assigned earlier in the same module.
if any(x is None for x in [concept2id, id2concept, relation2id, id2relation]):
    load_resources(cpnet_vocab_path)
if cpnet is None or cpnet_simple is None:
    load_cpnet(cpnet_graph_path)

generating adj data for /user_data/dggnn/data/MCQ/grounded/train.grounded.jsonl...


### 查看 concept2id 內的東西

In [9]:
# dict of conceptnet token
# key = token name, value = id
concept2id

{'ab_extra': 0,
 'ab_intra': 1,
 'abactinal': 2,
 'actinal': 3,
 'abandon': 4,
 'acquire': 5,
 'arrogate': 6,
 'embrace': 7,
 'engage': 8,
 'gain': 9,
 'join': 10,
 'maintain': 11,
 'retain': 12,
 'unite': 13,
 'abandonment': 14,
 'acquisition': 15,
 'abapical': 16,
 'apical': 17,
 'abase': 18,
 'exalt': 19,
 'extoll': 20,
 'abash': 21,
 'embolden': 22,
 'reassure': 23,
 'abate': 24,
 'augment': 25,
 'abaxial': 26,
 'adaxial': 27,
 'abbreviate': 28,
 'lengthen': 29,
 'abderian': 30,
 'agelastic': 31,
 'abducent': 32,
 'adducent': 33,
 'abduction': 34,
 'adduction': 35,
 'abductive': 36,
 'deduce': 37,
 'abductor': 38,
 'abductee': 39,
 'adductor': 40,
 'abideable': 41,
 'insupportable': 42,
 'intolerable': 43,
 'unabideable': 44,
 'unbearable': 45,
 'abience': 46,
 'adience': 47,
 'abient': 48,
 'adient': 49,
 'ability': 50,
 'inability': 51,
 'abiogenesis': 52,
 'biogenesis': 53,
 'transformism': 54,
 'abjectly': 55,
 'proudly': 56,
 'abjugate': 57,
 'adjugate': 58,
 'able': 59,
 'can

In [10]:
concept2id['ab_extra']

0

In [11]:
id2concept[0]

'ab_extra'

### 查看 relation2id 內的東西

In [12]:
# dict of relation
# key = token name, value = id
relation2id

{'antonym': 0,
 'atlocation': 1,
 'capableof': 2,
 'causes': 3,
 'createdby': 4,
 'isa': 5,
 'desires': 6,
 'hassubevent': 7,
 'partof': 8,
 'hascontext': 9,
 'hasproperty': 10,
 'madeof': 11,
 'notcapableof': 12,
 'notdesires': 13,
 'receivesaction': 14,
 'relatedto': 15,
 'usedfor': 16}

In [13]:
id2relation

['antonym',
 'atlocation',
 'capableof',
 'causes',
 'createdby',
 'isa',
 'desires',
 'hassubevent',
 'partof',
 'hascontext',
 'hasproperty',
 'madeof',
 'notcapableof',
 'notdesires',
 'receivesaction',
 'relatedto',
 'usedfor']

## 將預處理的資料 整理成 QA Data Format
## Data format 資料格式
qa_data 格式
- data : `tuple`
    - q_ids : `set` : question 句子中所有 phrase / verb / noun 對應的 concept id（已排除同時出現在 a_ids 中的 id）
    - a_ids : `set` : answer 句子中所有 phrase / verb / noun 對應的 concept id
    - QAcontext : `str` : 將 question 句子（`**blank**` 替換為 `[MASK]`）與 answer 以 `.[SEP] ` 串接而成的文字

In [14]:
# Load the statement records; later cells index them by position and read
# each record's 'sentence' field.
with open(statement_path, 'r', encoding = 'utf-8') as f:
    statements = json.load(f)

In [15]:
# Build one (q_ids, a_ids, QAcontext) tuple per line of the grounded file.
qa_data = []
with open(grounded_path, 'r', encoding='utf-8') as fin_ground:
    for row_idx, raw_line in enumerate(fin_ground):
        record = json.loads(raw_line)
        question_cids = {concept2id[name] for name in record['qc']}
        answer_cids = {concept2id[name] for name in record['ac']}
        # Concepts that ground the answer are kept on the answer side only.
        question_cids -= answer_cids
        # statements is indexed by line number, so it is assumed to be aligned
        # line-for-line with the grounded file.
        masked_sentence = statements[row_idx]['sentence'].replace('**blank**', '[MASK]')
        qa_data.append((
            question_cids,
            answer_cids,
            "{}.[SEP] {}.".format(masked_sentence, record['ans']),
        ))

In [16]:
print('總共有',len(qa_data),'筆')

總共有 2321 筆


In [17]:
qa_data[0]

({3001, 5634, 10932, 13793, 19223, 33893, 408089},
 {7801},
 '[MASK] causes rocks to roll downhill.[SEP] gravity.')

## Data format 資料格式
qa_data 格式
- data : `tuple`
    - q_ids : `set` : question 句子中所有 phrase / verb / noun 對應的 concept id（已排除同時出現在 a_ids 中的 id）
    - a_ids : `set` : answer 句子中所有 phrase / verb / noun 對應的 concept id
    - QAcontext : `str` : 將 question 句子（`**blank**` 替換為 `[MASK]`）與 answer 以 `.[SEP] ` 串接而成的文字

### 載入 cdgp-csg-scibert-dgen 來找 distractor 並用來 retrieve 其他的 relevant node 

In [55]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline

# Tokenizer and masked-LM weights for candidate generation; both come from the
# same pretrained checkpoint name.
tokenizer = BertTokenizer.from_pretrained("AndyChiang/cdgp-csg-scibert-dgen")
csg_model = BertForMaskedLM.from_pretrained("AndyChiang/cdgp-csg-scibert-dgen")

### K = 15，找 15 個 prediction prob 最高的當 distractor

In [56]:
# Fill-mask pipeline over the loaded model; top_k=15 requests the 15
# highest-scoring fills for each [MASK].
unmasker = pipeline("fill-mask", tokenizer=tokenizer, model=csg_model, top_k=15)

In [57]:
## example
sent = "The only known planet with large amounts of water is [MASK]. [SEP] earth"
cs = unmasker(sent)
print(cs[0])

{'score': 0.3574279248714447, 'token': 15497, 'token_str': 'mars', 'sequence': 'the only known planet with large amounts of water is mars. earth'}


In [58]:
def append_extra_node_use_LM(data):
    """Attach masked-LM candidate concepts to one QA example.

    data: tuple (qc_ids, ac_ids, question) where question is the masked
        context string whose last space-separated word is the answer.

    Returns (sorted(qc_ids), sorted(ac_ids), question, extra_nodes), where
    extra_nodes is the set of concept ids of predicted tokens that exist in
    concept2id and differ from the answer word.
    """
    qc_ids, ac_ids, question = data
    predictions = unmasker(question)

    # A sentence with two or more [MASK] tokens yields one prediction list per
    # mask. Detect that by type, not by length, so any number of masks and any
    # top_k work, then flatten so a single loop handles both shapes.
    if predictions and isinstance(predictions[0], list):
        predictions = [pred for per_mask in predictions for pred in per_mask]

    # The context is built as "<sentence>.[SEP] <answer>.", so the last word
    # carries a trailing period; strip it, otherwise the answer token is never
    # recognised and never excluded from the candidates.
    answer_word = question.split(' ')[-1].rstrip('.')

    extra_nodes = {
        concept2id[pred['token_str']]
        for pred in predictions
        if pred['token_str'] != answer_word and pred['token_str'] in concept2id
    }
    return (sorted(qc_ids), sorted(ac_ids), question, extra_nodes)

In [59]:
# Run masked-LM candidate generation over every QA example; total= is passed
# because the lazy map object has no length for tqdm to read.
res1 = list(tqdm(map(append_extra_node_use_LM, qa_data), total=len(qa_data)))

100%|██████████| 2321/2321 [01:56<00:00, 20.00it/s]


In [60]:
len(res1)

2321

In [61]:
# def find_relevant_node_with_ans_and_distractor(data):
#     qc_ids, ac_ids, question, extra_ids = data
#     a_ids = ac_ids[0] if len(ac_ids) == 1 else 0
#     extra_nodes = set()
#     for eid in extra_ids:
#         if eid != a_ids and eid in cpnet_simple.nodes and a_ids in cpnet_simple.nodes:
#             extra_nodes |= set(cpnet_simple[eid]) & set(cpnet_simple[a_ids])
#     extra_nodes = extra_nodes - (set(extra_ids) | set(qc_ids) | set(ac_ids))
#     return (sorted(qc_ids), sorted(ac_ids), question, extra_ids, sorted(extra_nodes))


In [62]:
# res2 = list(tqdm(map(find_relevant_node_with_ans_and_distractor, res1), total=len(res1)))

## generate_triplet_use_KG

In [63]:
def concepts2adj(question_ids, candidate_ids):
    """Collect KG triplets linking QA-context concepts and candidate concepts.

    question_ids: collection of concept ids from the QA context
        (question concepts followed by answer concepts at the call site).
    candidate_ids: collection of candidate (distractor) concept ids.
    Both collections are iterated more than once.

    Returns a list of [rel, head, tail, weight] entries: first every
    question -> candidate edge, then every candidate -> question edge.
    An edge is kept only when its 'rel' lies in [0, len(id2relation)) and its
    'weight' is at least 1.0; pairs where head == tail are skipped.
    An empty question_ids or candidate_ids yields an empty list.
    """
    n_rel = len(id2relation)

    def _directed_triplets(heads, tails):
        # Every qualifying cpnet edge head -> tail, in heads-major order.
        found = []
        for head in heads:
            for tail in tails:
                if head == tail or not cpnet.has_edge(head, tail):
                    continue
                # cpnet[head][tail] maps edge keys to attribute dicts
                # (one entry per parallel edge).
                for e_attr in cpnet[head][tail].values():
                    if 0 <= e_attr['rel'] < n_rel and e_attr['weight'] >= 1.0:
                        found.append([int(e_attr['rel']), int(head), int(tail), e_attr['weight']])
        return found

    # Relations between the QA context and each distractor, in both directions.
    return (_directed_triplets(question_ids, candidate_ids)
            + _directed_triplets(candidate_ids, question_ids))

In [64]:
def concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3(data):
    """Retrieve the KG triplets for one example.

    data: 4-tuple (qc_ids, ac_ids, question, extra_nodes_ids); the question
        text is not used here.

    Returns {'triplets': ...} holding the result of concepts2adj called with
    the question ids followed by the answer ids, and the candidate ids.
    """
    question_concepts, answer_concepts, _context, candidates = data
    return dict(triplets=concepts2adj(question_concepts + answer_concepts, candidates))

In [65]:
# Retrieve the KG triplets for every example in res1 (one {'triplets': [...]} dict each).
res2 = list(tqdm(map(concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3, res1), total=len(res1)))

100%|██████████| 2321/2321 [00:00<00:00, 8030.94it/s]


In [66]:
res2[0]['triplets']

[[15, 13793, 11176, 1.33],
 [7, 13793, 14461, 2.0],
 [15, 13793, 16095, 2.0],
 [3, 7801, 7537, 1.0],
 [15, 7537, 7801, 1.984],
 [15, 214, 10932, 1.0],
 [15, 13788, 10932, 3.722],
 [15, 13788, 19223, 3.716]]

In [67]:
len(res2)

2321

In [68]:
# Translate every numeric triplet into readable form:
# [relation name, source concept, target concept, weight].
res3 = []
for example in tqdm(res2):
    res3.append([
        [id2relation[rel_id], id2concept[head_id], id2concept[tail_id], edge_weight]
        for rel_id, head_id, tail_id, edge_weight in example['triplets']
    ])

100%|██████████| 2321/2321 [00:00<00:00, 54923.55it/s]


In [1]:
len(res3)

NameError: name 'res3' is not defined

In [70]:
for i in range(6):
    print(res3[i])

[['relatedto', 'roll', 'motion', 1.33], ['hassubevent', 'roll', 'sound', 2.0], ['relatedto', 'roll', 'wind', 2.0], ['causes', 'gravity', 'weight', 1.0], ['relatedto', 'weight', 'gravity', 1.984], ['relatedto', 'erosion', 'rock', 1.0], ['relatedto', 'sand', 'rock', 3.722], ['relatedto', 'sand', 'rocks', 3.716]]
[['relatedto', 'meter', 'second', 1.0], ['relatedto', 'meter', 'length', 1.0], ['relatedto', 'meter', 'length', 1.0], ['relatedto', 'degree', 'measurement', 6.094], ['relatedto', 'weight', 'measurement', 5.541], ['relatedto', 'volume', 'length', 1.0], ['relatedto', 'meters', 'meter', 1.0], ['isa', 'diameter', 'length', 2.0], ['relatedto', 'minute', 'measurement', 3.618]]
[['relatedto', 'touch', 'sensation', 2.811], ['relatedto', 'touch', 'use', 1.153]]
[['relatedto', 'food', 'meat', 1.0], ['usedfor', 'animal', 'food', 2.0], ['relatedto', 'mineral', 'chemical', 1.0], ['isa', 'water', 'food', 2.0], ['isa', 'water', 'chemical', 2.0], ['atlocation', 'meat', 'food', 1.0], ['isa', 'mea

In [71]:
output_path = '/user_data/dggnn/data/MCQ/triplet/candidate_scibert_train_triplet_top15.json'

In [72]:
# Persist the readable triplets as one JSON array: one inner list per example,
# in the same order as res3.
with open(output_path, 'w') as fout:
    json.dump(res3, fout)
print(f'data saved to {output_path}')

data saved to /user_data/dggnn/data/MCQ/triplet/candidate_scibert_train_triplet_top15.json


In [73]:
rel, source_node, target_node, weight = res2[0]['triplets'][0]
print('relation = ',id2relation[rel])
print('source_node = ',id2concept[source_node])
print('target_node = ',id2concept[target_node])
print('weight = ',weight)

relation =  relatedto
source_node =  roll
target_node =  motion
weight =  1.33
