## 用 ChatGPT 與 MLM 的 Candidate Set 找相關 relevant node 並存成 Triplet format

In [1]:
import torch
import networkx as nx
import itertools
import json
from tqdm import tqdm
from conceptnet import merged_relations
import numpy as np
from scipy import sparse
import pickle
from scipy.sparse import csr_matrix, coo_matrix
from multiprocessing import Pool
from collections import OrderedDict


# from .maths import *

In [2]:
grounded_path = '/user_data/dggnn/data/MCQ/grounded/test.grounded.jsonl'
cpnet_graph_path = '/user_data/qagnn/data/cpnet/conceptnet.en.pruned.graph'
cpnet_vocab_path = '/user_data/qagnn/data/cpnet/concept.txt'

In [3]:
__all__ = ['generate_graph']

concept2id = None
id2concept = None
relation2id = None
id2relation = None

cpnet = None
cpnet_all = None
cpnet_simple = None

In [4]:
any(x is None for x in [concept2id, id2concept, relation2id, id2relation])

True

### 載入 relation 與 concept token 的字典， relation2id 為 relation 的 id ; concept2id 為 concept 的 id

In [5]:
def load_resources(cpnet_vocab_path):
    global concept2id, id2concept, relation2id, id2relation

    with open(cpnet_vocab_path, "r", encoding="utf8") as fin:
        id2concept = [w.strip() for w in fin]
    concept2id = {w: i for i, w in enumerate(id2concept)}

    id2relation = merged_relations
    relation2id = {r: i for i, r in enumerate(id2relation)}

In [6]:
# 載入 concept 的 graph
def load_cpnet(cpnet_graph_path):
    global cpnet, cpnet_simple
    cpnet = nx.read_gpickle(cpnet_graph_path)
    cpnet_simple = nx.Graph()
    for u, v, data in cpnet.edges(data=True):
        w = data['weight'] if 'weight' in data else 1.0
        if cpnet_simple.has_edge(u, v):
            cpnet_simple[u][v]['weight'] += w
        else:
            cpnet_simple.add_edge(u, v, weight=w)

In [7]:
print(f'generating adj data for {grounded_path}...')

global concept2id, id2concept, relation2id, id2relation, cpnet_simple, cpnet
# 載入 concept2id, id2relation, relation2id
if any(x is None for x in [concept2id, id2concept, relation2id, id2relation]):
    load_resources(cpnet_vocab_path)
if cpnet is None or cpnet_simple is None:
    load_cpnet(cpnet_graph_path)

generating adj data for /user_data/dggnn/data/MCQ/grounded/test.grounded.jsonl...


### 查看 concept2id 內的東西

In [8]:
# dict of conceptnet token
# key = token name, value = id
concept2id

{'ab_extra': 0,
 'ab_intra': 1,
 'abactinal': 2,
 'actinal': 3,
 'abandon': 4,
 'acquire': 5,
 'arrogate': 6,
 'embrace': 7,
 'engage': 8,
 'gain': 9,
 'join': 10,
 'maintain': 11,
 'retain': 12,
 'unite': 13,
 'abandonment': 14,
 'acquisition': 15,
 'abapical': 16,
 'apical': 17,
 'abase': 18,
 'exalt': 19,
 'extoll': 20,
 'abash': 21,
 'embolden': 22,
 'reassure': 23,
 'abate': 24,
 'augment': 25,
 'abaxial': 26,
 'adaxial': 27,
 'abbreviate': 28,
 'lengthen': 29,
 'abderian': 30,
 'agelastic': 31,
 'abducent': 32,
 'adducent': 33,
 'abduction': 34,
 'adduction': 35,
 'abductive': 36,
 'deduce': 37,
 'abductor': 38,
 'abductee': 39,
 'adductor': 40,
 'abideable': 41,
 'insupportable': 42,
 'intolerable': 43,
 'unabideable': 44,
 'unbearable': 45,
 'abience': 46,
 'adience': 47,
 'abient': 48,
 'adient': 49,
 'ability': 50,
 'inability': 51,
 'abiogenesis': 52,
 'biogenesis': 53,
 'transformism': 54,
 'abjectly': 55,
 'proudly': 56,
 'abjugate': 57,
 'adjugate': 58,
 'able': 59,
 'can

In [9]:
concept2id['ab_extra']

0

In [10]:
id2concept[0]

'ab_extra'

### 查看 relation2id 內的東西

In [11]:
# dict of relation
# key = token name, value = id
relation2id

{'antonym': 0,
 'atlocation': 1,
 'capableof': 2,
 'causes': 3,
 'createdby': 4,
 'isa': 5,
 'desires': 6,
 'hassubevent': 7,
 'partof': 8,
 'hascontext': 9,
 'hasproperty': 10,
 'madeof': 11,
 'notcapableof': 12,
 'notdesires': 13,
 'receivesaction': 14,
 'relatedto': 15,
 'usedfor': 16}

In [12]:
id2relation

['antonym',
 'atlocation',
 'capableof',
 'causes',
 'createdby',
 'isa',
 'desires',
 'hassubevent',
 'partof',
 'hascontext',
 'hasproperty',
 'madeof',
 'notcapableof',
 'notdesires',
 'receivesaction',
 'relatedto',
 'usedfor']

## 將預處理的資料 整理成 QA Data Format
## Data format 資料格式
qa_data 格式 
- data : <tuple>
    - q_ids : <set> : q_ids 所有在 question 句子中的 phase verb noun 的 id
    - a_ids : <set> : a_ids 所有在 answer 句子中的 phase verb noun 的 id
    - QAcontext : <str> : 綜合 Question 以及 Answer 的文字，用空白互相格開

In [13]:
qa_data = []
statement_path = grounded_path.replace('grounded', 'statement')
statement_path

'/user_data/dggnn/data/MCQ/statement/test.statement.jsonl'

In [14]:
qa_data = []
with open(grounded_path, 'r', encoding='utf-8') as fin_ground:
    lines_ground = fin_ground.readlines()
    for j, line in enumerate(lines_ground):
        dic = json.loads(line)
        q_ids = set(concept2id[c] for c in dic['qc'])
        a_ids = set(concept2id[c] for c in dic['ac'])
        q_ids = q_ids - a_ids
        sentence = dic['sent'].replace(dic['ans'],'[MASK]')
        QAcontext = "{}.[SEP] {}.".format(sentence, dic['ans'])
        qa_data.append((q_ids, a_ids, QAcontext))

In [15]:
print('總共有',len(qa_data),'筆')

總共有 259 筆


In [16]:
qa_data[0]

({928,
  3996,
  4958,
  10627,
  10998,
  12541,
  15816,
  41365,
  74204,
  79152,
  80083,
  195118,
  380952,
  387476},
 {172893},
 '[MASK] is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species.[SEP] pheromone.')

In [17]:
dic = json.loads(line)
dic

{'sent': 'In sexual reproduction , sperm is the name of the gamete cell the male must contribute ',
 'ans': 'sperm',
 'qc': ['cell',
  'contribute',
  'gamete',
  'male',
  'must',
  'name',
  'reproduction',
  'sexual',
  'sexual_reproduction'],
 'ac': ['sperm']}

In [18]:
q_ids = set(concept2id[c] for c in dic['qc'])
a_ids = set(concept2id[c] for c in dic['ac'])
print('q_ids ',q_ids)
print('a_ids ',a_ids)

q_ids  {3013, 5414, 11274, 129516, 11310, 363984, 1460, 1462, 12119}
a_ids  {6010}


In [19]:
last_data = qa_data[-1]
print('q_ids = ',last_data[0])
print('a_ids = ',last_data[1])
print('QAcontext = ',last_data[2])

q_ids =  {3013, 5414, 11274, 129516, 11310, 363984, 1460, 1462, 12119}
a_ids =  {6010}
QAcontext =  In sexual reproduction , [MASK] is the name of the gamete cell the male must contribute .[SEP] sperm.


## concepts_to_adj_matrices_2hop_all_pair__use_LM__Part1

In [20]:
qc_ids, ac_ids, question = qa_data[-1]
# 合併 qc_ids 與 ac_ids
qa_nodes = set(qc_ids) | set(ac_ids)
qa_nodes

{1460, 1462, 3013, 5414, 6010, 11274, 11310, 12119, 129516, 363984}

In [21]:
question.replace('vector','[MASK]',1)

'In sexual reproduction , [MASK] is the name of the gamete cell the male must contribute .[SEP] sperm.'

### 載入 ChatGPT Candidate Set 並用來 retrieve 其他的 relevant node 

In [22]:
def read_chatgpt_data(item):
    path = '../candidate_data/chatgpt_prompt1/chatgpt_{}_candidate(cleaned).json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [23]:
chat_data = read_chatgpt_data('test')

In [24]:
print('ChatGPT')
print('question in first data = ',chat_data[0]['sentence'])
print('answer in first data = ',chat_data[0]['answer'])
print('distractors in first data = ',chat_data[0]['distractors'])
print('candidate_set in first data = ',chat_data[0]['candidate_set'])

ChatGPT
question in first data =  **blank** is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species
answer in first data =  pheromone
distractors in first data =  ['enzyme', 'isolate', 'amino']
candidate_set in first data =  ['Hormone', 'Enzyme', 'Antibody', 'Protease', 'Neurotransmitter', 'Toxin', 'Antioxidant', 'Catalyst', 'Carbohydrate', 'Lipid']


In [25]:
for i in range(len(qa_data)):
    print(i)
    candidate = chat_data[i]['candidate_set']
    pred = []
    for x in candidate:
        pred.append(x.lower())
    
    qa_data[i] = qa_data[i] + tuple(pred[:10])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258


In [26]:
qa_data[0]

({928,
  3996,
  4958,
  10627,
  10998,
  12541,
  15816,
  41365,
  74204,
  79152,
  80083,
  195118,
  380952,
  387476},
 {172893},
 '[MASK] is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species.[SEP] pheromone.',
 'hormone',
 'enzyme',
 'antibody',
 'protease',
 'neurotransmitter',
 'toxin',
 'antioxidant',
 'catalyst',
 'carbohydrate',
 'lipid')

## 加入 MLM Candidate Set 並用來 retrieve 其他的 relevant node 

In [27]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline

tokenizer = BertTokenizer.from_pretrained("AndyChiang/cdgp-csg-scibert-dgen")
csg_model = BertForMaskedLM.from_pretrained("AndyChiang/cdgp-csg-scibert-dgen")

2023-04-14 08:10:35.415144: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-14 08:10:35.533283: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-14 08:10:35.929993: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-04-14 08:10:35.930150: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

### K = 10，找 10 個 prediction prob 最高的當 distractor

In [28]:
unmasker = pipeline("fill-mask", tokenizer=tokenizer, model=csg_model, top_k=10)

In [29]:
## example
sent = "The only known planet with large amounts of water is [MASK]. [SEP] earth"
cs = unmasker(sent)
print(cs[0])

{'score': 0.3574290871620178, 'token': 15497, 'token_str': 'mars', 'sequence': 'the only known planet with large amounts of water is mars. earth'}


In [30]:
for i in range(len(qa_data)):
    sentence = chat_data[i]['sentence'].replace('**blank**','[MASK]')
    pred = []
    mlm_distractors_set = unmasker(sentence)
    if len(mlm_distractors_set) < 5:
        for each_distractor_set in mlm_distractors_set:
            for distractor in each_distractor_set:
                if distractor['token_str'] not in qa_data[i]:
                    pred.append(distractor['token_str'].lower())
    else:
        for distractor in mlm_distractors_set:
            if distractor['token_str'] not in qa_data[i]:
                pred.append(distractor['token_str'].lower())
    
    qa_data[i] = qa_data[i] + tuple(pred[:10])

In [31]:
qa_data[0]

({928,
  3996,
  4958,
  10627,
  10998,
  12541,
  15816,
  41365,
  74204,
  79152,
  80083,
  195118,
  380952,
  387476},
 {172893},
 '[MASK] is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species.[SEP] pheromone.',
 'hormone',
 'enzyme',
 'antibody',
 'protease',
 'neurotransmitter',
 'toxin',
 'antioxidant',
 'catalyst',
 'carbohydrate',
 'lipid',
 'chemical',
 'animal',
 'cocaine',
 'predator',
 'pain',
 'taste',
 'drug',
 'light',
 'fish',
 'physiology')

In [32]:
type(qa_data[1])

tuple

In [33]:
len(qa_data)

259

In [34]:
len(qa_data[0])

23

In [35]:
def concepts_to_adj_matrices_2hop_all_pair__use_LM__Part1(data):
    qc_ids = data[0]
    ac_ids = data[1]
    question = data[2]
    distractors_set = data[3:]
    extra_nodes = []
    for distractor in distractors_set:
        if distractor in concept2id:
            extra_nodes_ids = concept2id[distractor] 
            extra_nodes.append(extra_nodes_ids)
    return (sorted(qc_ids), sorted(ac_ids), question, extra_nodes)

In [36]:
res1 = list(tqdm(map(concepts_to_adj_matrices_2hop_all_pair__use_LM__Part1, qa_data), total=len(qa_data)))

100%|██████████| 259/259 [00:00<00:00, 96143.44it/s]


In [37]:
res1[0]

([928,
  3996,
  4958,
  10627,
  10998,
  12541,
  15816,
  41365,
  74204,
  79152,
  80083,
  195118,
  380952,
  387476],
 [172893],
 '[MASK] is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species.[SEP] pheromone.',
 [82112,
  80002,
  86218,
  179716,
  81011,
  52322,
  1152,
  2968,
  75321,
  150169,
  74204,
  928,
  68455,
  48142,
  12382,
  13342,
  20985,
  4519,
  2256,
  80083])

## concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3

In [38]:
# 考慮 全部的組合
def concepts2adj(node_ids):
    global id2relation
    cids = np.array(node_ids, dtype=np.int32)
    n_rel = len(id2relation)
    n_node = cids.shape[0]
    adj = np.zeros((n_rel, n_node, n_node), dtype=np.uint8)
    triplets = []
    for s in range(n_node):
        for t in range(n_node):
            s_c, t_c = cids[s], cids[t]
            if cpnet.has_edge(s_c, t_c):
                for e_attr in cpnet[s_c][t_c].values():
                    if e_attr['rel'] >= 0 and e_attr['rel'] < n_rel:
                        triplets.append([int(e_attr['rel']), int(s_c), int(t_c), e_attr['weight']])
                        adj[e_attr['rel']][s][t] = 1
    # cids += 1  # note!!! index 0 is reserved for padding
    adj = coo_matrix(adj.reshape(-1, n_node))
    return adj, cids, triplets

In [39]:
def concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3(data):
    qc_ids, ac_ids, question, extra_nodes = data
    # schema_graph = qc_ids + ac_ids + extra_nodes # <== 考慮全部的組合
    schema_graph = qc_ids + extra_nodes # <== 考慮 qc_ids 跟 extra_nodes 的組合 (without_ans)
    adj, concepts, triplets = concepts2adj(schema_graph)
    return {'triplets': triplets}

In [40]:
res3 = list(tqdm(map(concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3, res1), total=len(res1)))

100%|██████████| 259/259 [00:00<00:00, 757.15it/s]


In [41]:
res3[0]['triplets']

[[15, 928, 12541, 1.0],
 [15, 928, 79152, 0.755],
 [13, 928, 12382, 2.0],
 [15, 928, 2256, 1.0],
 [15, 10998, 20985, 1.0],
 [15, 12541, 928, 1.0],
 [15, 12541, 928, 1.0],
 [15, 15816, 10998, 2.0],
 [15, 74204, 20985, 1.0],
 [15, 79152, 195118, 1.0],
 [15, 387476, 41365, 1.0],
 [5, 80002, 2968, 2.0],
 [5, 179716, 80002, 2.0],
 [15, 179716, 80002, 1.0],
 [15, 2968, 74204, 1.0],
 [15, 2968, 80002, 1.0],
 [15, 2968, 74204, 1.0],
 [15, 75321, 928, 1.0],
 [15, 75321, 928, 1.0],
 [15, 74204, 20985, 1.0],
 [15, 928, 12541, 1.0],
 [15, 928, 79152, 0.755],
 [13, 928, 12382, 2.0],
 [15, 928, 2256, 1.0],
 [5, 68455, 74204, 2.0],
 [5, 68455, 74204, 2.0],
 [5, 68455, 20985, 0.5],
 [5, 48142, 928, 2.0],
 [15, 48142, 928, 1.0],
 [5, 48142, 928, 2.0],
 [15, 48142, 928, 1.0],
 [15, 13342, 10998, 0.761],
 [15, 13342, 74204, 1.0],
 [15, 13342, 74204, 1.0],
 [7, 20985, 10998, 2.0],
 [5, 20985, 74204, 2.0],
 [15, 20985, 74204, 0.189],
 [5, 20985, 74204, 2.0],
 [15, 20985, 74204, 0.189],
 [15, 20985, 68455, 

In [42]:
len(res3)

259

In [43]:
res4 = []

for item in tqdm(res3):
    temp_list = []
    for triplets in item['triplets']:
        rel, source_node, target_node, weight = triplets
        relation = id2relation[rel]
        source = id2concept[source_node]
        target = id2concept[target_node]
        temp_list.append([relation, source, target, weight])
    res4.append(temp_list)

100%|██████████| 259/259 [00:00<00:00, 40906.94it/s]


In [44]:
len(res4)

259

In [45]:
res4[0]

[['relatedto', 'animal', 'animals', 1.0],
 ['relatedto', 'animal', 'species', 0.755],
 ['notdesires', 'animal', 'pain', 2.0],
 ['relatedto', 'animal', 'fish', 1.0],
 ['relatedto', 'use', 'drug', 1.0],
 ['relatedto', 'animals', 'animal', 1.0],
 ['relatedto', 'animals', 'animal', 1.0],
 ['relatedto', 'used', 'use', 2.0],
 ['relatedto', 'chemical', 'drug', 1.0],
 ['relatedto', 'species', 'specie', 1.0],
 ['relatedto', 'affects', 'affect', 1.0],
 ['isa', 'enzyme', 'catalyst', 2.0],
 ['isa', 'protease', 'enzyme', 2.0],
 ['relatedto', 'protease', 'enzyme', 1.0],
 ['relatedto', 'catalyst', 'chemical', 1.0],
 ['relatedto', 'catalyst', 'enzyme', 1.0],
 ['relatedto', 'catalyst', 'chemical', 1.0],
 ['relatedto', 'carbohydrate', 'animal', 1.0],
 ['relatedto', 'carbohydrate', 'animal', 1.0],
 ['relatedto', 'chemical', 'drug', 1.0],
 ['relatedto', 'animal', 'animals', 1.0],
 ['relatedto', 'animal', 'species', 0.755],
 ['notdesires', 'animal', 'pain', 2.0],
 ['relatedto', 'animal', 'fish', 1.0],
 ['i

In [46]:
output_path = 'augmentation/chatgpt_mlm_test_triplet(without_ans).json'

In [47]:
with open(output_path, 'w') as fout:
    json.dump(res4, fout)
print(f'data saved to {output_path}')

data saved to augmentation/chatgpt_mlm_test_triplet(without_ans).json


In [48]:
rel, source_node, target_node, weight = res3[0]['triplets'][0]
print('relation = ',id2relation[rel])
print('source_node = ',id2concept[source_node])
print('target_node = ',id2concept[target_node])
print('weight = ',weight)

relation =  relatedto
source_node =  animal
target_node =  animals
weight =  1.0
