## Convert MCQ Dataset to Grounded Type (Original Data with Entities)

In [1]:
import spacy
from spacy.matcher import Matcher
import nltk
import string
import json
import re
import sys
import uuid
from tqdm import tqdm

In [None]:
nltk.download('stopwords', quiet=True)
nltk_stopwords = nltk.corpus.stopwords.words('english')

In [2]:
dg_file = './data/mcq/total_new_cleaned_train.json'
output_file = './data/mcq/grounded/train.grounded.json'
CPNET_VOCAB = './data/cpnet/concept.txt'
PATTERN_PATH = './data/cpnet/matcher_patterns.json'

In [3]:
with open(dg_file, 'r') as f:
    data = json.load(f)

## Replace **blank** with distractor in statment and label false

In [4]:
for item in data:
    statements = []

    sentence_text = item['sentence']
    distractors = item['distractors']
    answer_text = item['answer']

    stem = sentence_text.replace('**blank**',answer_text)
    statements.append({'label':True,'stem':stem})
    for distractor in distractors:
        stem = sentence_text.replace('**blank**',distractor)
        statements.append({'label':False,'stem':stem})
    item['statements'] = statements

In [5]:
data[0]

{'answer': 'gravity',
 'distractors': ['friction', 'erosion', 'magnetism'],
 'sentence': '**blank** causes rocks to roll downhill',
 'statements': [{'label': True,
   'stem': 'gravity causes rocks to roll downhill'},
  {'label': False, 'stem': 'friction causes rocks to roll downhill'},
  {'label': False, 'stem': 'erosion causes rocks to roll downhill'},
  {'label': False, 'stem': 'magnetism causes rocks to roll downhill'}]}

## Ground

In [6]:
nlp = None
matcher = None

In [7]:
def load_cpnet_vocab(cpnet_vocab_path):
    with open(cpnet_vocab_path, "r", encoding="utf8") as fin:
        cpnet_vocab = [l.strip() for l in fin]
    cpnet_vocab = [c.replace("_", " ") for c in cpnet_vocab]
    return cpnet_vocab

In [8]:
global PATTERN_PATH, CPNET_VOCAB
if PATTERN_PATH is None:
    PATTERN_PATH = pattern_path
    CPNET_VOCAB = load_cpnet_vocab(cpnet_vocab_path)

In [9]:
sents = []
answers = []
distractors = []

for item in data:

    sentence_text = item['sentence']
    distractors_list = item['distractors']
    answer_text = item['answer']

    statement = sentence_text.replace('**blank**',answer_text)
    sents.append(statement)
    answers.append(answer_text)
    distractors.append(distractors_list)

In [10]:
print(sents[0])
print(answers[0])
print(distractors[0])

gravity causes rocks to roll downhill
gravity
['friction', 'erosion', 'magnetism']


In [11]:
print(len(sents))

2321


In [12]:
print(len(answers))

2321


In [13]:
print(len(distractors))

2321


### find the entity in each sentence

In [27]:
def lemmatize(nlp, concept):
    doc = nlp(concept.replace("_", " "))
    lcs = set()
    lcs.add("_".join([token.lemma_ for token in doc]))  # all lemma
    return lcs

In [28]:
def ground_qa_pair(qa_pair):
    global nlp, matcher
    if nlp is None or matcher is None:
        nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'textcat'])
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        matcher = load_matcher(nlp, PATTERN_PATH)

    s, a = qa_pair
    all_concepts = ground_mentioned_concepts(nlp, matcher, s, a)
    answer_concepts = ground_mentioned_concepts(nlp, matcher, a)
    question_concepts = all_concepts - answer_concepts
    if len(question_concepts) == 0:
        question_concepts = hard_ground(nlp, s, CPNET_VOCAB)  # not very possible

    if len(answer_concepts) == 0:
        answer_concepts = hard_ground(nlp, a, CPNET_VOCAB)  # some case

    # question_concepts = question_concepts -  answer_concepts
    question_concepts = sorted(list(question_concepts))
    answer_concepts = sorted(list(answer_concepts))
    return {"sent": s, "ans": a, "qc": question_concepts, "ac": answer_concepts}

In [29]:
def hard_ground(nlp, sent, cpnet_vocab):
    sent = sent.lower()
    doc = nlp(sent)
    res = set()
    for t in doc:
        if t.lemma_ in cpnet_vocab:
            res.add(t.lemma_)
    sent = " ".join([t.text for t in doc])
    if sent in cpnet_vocab:
        res.add(sent)
    try:
        assert len(res) > 0
    except Exception:
        print(f"for {sent}, concept not found in hard grounding.")
    return res

In [30]:
def load_matcher(nlp, pattern_path):
    with open(pattern_path, "r", encoding="utf8") as fin:
        all_patterns = json.load(fin)

    matcher = Matcher(nlp.vocab)
    for concept, pattern in all_patterns.items():
        matcher.add(concept, None, pattern)
    return matcher

In [31]:
def ground_mentioned_concepts(nlp, matcher, s, ans=None):

    s = s.lower()
    doc = nlp(s)
    matches = matcher(doc)

    mentioned_concepts = set()
    span_to_concepts = {}

    if ans is not None:
        ans_matcher = Matcher(nlp.vocab)
        ans_words = nlp(ans)
        # print(ans_words)
        ans_matcher.add(ans, None, [{'TEXT': token.text.lower()} for token in ans_words])

        ans_match = ans_matcher(doc)
        ans_mentions = set()
        for _, ans_start, ans_end in ans_match:
            ans_mentions.add((ans_start, ans_end))

    for match_id, start, end in matches:
        if ans is not None:
            if (start, end) in ans_mentions:
                continue

        span = doc[start:end].text  # the matched span

        # a word that appears in answer is not considered as a mention in the question
        # if len(set(span.split(" ")).intersection(set(ans.split(" ")))) > 0:
        #     continue
        original_concept = nlp.vocab.strings[match_id]
        original_concept_set = set()
        original_concept_set.add(original_concept)

        # print("span", span)
        # print("concept", original_concept)
        # print("Matched '" + span + "' to the rule '" + string_id)

        # why do you lemmatize a mention whose len == 1?

        if len(original_concept.split("_")) == 1:
            # tag = doc[start].tag_
            # if tag in ['VBN', 'VBG']:

            original_concept_set.update(lemmatize(nlp, nlp.vocab.strings[match_id]))

        if span not in span_to_concepts:
            span_to_concepts[span] = set()

        span_to_concepts[span].update(original_concept_set)

    for span, concepts in span_to_concepts.items():
        concepts_sorted = list(concepts)
        # print("span:")
        # print(span)
        # print("concept_sorted:")
        # print(concepts_sorted)
        concepts_sorted.sort(key=len)

        # mentioned_concepts.update(concepts_sorted[0:2])

        shortest = concepts_sorted[0:3]

        for c in shortest:
            if c in blacklist:
                continue

            # a set with one string like: set("like_apples")
            lcs = lemmatize(nlp, c)
            intersect = lcs.intersection(shortest)
            if len(intersect) > 0:
                mentioned_concepts.add(list(intersect)[0])
            else:
                mentioned_concepts.add(c)

        # if a mention exactly matches with a concept

        exact_match = set([concept for concept in concepts_sorted if concept.replace("_", " ").lower() == span.lower()])
        # print("exact match:")
        # print(exact_match)
        assert len(exact_match) < 2
        mentioned_concepts.update(exact_match)

    return mentioned_concepts

In [32]:
blacklist = set(["-PRON-", "actually", "likely", "possibly", "want",
                 "make", "my", "someone", "sometimes_people", "sometimes", "would", "want_to",
                 "one", "something", "sometimes", "everybody", "somebody", "could", "could_be"
                 ])

In [33]:
res = []

In [34]:
res = list(tqdm(map(ground_qa_pair, zip(sents, answers)), total=len(sents)))

 13%|█▎        | 313/2321 [02:09<16:30,  2.03it/s]

for co2, concept not found in hard grounding.


 16%|█▌        | 371/2321 [02:32<12:12,  2.66it/s]

for co2, concept not found in hard grounding.


 17%|█▋        | 386/2321 [02:40<16:38,  1.94it/s]

for cho, concept not found in hard grounding.


 17%|█▋        | 392/2321 [02:42<11:21,  2.83it/s]

for gcm3, concept not found in hard grounding.


 20%|█▉        | 462/2321 [03:09<13:45,  2.25it/s]

for $ 150, concept not found in hard grounding.


 20%|█▉        | 463/2321 [03:10<14:23,  2.15it/s]

for $ 300, concept not found in hard grounding.


 20%|█▉        | 464/2321 [03:10<15:33,  1.99it/s]

for $ 17, concept not found in hard grounding.


 20%|██        | 465/2321 [03:11<18:01,  1.72it/s]

for $ 25, concept not found in hard grounding.


 20%|██        | 466/2321 [03:12<18:44,  1.65it/s]

for $ 125, concept not found in hard grounding.


 20%|██        | 470/2321 [03:14<17:47,  1.73it/s]

for $ 25,000, concept not found in hard grounding.


 20%|██        | 472/2321 [03:15<17:46,  1.73it/s]

for $ 10,000, concept not found in hard grounding.


 20%|██        | 473/2321 [03:16<17:04,  1.80it/s]

for $ 40,000, concept not found in hard grounding.


 20%|██        | 474/2321 [03:16<19:36,  1.57it/s]

for $ 2,100, concept not found in hard grounding.


 21%|██        | 476/2321 [03:17<15:29,  1.99it/s]

for p3, concept not found in hard grounding.


 21%|██        | 477/2321 [03:17<12:57,  2.37it/s]

for $ 7.50, concept not found in hard grounding.


 21%|██        | 478/2321 [03:19<23:24,  1.31it/s]

for $ 140,000, concept not found in hard grounding.


 21%|██        | 484/2321 [03:23<23:11,  1.32it/s]

for $ 90,000, concept not found in hard grounding.


 21%|██        | 486/2321 [03:24<24:18,  1.26it/s]

for $ 1.99, concept not found in hard grounding.


 21%|██        | 487/2321 [03:25<22:18,  1.37it/s]

for 2 %, concept not found in hard grounding.


 21%|██        | 489/2321 [03:26<19:00,  1.61it/s]

for 1%-3 %, concept not found in hard grounding.


 21%|██        | 490/2321 [03:26<17:55,  1.70it/s]

for $ 13,333, concept not found in hard grounding.


 21%|██        | 491/2321 [03:27<19:15,  1.58it/s]

for $ 100,000, concept not found in hard grounding.


 21%|██        | 492/2321 [03:28<20:27,  1.49it/s]

for 2 %, concept not found in hard grounding.


 22%|██▏       | 507/2321 [03:35<13:23,  2.26it/s]

for 1920s, concept not found in hard grounding.


 22%|██▏       | 508/2321 [03:36<13:41,  2.21it/s]

for 1,500, concept not found in hard grounding.


 22%|██▏       | 520/2321 [03:39<08:27,  3.55it/s]

for 1850s, concept not found in hard grounding.


 23%|██▎       | 527/2321 [03:42<10:56,  2.73it/s]

for hyperpluralist, concept not found in hard grounding.


 23%|██▎       | 529/2321 [03:43<11:29,  2.60it/s]

for 20,000, concept not found in hard grounding.


 24%|██▎       | 551/2321 [03:51<09:54,  2.98it/s]

for 4,000,000, concept not found in hard grounding.


 24%|██▍       | 558/2321 [03:54<09:14,  3.18it/s]

for 1970s, concept not found in hard grounding.


 24%|██▍       | 560/2321 [03:54<10:27,  2.81it/s]

for $ 46,000, concept not found in hard grounding.


 24%|██▍       | 564/2321 [03:57<13:27,  2.18it/s]

for 10,000, concept not found in hard grounding.


 25%|██▍       | 580/2321 [04:03<14:39,  1.98it/s]

for montesqiueu, concept not found in hard grounding.


 28%|██▊       | 655/2321 [04:41<11:42,  2.37it/s]

for 15 %, concept not found in hard grounding.


 28%|██▊       | 661/2321 [04:44<10:55,  2.53it/s]

for ch\'in, concept not found in hard grounding.


 29%|██▊       | 663/2321 [04:44<09:35,  2.88it/s]

for 10 %, concept not found in hard grounding.


 29%|██▉       | 679/2321 [04:50<11:10,  2.45it/s]

for all, concept not found in hard grounding.


 30%|██▉       | 687/2321 [04:53<11:17,  2.41it/s]

for wants, concept not found in hard grounding.


 34%|███▍      | 791/2321 [05:55<13:57,  1.83it/s]

for more - for - the - same, concept not found in hard grounding.


 34%|███▍      | 798/2321 [06:01<20:56,  1.21it/s]

for more - for - more, concept not found in hard grounding.


 39%|███▊      | 895/2321 [06:58<12:38,  1.88it/s]

for preapproach, concept not found in hard grounding.


 39%|███▉      | 905/2321 [07:03<12:07,  1.95it/s]

for b2c, concept not found in hard grounding.


 39%|███▉      | 906/2321 [07:04<12:10,  1.94it/s]

for c2b, concept not found in hard grounding.


 40%|███▉      | 918/2321 [07:14<24:22,  1.04s/it]

for nicher, concept not found in hard grounding.


 40%|███▉      | 923/2321 [07:17<14:40,  1.59it/s]

for own, concept not found in hard grounding.


 44%|████▎     | 1014/2321 [07:57<08:56,  2.43it/s]

for ebbinghaus, concept not found in hard grounding.


 45%|████▌     | 1052/2321 [08:13<07:45,  2.73it/s]

for i d, concept not found in hard grounding.


 45%|████▌     | 1053/2321 [08:14<08:08,  2.60it/s]

for i d, concept not found in hard grounding.


 47%|████▋     | 1084/2321 [08:27<08:53,  2.32it/s]

for locura, concept not found in hard grounding.


 47%|████▋     | 1101/2321 [08:36<12:09,  1.67it/s]

for pseudo-, concept not found in hard grounding.


 48%|████▊     | 1105/2321 [08:38<09:29,  2.14it/s]

for maslow, concept not found in hard grounding.


 51%|█████     | 1182/2321 [09:08<06:21,  2.98it/s]

for corticalization, concept not found in hard grounding.


 51%|█████▏    | 1193/2321 [09:12<07:28,  2.52it/s]

for pinel, concept not found in hard grounding.


 53%|█████▎    | 1219/2321 [09:20<04:24,  4.16it/s]

for 2,000, concept not found in hard grounding.


 53%|█████▎    | 1230/2321 [09:22<04:41,  3.87it/s]

for alektorphobic, concept not found in hard grounding.


 53%|█████▎    | 1236/2321 [09:24<04:35,  3.93it/s]

for u.s, concept not found in hard grounding.


 53%|█████▎    | 1240/2321 [09:25<05:27,  3.30it/s]

for 1,000, concept not found in hard grounding.


 54%|█████▍    | 1256/2321 [09:29<04:16,  4.16it/s]

for anheuser - busch, concept not found in hard grounding.


 56%|█████▋    | 1306/2321 [09:44<05:18,  3.19it/s]

for one, concept not found in hard grounding.


 56%|█████▋    | 1310/2321 [09:45<04:27,  3.77it/s]

for one, concept not found in hard grounding.


 57%|█████▋    | 1316/2321 [09:47<05:19,  3.14it/s]

for one, concept not found in hard grounding.


 58%|█████▊    | 1345/2321 [09:54<03:58,  4.09it/s]

for 1970s, concept not found in hard grounding.


 62%|██████▏   | 1430/2321 [10:18<03:23,  4.37it/s]

for silcon, concept not found in hard grounding.


 68%|██████▊   | 1577/2321 [10:57<03:17,  3.77it/s]

for one, concept not found in hard grounding.


 78%|███████▊  | 1806/2321 [12:05<01:57,  4.39it/s]

for glycolsis, concept not found in hard grounding.


 80%|████████  | 1866/2321 [12:23<02:18,  3.28it/s]

for one, concept not found in hard grounding.


 81%|████████  | 1872/2321 [12:25<02:15,  3.31it/s]

for plasmodesmata, concept not found in hard grounding.


 93%|█████████▎| 2169/2321 [13:58<00:48,  3.14it/s]

for arcualia, concept not found in hard grounding.


 98%|█████████▊| 2278/2321 [14:33<00:15,  2.78it/s]

for adaptatioins, concept not found in hard grounding.


 99%|█████████▉| 2302/2321 [14:41<00:05,  3.23it/s]

for sorroundings, concept not found in hard grounding.


100%|██████████| 2321/2321 [14:46<00:00,  2.62it/s]


In [35]:
res[1]

{'sent': "meter of measurement describes an object 's length",
 'ans': 'meter',
 'qc': ['describe',
  'describes',
  'length',
  'measurement',
  'object',
  'of_measurement'],
 'ac': ['meter']}

In [36]:
def prune(data, cpnet_vocab_path):
    # reload cpnet_vocab
    with open(cpnet_vocab_path, "r", encoding="utf8") as fin:
        cpnet_vocab = [l.strip() for l in fin]

    prune_data = []
    for item in tqdm(data):
        qc = item["qc"]
        prune_qc = []
        for c in qc:
            if c[-2:] == "er" and c[:-2] in qc:
                continue
            if c[-1:] == "e" and c[:-1] in qc:
                continue
            have_stop = False
            # remove all concepts having stopwords, including hard-grounded ones
            for t in c.split("_"):
                if t in nltk_stopwords:
                    have_stop = True
            if not have_stop and c in cpnet_vocab:
                prune_qc.append(c)

        ac = item["ac"]
        prune_ac = []
        for c in ac:
            if c[-2:] == "er" and c[:-2] in ac:
                continue
            if c[-1:] == "e" and c[:-1] in ac:
                continue
            all_stop = True
            for t in c.split("_"):
                if t not in nltk_stopwords:
                    all_stop = False
            if not all_stop and c in cpnet_vocab:
                prune_ac.append(c)

        try:
            assert len(prune_ac) > 0 and len(prune_qc) > 0
        except Exception as e:
            pass
            # print("In pruning")
            # print(prune_qc)
            # print(prune_ac)
            # print("original:")
            # print(qc)
            # print(ac)
            # print()
        item["qc"] = prune_qc
        item["ac"] = prune_ac

        prune_data.append(item)
    return prune_data

In [37]:
cpnet_vocab_path = CPNET_VOCAB

In [38]:
res = prune(res, cpnet_vocab_path)

  0%|          | 0/2321 [00:00<?, ?it/s]


NameError: name 'nltk_stopwords' is not defined

In [None]:
res[0]

In [None]:
with open(output_path, 'w') as fout:
    for dic in res:
        fout.write(json.dumps(dic) + '\n')