In [2]:
from transformers import pipeline
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [3]:
question_answerer = pipeline("question-answering", model='/scratch/scratch8/madhurjindal/ACS-QG-Scratch/models/distilbert-base-cased-distilled-squad')

In [4]:
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. 
If you would like to fine-tune a model on a SQuAD task, 
you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""

result = question_answerer(question="What is a good example of a question answering dataset?",     context=context)
print(
f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160


In [5]:
question_answerer(question=["What is a good example of a question answering dataset?", "What is a good example of a question answering dataset?"], context=[context, context])

[{'score': 0.5152307152748108,
  'start': 147,
  'end': 160,
  'answer': 'SQuAD dataset'},
 {'score': 0.5152307152748108,
  'start': 147,
  'end': 160,
  'answer': 'SQuAD dataset'}]

In [6]:
question_answerer.tokenizer(["Hello I am Madhur", "Hello you"])

{'input_ids': [[101, 8667, 146, 1821, 10779, 26033, 102], [101, 8667, 1128, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]]}

In [7]:
def get_tokens(text):
    return question_answerer.tokenizer(text)['input_ids']

In [8]:
get_tokens("Hello you")

[101, 8667, 1128, 102]

In [9]:
data = pd.read_csv("/scratch/scratch8/madhurjindal/ACS-QG-Scratch/Datasets/processed/SQuAD2.0/train.qa.0_10000.qg.generated.gpt2.txt", sep="\t", index_col=0)

In [10]:
data.question.to_list()

["Who is Beyoncé's mom?",
 "What is the middle name of Beyoncé's mother?",
 'What kind of musician does Beyoncé have?',
 'Which activity is Beyonce an avid singer of?',
 "Name the date of Beyonce's first public performance?",
 "What was the date of Beyonce's first public appearance as a lesbian?",
 "Describe the date of Beyonce's American debut?",
 'On what date was Beyonce born?',
 "What is Beyonce's occupation?",
 'Which job does Beyonce have?',
 "What is Beyonce's occupation?",
 'Which career path does Beyonce take?',
 "What is Beyoncé's middle name?",
 'When was she born?',
 "What were Beyoncé's childhood competitions in the 1990s?",
 'How did Beyonce perform in the 1990s?',
 'What did Beyoncé grow up in?',
 'How did Beyoncé become a lead singer in the 1990s?',
 'What city is Beyonce from?',
 "Who was Beyoncé's mother raised in?",
 'What city is Beyoncé from?',
 'Who was Beyonce raised in?',
 'When did Beyonce become a singer?',
 'What decade did Beyonce become popular for R&B?',
 

In [11]:
gen_ans= question_answerer(question=data.question.to_list(), context=data.paragraph.to_list())
gen_ans = pd.DataFrame(gen_ans)

In [12]:
gen_ans

Unnamed: 0,score,start,end,answer
0,0.574629,0,30,Beyoncé Giselle Knowles-Carter
1,0.513843,8,30,Giselle Knowles-Carter
2,0.170980,98,116,"singer, songwriter"
3,0.348507,106,116,songwriter
4,0.789573,64,81,"September 4, 1981"
...,...,...,...,...
267,0.968048,73,88,Destiny's Child
268,0.977698,73,88,Destiny's Child
269,0.817540,25,32,Solange
270,0.997738,25,32,Solange


In [13]:
data['qa_ans'] = gen_ans['answer']
data['qa_ans_score'] = gen_ans['score']

In [14]:
data

Unnamed: 0,pid,sid,question,answer,paragraph,ans_start,clue,clue_start,ques_type,qa_ans,qa_ans_score
0,0,0,Who is Beyoncé's mom?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,),56,Who,Beyoncé Giselle Knowles-Carter,0.574629
1,0,1,What is the middle name of Beyoncé's mother?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,),56,What,Giselle Knowles-Carter,0.513843
2,0,2,What kind of musician does Beyoncé have?,record,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,118,an,86,What,"singer, songwriter",0.170980
3,0,3,Which activity is Beyonce an avid singer of?,record,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,118,an,86,Which,songwriter,0.348507
4,0,4,Name the date of Beyonce's first public perfor...,"September 4, 1981",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,64,),56,Other,"September 4, 1981",0.789573
...,...,...,...,...,...,...,...,...,...,...,...
267,19,267,What is Beyonce's younger sister Beyonce a mem...,Solange,Beyoncé's younger sister Solange is also a sin...,25,a singer,41,What,Destiny's Child,0.968048
268,19,268,Who is Beyonce's sister Solange a member of?,Solange,Beyoncé's younger sister Solange is also a sin...,25,a former member,54,Who,Destiny's Child,0.977698
269,19,269,Beyonce was a former member of what Destiny's ...,Solange,Beyoncé's younger sister Solange is also a sin...,25,a former member,54,What,Solange,0.817540
270,19,270,What is Beyoncé's sister Beyonce a singer?,Beyoncé's younger sister Solange,Beyoncé's younger sister Solange is also a sin...,0,a singer,41,What,Solange,0.997738


In [15]:
get_tokens(data.qa_ans.to_list())

[[101, 24041, 144, 22080, 25384, 118, 5007, 102],
 [101, 144, 22080, 25384, 118, 5007, 102],
 [101, 2483, 117, 5523, 102],
 [101, 5523, 102],
 [101, 1347, 125, 117, 2358, 102],
 [101, 1347, 125, 117, 2358, 102],
 [101, 1347, 125, 117, 2358, 102],
 [101, 1347, 125, 117, 2358, 102],
 [101, 2483, 117, 5523, 117, 1647, 2451, 102],
 [101, 1647, 2451, 102],
 [101, 2483, 117, 5523, 117, 1647, 2451, 102],
 [101, 2483, 117, 5523, 117, 1647, 2451, 102],
 [101, 144, 22080, 25384, 118, 5007, 102],
 [101, 1347, 125, 117, 2358, 102],
 [101, 4241, 1105, 5923, 102],
 [101,
  1730,
  2483,
  1104,
  155,
  111,
  139,
  1873,
  118,
  1372,
  16784,
  112,
  188,
  6405,
  102],
 [101, 4666, 102],
 [101, 16784, 112, 188, 6405, 102],
 [101, 4666, 102],
 [101, 4666, 117, 2245, 102],
 [101, 4666, 102],
 [101, 4666, 117, 2245, 102],
 [101, 1523, 3281, 102],
 [101, 3281, 102],
 [101, 4666, 117, 2245, 102],
 [101, 4666, 117, 2245, 102],
 [101, 4666, 117, 2245, 102],
 [101, 2245, 102],
 [101, 2268, 15841, 111

In [16]:
from collections import Counter

def f1_score(pred_text, targ_text):
    pred_tokens = get_tokens(pred_text)
    targ_tokens = get_tokens(targ_text)
    print(pred_tokens)
    common = Counter(pred_tokens) & Counter(targ_tokens)
    num_com = sum(common.values())
    if len(pred_tokens) == 0 or len(targ_tokens)==0:
        # If either is no-answer, then f1 is 1 if they agree, 0 otherwise
        return int(pred_tokens==targ_tokens)

    if num_com==0:
        return 0
    precision = 1.0 * num_com/len(pred_tokens)
    recall = 1.0 * num_com/len(targ_tokens)
    f1 = (2 * precision * recall)/ (precision + recall)
    return f1

In [17]:
f1_scores = list(map(lambda x: f1_score(*x), zip(data.answer.to_list(), data.qa_ans.to_list())))

[101, 24041, 144, 22080, 25384, 118, 5007, 113, 120, 100, 120, 17775, 118, 162, 11414, 118, 1474, 114, 102]
[101, 24041, 144, 22080, 25384, 118, 5007, 113, 120, 100, 120, 17775, 118, 162, 11414, 118, 1474, 114, 102]
[101, 1647, 102]
[101, 1647, 102]
[101, 1347, 125, 117, 2358, 102]
[101, 1347, 125, 117, 2358, 102]
[101, 1347, 125, 117, 2358, 102]
[101, 1347, 125, 117, 2358, 102]
[101, 2451, 102]
[101, 2451, 102]
[101, 2451, 102]
[101, 2451, 102]
[101, 125, 102]
[101, 125, 102]
[101, 1672, 4241, 1105, 5923, 6025, 102]
[101, 1672, 4241, 1105, 5923, 6025, 102]
[101, 1672, 4241, 1105, 5923, 6025, 102]
[101, 1672, 4241, 1105, 5923, 6025, 102]
[101, 4666, 117, 2245, 102]
[101, 4666, 117, 2245, 102]
[101, 4666, 117, 2245, 102]
[101, 4666, 117, 2245, 102]
[101, 1103, 1523, 3281, 102]
[101, 1103, 1523, 3281, 102]
[101, 2245, 102]
[101, 2245, 102]
[101, 2245, 102]
[101, 2245, 102]
[101, 1141, 102]
[101, 1141, 102]
[101, 1141, 102]
[101, 1141, 102]
[101, 1103, 1362, 112, 188, 102]
[101, 1103, 136

In [18]:
data['qa_f1_score'] = f1_scores

In [19]:
data

Unnamed: 0,pid,sid,question,answer,paragraph,ans_start,clue,clue_start,ques_type,qa_ans,qa_ans_score,qa_f1_score
0,0,0,Who is Beyoncé's mom?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,),56,Who,Beyoncé Giselle Knowles-Carter,0.574629,0.592593
1,0,1,What is the middle name of Beyoncé's mother?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0,),56,What,Giselle Knowles-Carter,0.513843,0.538462
2,0,2,What kind of musician does Beyoncé have?,record,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,118,an,86,What,"singer, songwriter",0.170980,0.500000
3,0,3,Which activity is Beyonce an avid singer of?,record,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,118,an,86,Which,songwriter,0.348507,0.666667
4,0,4,Name the date of Beyonce's first public perfor...,"September 4, 1981",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,64,),56,Other,"September 4, 1981",0.789573,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
267,19,267,What is Beyonce's younger sister Beyonce a mem...,Solange,Beyoncé's younger sister Solange is also a sin...,25,a singer,41,What,Destiny's Child,0.968048,0.363636
268,19,268,Who is Beyonce's sister Solange a member of?,Solange,Beyoncé's younger sister Solange is also a sin...,25,a former member,54,Who,Destiny's Child,0.977698,0.363636
269,19,269,Beyonce was a former member of what Destiny's ...,Solange,Beyoncé's younger sister Solange is also a sin...,25,a former member,54,What,Solange,0.817540,1.000000
270,19,270,What is Beyoncé's sister Beyonce a singer?,Beyoncé's younger sister Solange,Beyoncé's younger sister Solange is also a sin...,0,a singer,41,What,Solange,0.997738,0.666667


In [22]:
data[data.qa_f1_score >=0.9].to_csv("/scratch/scratch8/madhurjindal/ACS-QG-Scratch/Datasets/processed/SQuAD2.0/train.qa.0_10000.qg.generated.gpt2.qa.txt", sep="\t")