# Question generation tests
Let's test how well we can generate questions for the "clean" CNN data with crowdsourced questions and the noisy NYT data with user-generated questions.

In [1]:
## load data
import pandas as pd
import os
import re
cnn_article_dir = '../../data/CNN_articles/cnn/stories/'
# full path of every entry in the story directory (order follows os.listdir)
cnn_article_files = [os.path.join(cnn_article_dir, file_name) for file_name in os.listdir(cnn_article_dir)]

def _read_article_text(article_file):
    """
    Read one story file and collapse it to a single string.

    Every line is stripped of surrounding whitespace and the lines are
    joined with single spaces. The file is opened in a `with` block so the
    handle is closed as soon as the text has been read.

    :param article_file: path of the story file to read
    :returns: the article text as one string
    """
    with open(article_file, 'r') as article_input:
        return ' '.join(line.strip() for line in article_input)

# raw string: '\.' inside a plain string literal is an invalid escape sequence
article_id_matcher = re.compile(r'[a-z0-9]+(?=\.story)')
# article ID = run of lowercase alphanumerics directly before ".story" in the path
cnn_article_ids = [article_id_matcher.search(article_file).group(0) for article_file in cnn_article_files]
cnn_article_data = pd.DataFrame({
    'article_text' : [_read_article_text(article_file) for article_file in cnn_article_files],
    'article_id' : cnn_article_ids,
})
display(cnn_article_data.head())

Unnamed: 0,article_text,article_id
0,(CNN) -- It's been a busy few days in the worl...,23f7ab281ef416e1cdcd9c6fc359278a40a6ba34
1,Ten loveable mutts rescued from the streets of...,a8ac73078f8774a5addac799a22d0634278e79d1
2,Ken Henggeler poured his grief into the thing ...,022713fd20ac0ffd0ddccec6d9389beedd681893
3,Tokyo (CNN) -- Japan's prime minister vowed to...,d62d6364625d1522c55024486ec25851687935e3
4,(CNN) -- The leadership of North Korea appeare...,911597eecfe5d00d39de94f17c2a8400fcb4b994


In [34]:
cnn_question_file = '../../data/CNN_articles/cnn/newsqa-data-v1/newsqa-data-v1.csv'
# only load the columns that are used below
full_cnn_question_data = pd.read_csv(
    cnn_question_file, sep=',', index_col=False,
    usecols=['story_id', 'question', 'is_answer_absent', 'is_question_bad', 'answer_char_ranges'],
)
# rename by reassignment (no inplace=True) so the loaded frame is never mutated in place
full_cnn_question_data = full_cnn_question_data.rename(columns={'story_id' : 'article_id'})
# get article IDs: the run of alphanumerics directly before ".story"
# raw string: '\.' inside a plain string literal is an invalid escape sequence
cnn_article_id_matcher = re.compile(r'[0-9a-zA-Z]+(?=\.story)')
full_cnn_question_data = full_cnn_question_data.assign(**{
    'article_id' : full_cnn_question_data.loc[:, 'article_id'].apply(
        lambda story_id: cnn_article_id_matcher.search(story_id).group(0)
    )
})
# clean answers: split the "|"-separated entries and drop the literal 'None' placeholders
full_cnn_question_data = full_cnn_question_data.assign(**{
    'answer_char_ranges' : full_cnn_question_data.loc[:, 'answer_char_ranges'].apply(
        lambda ranges_txt: [entry for entry in ranges_txt.split('|') if entry != 'None']
    ),
})

def extract_char_range(txt):
    """
    Parse a comma-separated string of "start:end" spans into integer pairs.

    :param txt: string such as "34:60,1610:1618"
    :returns: list of integer lists, one per span, in input order
    """
    return [[int(bound) for bound in span.split(':')] for span in txt.split(',')]
def extract_all_char_ranges(txt):
    """
    Parse every span string in a collection and flatten the results.

    :param txt: iterable of span strings, each accepted by extract_char_range
    :returns: one flat list holding the integer pairs from every entry, in order
    """
    return [char_range for entry in txt for char_range in extract_char_range(entry)]
# parse the span strings into integer pairs, drop questions whose quality
# rating is the unclear '?' marker, then make the rating numeric
# NOTE: we may need the different "bad" ratings later when testing word overlap
full_cnn_question_data = (
    full_cnn_question_data
    .assign(clean_answer_char_ranges=lambda df: df['answer_char_ranges'].apply(extract_all_char_ranges))
    .loc[lambda df: df['is_question_bad'] != '?']
    .assign(is_question_bad=lambda df: df['is_question_bad'].astype(float))
)
# keep only questions that have an answer and were not rated bad
cnn_question_data = full_cnn_question_data.loc[
    lambda df: (df['is_answer_absent'] == 0.) & (df['is_question_bad'] == 0.)
]
display(cnn_question_data.head())


Unnamed: 0,article_id,question,answer_char_ranges,is_answer_absent,is_question_bad,clean_answer_char_ranges
0,42d01e187213e86f5fe617fe32e716ff7fa3afc4,What was the amount of children murdered?,[294:297],0.0,0.0,"[[294, 297]]"
1,c48228a52f26aca65c31fad273e66164f047f292,Where was one employee killed?,"[34:60, 1610:1618, 34:60]",0.0,0.0,"[[34, 60], [1610, 1618], [34, 60]]"
2,c65ed85800e4535f4bbbfa2c34d7d9630358d303,who did say South Africa did not issue a visa ...,"[103:127, 114:127, 839:853]",0.0,0.0,"[[103, 127], [114, 127], [839, 853]]"
3,0cf66b646e9b32076513c050edf32a799200c3c2,How many years old was the businessman?,"[538:550, 538:550]",0.0,0.0,"[[538, 550], [538, 550]]"
4,13012604e3203c18df09289dfedd14cde67cf40b,What frightened the families?,"[690:742, 688:791, 630:646]",0.0,0.0,"[[690, 742], [688, 791], [630, 646]]"


In [35]:
## combine article/question
cnn_article_question_data = pd.merge(cnn_question_data, cnn_article_data, on='article_id')
# drop rows with missing article text; Series.notna() replaces the previous
# `type(x) is float and np.isnan(x)` check, which relied on `np` although
# numpy is not imported in any visible cell
cnn_article_question_data = cnn_article_question_data[cnn_article_question_data.loc[:, 'article_text'].notna()]
## get answers to validate questions
# slice the article text with every [start, end] pair of the row
# NOTE(review): the previewed clean_answers (e.g. "ict", "bruary. T") look shifted
# relative to the expected answer spans -- confirm that the whitespace-normalized
# article_text still matches the character offsets the ranges refer to
cnn_article_question_data = cnn_article_question_data.assign(**{
    'clean_answers' : cnn_article_question_data.apply(
        lambda row: [row.loc['article_text'][char_range[0]:char_range[1]] for char_range in row.loc['clean_answer_char_ranges']],
        axis=1,
    )
})

In [36]:
# number of rows (questions) left after merging questions with article text
print(f'{cnn_article_question_data.shape[0]} questions')

63672 questions


In [39]:
# preview the first five merged rows, including the extracted clean_answers
display(cnn_article_question_data.head(5))

Unnamed: 0,article_id,question,answer_char_ranges,is_answer_absent,is_question_bad,clean_answer_char_ranges,article_text,clean_answers
0,42d01e187213e86f5fe617fe32e716ff7fa3afc4,What was the amount of children murdered?,[294:297],0.0,0.0,"[[294, 297]]","NEW DELHI, India (CNN) -- A high court in nort...",[ict]
1,42d01e187213e86f5fe617fe32e716ff7fa3afc4,When was Pandher sentenced to death?,"[261:271, 258:271, 261:271]",0.0,0.0,"[[261, 271], [258, 271], [261, 271]]","NEW DELHI, India (CNN) -- A high court in nort...","[bruary. T, February. T, bruary. T]"
2,42d01e187213e86f5fe617fe32e716ff7fa3afc4,The court aquitted Moninder Singh Pandher of w...,"[26:33, 624:640]",0.0,0.0,"[[26, 33], [624, 640]]","NEW DELHI, India (CNN) -- A high court in nort...","[A high , murder of the 1]"
3,42d01e187213e86f5fe617fe32e716ff7fa3afc4,who was acquitted,"[195:218, 195:218]",0.0,0.0,"[[195, 218], [195, 218]]","NEW DELHI, India (CNN) -- A high court in nort...","[ninder Singh Pandher wa, ninder Singh Pandher..."
4,42d01e187213e86f5fe617fe32e716ff7fa3afc4,What was Moninder Singh Pandher acquitted for?,"[129:192, 129:151, 133:151]",0.0,0.0,"[[129, 192], [129, 151], [133, 151]]","NEW DELHI, India (CNN) -- A high court in nort...","[the killing of a teen in a case dubbed ""the h..."


### Train model
Let's first try an actual trained question-generation model with all the bells and whistles, downloaded from [here](https://github.com/patil-suraj/question_generation.git).

In [None]:
import sys
# register the question_generation/ directory on the module search path (once)
# so that its `pipelines` module can be imported
if 'question_generation/' not in sys.path:
    sys.path.append('question_generation/')
from pipelines import pipeline
## NOTE need to run pipeline once to download (expensive!!) models
question_pipeline = pipeline('question-generation')

In [6]:
## test short text
# adjacent string literals are concatenated into one string
text = (
    "Python is an interpreted, high-level, general-purpose programming language. "
    "Created by Guido van Rossum "
    "and first released in 1991, Python's design philosophy emphasizes code "
    "readability with its notable use of significant whitespace."
)
text_question = question_pipeline(text)
print(text_question)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=627.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=791656.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=65.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=90.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=242013444.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=656.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=791656.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=65.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=90.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=242013376.0), HTML(value='')))






[{'answer': 'Python', 'question': 'What is an interpreted, high-level, general-purpose programming language?'}, {'answer': 'Guido van Rossum', 'question': 'Who created Python?'}]


In [40]:
# run the generator on the first merged row and compare with its gold question
test_cnn_article_question_data = cnn_article_question_data.iloc[0]
test_article = test_cnn_article_question_data['article_text']
test_gold_question = test_cnn_article_question_data['question']
# show only the first 300 characters of the article
print(f'{test_article[:300]}...')
test_article_questions = question_pipeline(test_article)
print(f'generated questions = {test_article_questions}')
print(f'gold question = {test_gold_question}')

NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."  Moninder Singh Pandher was sentenced to death by a lower court in February.  The teen was one of 19 victims...




generated questions = [{'answer': 'the house of horrors', 'question': 'What was the name of the case that was acquitted by a high court in northern India?'}, {'answer': 'February', 'question': 'When was Moninder Singh Pandher sentenced to death?'}, {'answer': '19', 'question': 'How many victims did Moninder Singh Pandher have?'}, {'answer': 'Sikandar B. Kochar', 'question': "Who was Moninder Singh Pandher's lawyer?"}, {'answer': 'Surinder Koli', 'question': "Who was Pandher's domestic employee?"}, {'answer': 'Koli', 'question': "Who's death sentence was upheld by the high court?"}, {'answer': 'Noida', 'question': "Where was Pandher's home found?"}, {'answer': 'house of horrors', 'question': 'What was the home of Moninder Singh Pandher called by the Indian media?'}, {'answer': 'co-accused', 'question': 'What was Pandher summoned during the trial?'}, {'answer': 'Australia', 'question': 'In what country was the teen raped and killed?'}, {'answer': '18', 'question': 'How many killings did 

Even though the wording is different, we see that the gold question is captured in at least one of the generated questions (`How many victims...`).

### Generate, evaluate questions for articles

TODO: Let's try generating questions for all the articles and see how potentially useful they would be. They may be restricted to within-sentence reasoning and therefore not too complicated.