In [1]:
import nltk
import json
import spacy
from nltk.corpus import stopwords
from math import log
from collections import defaultdict, Counter
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
import re
import csv
from gensim.summarization import bm25

# Variables

In [2]:
OPEN_QUESTION_WORDS = ['what','who','whose','whom','where','when','why','how',
                       'which',"what's","who's","where's","how's"]
CLOSED_QUESTION_WORDS = ['is','are','am','was','were','do','does,','did','can',
                         'could','will','would','shall','should','have','has',
                         'had']

# Stop words
stop = set(stopwords.words('english'))

lmtz = WordNetLemmatizer()

with open('testing.json') as json_data:
    test = json.load(json_data)

with open('documents.json') as json_data:
    documents = json.load(json_data)

# Spacy toolkit
nlp = spacy.load('en_core_web_sm')

punc = set(punctuation)

In [3]:
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punc)

In [4]:
def lemmatize(token):
    lemma = lmtz.lemmatize(token, 'v')
    if lemma == token:
        lemma = lmtz.lemmatize(token, 'n')
    return lemma

        
def extract_term_freqs(doc):
    tfs = {}
    for token in nltk.word_tokenize(doc):
        lemma = lemmatize(token.lower())
        if lemma not in stop and lemma.isalpha():
            tfs[lemma] = tfs.get(lemma, 0) + 1
    return tfs


def compute_doc_freqs(doc_term_freqs):
    dfs = Counter()
    for tfs in doc_term_freqs.values():
        for term in tfs.keys():
            dfs[term] += 1
    return dfs


def query_vsm(query, index, k=4):
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator.most_common(k)


# Find the question word
def get_qword(question):
    tokens = nltk.word_tokenize(question.lower())
    for token in tokens:
        if token in OPEN_QUESTION_WORDS:
            return token
    for token in tokens:
        if token in CLOSED_QUESTION_WORDS:
            return token
    return 'others'

In [5]:
# length of longest same sequences of keywords
def get_overlap(sent1, sent2):
    tokens1 = []
    tokens2 = []

    for token in nltk.word_tokenize(strip_punctuation(sent1.lower())):
        lemma = lemmatize(token)
        if lemma not in stop:
            tokens1.append(lemma)

    for token in nltk.word_tokenize(strip_punctuation(sent2.lower())):
        lemma = lemmatize(token)
        if lemma not in stop:
            tokens2.append(lemma)

    max = 0
    for i in range(len(tokens1)):
        for j in range(len(tokens2)):

            if tokens1[i] == tokens2[j]:
                length = 1

                ii = i + 1
                jj = j + 1
                while ii < len(tokens1) and jj < len(tokens2) and \
                        tokens1[ii] == tokens2[jj]:
                    ii += 1
                    jj += 1
                    length += 1

                if length > max:
                    max = length

    return max

# Write to test file

In [17]:
csvFile = open("high.csv", "w")
writer = csv.writer(csvFile)
header = ['id','answer']
writer.writerow(header)


case_count = 0
empty_count = 0
# test = [test[17]]
for test_case in test:
    question = test_case['question']
    docid = test_case['docid']

    # Convert doc into one string, then tokenize sentences
    corpus = ''
    for para in documents[docid]['text']:
        corpus += para + ' '

    # sentence as a document
    raw_docs = nltk.sent_tokenize(corpus)
    

    # TFIDF
#     doc_term_freqs = {}
#     for (id, raw_doc) in enumerate(raw_docs):
#         term_freqs = extract_term_freqs(raw_doc)
#         doc_term_freqs[id] = term_freqs
#     M = len(doc_term_freqs)

#     doc_freqs = compute_doc_freqs(doc_term_freqs)

#     vsm_inverted_index = defaultdict(list)
#     for docid, term_freqs in doc_term_freqs.items():
#         N = sum(term_freqs.values())
#         length = 0

#         # find tf*idf values and accumulate sum of squares
#         tfidf_values = []
#         for term, count in term_freqs.items():
#             tfidf = float(count) / N * log(M / float(doc_freqs[term]))
#             tfidf_values.append((term, tfidf))
#             length += tfidf ** 2

#         # normalise documents by length and insert into index
#         length = length ** 0.5
#         for term, tfidf in tfidf_values:
#             # inversion of the indexing, term -> (doc_id, score)
#             vsm_inverted_index[term].append([docid, tfidf / length])

#     for term, docids in vsm_inverted_index.items():
#         docids.sort()

#     terms = extract_term_freqs(question) 
#     results = query_vsm(terms, vsm_inverted_index)
    
    
    tokenized_sentence = []
    for each_sentence in raw_docs:
        filter_stop_word = []
        sentence_as_words = nltk.word_tokenize(each_sentence)
        for each_word in sentence_as_words:
            if each_word not in stop:
                filter_stop_word.append(each_word)
        
        tokenized_sentence.append(filter_stop_word)
        
    bm25Model = bm25.BM25(tokenized_sentence)
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())
    
    query = []
    for word in nltk.word_tokenize(question):
        if word not in stop:
            query.append(word)
        
    scores = bm25Model.get_scores(query,average_idf)
    bm25_dic = Counter()
    
    sentence_id = 0
    for each_score in scores:
        bm25_dic[sentence_id] = each_score
        sentence_id += 1
    results = bm25_dic.most_common(3)


    # Step 2
    # Analyse question type
    qword = get_qword(question)

    # the word after question word, such as 'what value', 'which gender'
    next_token = ''

    qtype = ''

    # dependency parsing
    dep = ''

    # head word
    head = ''

    # head dependency
    head_dep = ''

    # subject, root, object
    nsubj = ''
    ROOT = ''
    dobj = ''

    # yes or no questions have two options
    closed_q_choices = ('', '')

    doc = nlp(question)

    tokens = nltk.word_tokenize(question.lower())

    # get next word
    if qword in tokens:
        if tokens.index(qword) < len(tokens) - 1:
            next_token = tokens[tokens.index(qword) + 1]

    # get structure of sentence
    for token in doc:
        if 'nsubj' in token.dep_:
            nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            dobj = lemmatize(strip_punctuation(token.text))

    # for noun (phrase) questions, get answer dependency
    for chunk in doc.noun_chunks:
        if qword in chunk.text:
            dep = chunk.root.dep_
            head = lemmatize(strip_punctuation(chunk.root.head.text))
            head_dep = chunk.root.head.dep_

    # determine answer type
    if 'stand for' in question or 'abbreviat' in question:
        qtype = 'abrv'

    elif qword in ['who',"who's",'whom','whose']:
        qtype = 'who'

    elif qword == 'when':
        qtype = 'when'

    elif qword in ['where',"where's"]:
        qtype = 'where'

    elif qword in ['how',"how's"]:
        if next_token == 'much':
            qtype = 'MONEY'
        elif next_token == 'many':
            qtype = 'CARDINAL'
        elif next_token == 'long':
            qtype = 'DATE'
        elif next_token in ['far','big','wide','deep','tall','high','fast','heavy']:
            qtype = 'QUANTITY'
        elif next_token in ['old','young']:
            qtype = 'DATE'
        elif next_token in ['often']:
            qtype = 'QUANTITY'
        elif next_token in ['does','did','do','have','has','had','should',
                              'can','could','will','would','must']:
            if dobj != '':
                qtype = 'adj'
            else:
                qtype = 'verb'

    elif qword in ['what', "what's", 'which']:
        if 'year'in tokens or \
                'day' in tokens or \
                'month' in tokens or \
                'era' in tokens or \
                'age' in tokens or \
                'century' in tokens or \
                'week' in tokens or \
                'period' in tokens or \
                'dynasty' in tokens:
            qtype = 'DATE'

        elif 'company' in tokens or \
                'organization' in tokens or \
                'organisation' in tokens or \
                'corporation' in tokens or \
                'institution' in tokens or \
                'university' in tokens or \
                'corporation' in tokens or \
                'association' in tokens or \
                'union' in tokens or \
                'agency' in tokens:
            qtype = 'ORG'

        elif 'city' in tokens or \
                'country' in tokens or \
                'state' in tokens or \
                'province' in tokens or \
                'county' in tokens:
            qtype = 'GPE'

        elif 'place' in tokens or \
                'river' in tokens or \
                'mountain' in tokens or \
                'ocean' in tokens or \
                'region' in tokens or \
                'area' in tokens or \
                'sea' in tokens or \
                'lake' in tokens or \
                'continent' in tokens or \
                'location' in tokens or \
                'forest' in tokens or \
                'jungle' in tokens:
            qtype = 'LOC'

        elif 'nationality' in tokens:
            qtype = 'NORP'

        elif 'building' in tokens or \
            'airport' in tokens or \
            'highway' in tokens or \
            'bridge' in tokens or \
            'harbour' in tokens or \
            'harbor' in tokens or \
            'port' in tokens or \
            'dam' in tokens:
            qtype = 'FACILITY'

        elif 'hurricane' in tokens or \
            'battle' in tokens or \
            'war' in tokens:
            qtype = 'EVENT'

        elif 'book' in tokens or \
            'novel' in tokens or \
            'song' in tokens or \
            'music' in tokens or \
            'painting' in tokens:
            qtype = 'WORK_OF_ART'

        elif 'language' in tokens or \
                'speak' in tokens:
            qtype = 'LANGUAGE'

        elif 'percentage' in tokens or 'percent' in tokens:
            qtype = 'PERCENT'
            
        elif 'frequency' in tokens:
            qtype = 'QUANTITY'
            
        elif 'value' in tokens or \
                'distance' in tokens or \
                'size' in tokens or \
                'length' in tokens or \
                'depth' in tokens or \
                'height' in tokens or \
                'density' in tokens or \
                'speed' in tokens or \
                'weight' in tokens or \
                'area' in tokens or \
                'temperature' in tokens or \
                'volume' in tokens:
            qtype = 'QUANTITY'

        elif 'number' in tokens:
            qtype = 'CARDINAL'

        elif 'price' in tokens:
            qtype = 'MONEY'

        elif 'name' in tokens:
            qtype = 'NE'

        else:
            # what...do type question
            tokens.remove(next_token)
            if 'do' in tokens:
                qtype = 'verb'
            else:
                qtype = 'noun'

    elif qword == 'why':
        qtype = 'why'

    elif qword in CLOSED_QUESTION_WORDS:
        qtype = 'closed'

        # answer is one of the 'or' options in the question
        if 'or' in tokens:
            index = tokens.index('or')
            prev1 = tokens[index - 1]
            next1 = tokens[index + 1]
            tag_tokens = nltk.pos_tag(tokens)

            tag = tag_tokens[index - 1][1]

            # if answer is a noun
            if tag in ['NN', 'NNP', 'NNS', 'NNPS']:
                for chunk in doc.noun_chunks:
                    if prev1 in chunk.text:
                        first = chunk.text
                    if next1 in chunk.text:
                        second = chunk.text
                closed_q_choices = (first, second)
            else:
                closed_q_choices = (prev1, next1)
        else:
            qtype = 'others'

    # re-rank the sentences
    scores = {}
    for id, _ in results:
        sent = raw_docs[id]
        doc = nlp(sent)

        score = get_overlap(sent, question)

        if qtype == 'who':
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    score += 1

        elif qtype == 'when':
            for ent in doc.ents:
                if ent.label_ == 'TIME' or ent.label_ == "DATE":
                    score += 1

        elif qtype == 'where':
            for ent in doc.ents:
                if ent.label_ == 'GPE' or ent.label_ == "LOC":
                    score += 1

        elif qtype in ['LANGUAGE','WORK_OF_ART','EVENT','NORP','FACILITY',
                       'GPE','DATE','TIME','PERCENT','QUANTITY','CARDINAL',
                     'MONEY','PERSON','ORG','LOC']:
            
            for ent in doc.ents:
                if ent.label_ == qtype:
                    score += 1
                    
        elif qtype == 'NE':
            for ent in doc.ents:
                    score += 1

        elif qtype == 'adj':
            for token in doc:
                if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                    score += 1

        elif qtype == 'verb':
            for token in doc:
                if token.dep_ == 'ROOT':
                    score += 1

        elif qtype == 'closed':
            first = closed_q_choices[0]
            second = closed_q_choices[1]

            score += (first in sent) + (second in sent)

        elif qtype == 'why':
            if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:
                score += 1

        scores[id] = score

    rank = {}
    for id, sim in results:
        max_score = scores[max(scores, key=scores.get)]
        if max_score != 0:
            rank[id] = sim * 0.3 + (scores[id] / max_score * 0.7)
        else:
            rank[id] = sim
    
    # sentence with highest rank
    index = max(rank, key=rank.get)
    sent = raw_docs[index]
    doc = nlp(sent)

    # find sentence structure
    sent_nsubj = ''
    sent_ROOT = ''
    sent_dobj = ''
    for token in doc:
        if 'nsubj' in token.dep_:
            sent_nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            sent_ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            sent_dobj = lemmatize(strip_punctuation(token.text))
            
    # find answer with highest score
    max_score = -1
    answer = ''
    
    if qtype == 'who':
        for np in doc.noun_chunks:
            score = 0
            
            if np in doc.ents:
                for ent in doc.ents:
                    if np.text in ent.text and ent.label_ == 'PERSON':
                            score += 3

            # find NP dependency
            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1

            if np.text.lower() == 'it':
                score = -1
                
            if score > max_score:
                max_score = score
                answer = np.text

    elif qtype == 'when':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'TIME' or ent.label_ == "DATE":
                score += 3
                
            if ent.text not in question:
                score += 1
            
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'where':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'GPE' or ent.label_ == "LOC":
                score += 3

            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text
            
    elif qtype in ['LANGUAGE', 'WORK_OF_ART', 'EVENT', 'NORP', 'FACILITY',
                   'GPE', 'DATE', 'TIME', 'PERCENT', 'QUANTITY', 'CARDINAL',
                   'MONEY', 'PERSON', 'ORG', 'LOC']:
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == qtype:
                score += 3
            
            if ent.text not in question:
                score += 1
                
            if qtype in ['LOC','GPE'] and ent.root.tag_ not in ['NN','NNP','NNS','NNPS']:
                score -= 2
                
            if score > max_score:
                max_score = score
                answer = ent.text
                
                if qtype in ['MONEY']:
                    for token in doc:
                        if token.text == '$':
                            answer = '$ ' + answer
                            
                if qtype in ['PERCENT']:
                    if 'percent' in answer:
                        answer = answer[:answer.index('percent')-1]
                            
                if qtype in ['PERCENT','QUANTITY','CARDINAL','MONEY']:
                    tokens = nltk.word_tokenize(answer)
                    i = 0
                    answer = ''
                    while i < len(tokens):
                        if tokens[i].lower() in ['well','about','around','approximately', 'some','least','close to','than','high as','least']:
                            del tokens[i]
                        else:
                            if i+1 < len(tokens) and tokens[i+1] == "'s":
                                answer += tokens[i]
                            else:
                                answer += tokens[i] + ' '
                            i += 1
                    answer = answer.strip()
                    
    elif qtype == 'NE':
        for ent in doc.ents:
            score = 3
            
            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'abrv':
        abrv = ''
        qdoc = nlp(question)
        for token in qdoc:
            text = token.text
            if len(text) >= 2 and text.isupper() and text.isalpha():
                abrv = text.lower()

        if abrv == '' and 'stand for' in question:
            tokens = question.lower().split(' ')
            abrv = tokens[tokens.index('stand')-1]

        if abrv != '':
            tokens = nltk.word_tokenize(sent)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    phrase = token.lower()
                    initials = phrase[0]

                    while i+k < len(tokens) and tokens[i+k][0].isupper():
                        phrase = phrase + ' ' + tokens[i+k].lower()
                        initials += tokens[i+k][0].lower()
                        k += 1

                    phrase = phrase.strip()
                    if initials == abrv:
                        answer = phrase

        else:
            tokens = nltk.word_tokenize(question)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    initials = token[0].lower()

                    while i + k < len(tokens) and tokens[i + k][0].isupper():
                        initials += tokens[i + k][0].lower()
                        k += 1

                    if len(initials) >= 2:
                        answer = initials

    elif qtype == 'adj':
        for token in doc:
            score = 0
            
            if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                score += 3

            token_dep = token.dep_
            token_head = lemmatize(strip_punctuation(token.head.text))
            token_head_dep = token.head.dep_

            if token_dep == dep:
                score += 1
            if token_head == head:
                score += 1
            if token_head_dep == head_dep:
                score += 1

            if token.text not in question:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if token.text.lower() == 'it':
                score = -1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'verb':
        for token in doc:
            score = 0

            if token.dep_ == 'ROOT':
                score += 1

            if lemmatize(strip_punctuation(token.text)) not in \
                    [lemmatize(strip_punctuation(s)) for s in nltk.word_tokenize(question)]:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'closed':
        first = closed_q_choices[0]
        second = closed_q_choices[1]

        # whether each option appears (and is negates)
        appear1 = False
        appear2 = False
        negate1 = False
        negate2 = False
        neg_count = 0
        tokens = nltk.word_tokenize(raw_docs[id])

        for (index, token) in enumerate(tokens):
            if token == 'not' or "n't" in token:
                neg_count += 1

                if index+1 < len(tokens):
                    if tokens[index+1] == first:
                        negate1 = True
                    if tokens[index+1] == second:
                        negate2 = True

            if token == first:
                appear1 = True
            if token == second:
                appear2 = True

        possible_answer = ''
        if appear1 and not appear2:
            if neg_count % 2 == 1:
                possible_answer = second
            else:
                possible_answer = first

        elif appear2 and not appear1:
            if neg_count % 2 == 0:
                possible_answer = second
            else:
                possible_answer = first

        elif appear1 and appear2:
            if negate1 and not negate2:
                possible_answer = second
            elif negate2 and not negate1:
                possible_answer = first
            else:
                possible_answer = second

        if possible_answer != '':
            score += 5
            if score > max_score:
                max_score = score
                answer = possible_answer

    elif qtype == 'why':

        possible_answer = ''
        score = 0

        if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:

            if 'because of' in sent:
                score += 3
                index = sent.index('because of')
                substr = sent[index+11:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break

            elif 'because' in sent:
                score += 3
                index = sent.index('because')
                substr = sent[index + 8:]
                possible_answer = substr

            elif 'due to' in sent:
                score += 3
                index = sent.index('due to')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'reason' in sent:
                score += 2
                index = sent.index('reason')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    index = substr.find('is')
                    if index != -1:
                        possible_answer = substr[index+3]
                    else:
                        index = substr.find('was')
                        if index != -1:
                            possible_answer = substr[index+4]
                        else:
                            possible_answer = sent[sent.index('reason'):]

            elif 'for' in sent:
                score += 1
                index = sent.index('for')
                substr = sent[index + 4:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'since' in sent:
                score += 1
                index = sent.index('since')
                substr = sent[index + 6:]
                possible_answer = substr

            if possible_answer != '' and score > max_score:
                answer = possible_answer
                max_score = score

    # if answer not found, find noun phrases
    if answer == '':
        for np in doc.noun_chunks:
            score = 0

            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = np.text
                
    a = nltk.word_tokenize(answer)
    if len(a) > 0 and a[0].lower() in stop:
        del a[0]
        answer = ''
        for i in range(len(a)):
            if i+1 < len(a) and a[i+1] == "'s":
                answer += a[i]
            else:
                answer += a[i] + ' '
        
    answer = answer.strip().lower()
    answer_result = []
    answer_result.append(case_count)
    answer_result.append(answer)
    writer.writerow(answer_result)
    print(case_count,' ',answer)
    case_count += 1
    if answer == '':
        empty_count += 1
    
csvFile.close()
print(empty_count)

0   combination
1   components
2   browser's layout engine
3   internet explorer
4   september 2008
5   windows
6   1990
7   browsers
8   andreessen
9   first web browser
10   competition
11   microsoft
12   internet relay chat
13   january
14   every major web browser
15   january 2003
16   january 2009
17   file transfer protocol
18   google
19   case
20   rich user interfaces
21   january 2009
22   august 2011
23   browser software
24   chrome's user-base
25   form
26   december 2011
27   september 2008
28   netscape
29   opera-mini version
30   rapid development
31   prefix
32   mozilla foundation
33   private networks
34   windows
35   comparison
36   1994
37   user interface
38   addition
39   major browsers
40   information resources
41   live bookmarks
42   more traditional feed reader
43   bookmarks
44   september 2008
45   prefix
46   browser software
47   world wide web
48   microsoft corp
49   mobile safari
50   web browsers
51   other hand
52   user's default e-mail applic

466   27 june 1640
467   college
468   9th century
469   1339
470   1233
471   trust
472   biggest operator
473   4.2
474   1959
475   354
476   government figures
477   m27
478   uk
479   hanover buildings
480   city
481   year
482   12th century
483   traffic congestion
484   southampton
485   2004
486   1233
487   hampshire
488   three fire stations
489   southampton docks
490   two
491   m27
492   southampton's largest retail centre
493   13th century
494   1066
495   clausentum
496   two
497   plans
498   southampton
499   port
500   over a quarter
501   southampton central
502   20–24
503   council estates
504   portswood, banister park
505   over a quarter
506   king henry's departure
507   m3 motorway
508   trust
509   catherine mcewing
510   16.2
511   nuffield theatre
512   newport
513   university of southampton
514   france
515   swaythling
516   1968
517   area
518   hampshire county council
519   duchess
520   world's largest cruise ships
521   1938
522   large shopping c

916   candidates
917   neptune's more varied weather
918   reaction
919   hobbes
920   samuel pufendorf
921   culture
922   citation
923   19th-century
924   terror management theory
925   29
926   united states
927   germany
928   end of the last ice age
929   19th-century
930   contrasted
931   terror management theory
932   immanuel kant
933   1970s
934   european
935   united states
936   diffusion
937   prussian linguist
938   1950s and 1960s
939   stuart hall
940   élite ideal
941   present legislation
942   jefferson's metaphor
943   1971
944   nonconformists
945   areas
946   pervasive secularism
947   wall
948   may 3, 2006
949   still other scholars
950   early as the mid-17th century
951   religious freedom
952   engel
953   madison
954   december 20, 2005
955   william penn
956   two
957   legal scholars
958   still other scholars
959   reynolds
960   1776
961   court's decision
962   opponents
963   december
964   robert s. wood
965   u.s.
966   1962
967   court
968   1994

1327   year
1328   three
1329   bulgaria
1330   alexander ii
1331   french
1332   deployment
1333   kept
1334   evacuation
1335   12 july
1336   339
1337   august 1855
1338   black sea
1339   31 december 1853
1340   alfred nobel
1341   george hamilton-gordon
1342   baltic was [ when
1343   32–40
1344   alliance
1345   local commanders
1346   part
1347   russia
1348   postponement
1349   omar pasha
1350   conflict
1351   parliament
1352   constantinople
1353   åland islands
1354   alexander ii
1355   nicholas
1356   cardigan
1357   attack
1358   :104:19
1359   start
1360   october 1853
1361   french
1362   sunday
1363   year up to
1364   public opinion
1365   1855: georgian coast
1366   winter of 1854
1367   sinop
1368   russian cavalry movement
1369   part
1370   peaceful settlement
1371   crimean war
1372   baltic was [ when
1373   vidin
1374   reaction
1375   chetatea
1376   danube river
1377   ottoman forces
1378   william howard russell
1379   june 24, 1839
1380   local commanders


1779   2009
1780   fate
1781   merriam-webster's
1782   1995
1783   dst clock shifts
1784   negative dst
1785   britain
1786   standard time
1787   autumn and two hours
1788   merriam-webster's
1789   two hours
1790   dst
1791   europe
1792   01:00 utc
1793   2007
1794   fixed work schedules
1795   example
1796   dst
1797   ramadan (the month
1798   reduces
1799   1970s
1800   older form
1801   march 2011
1802   year
1803   us
1804   winter
1805   north america
1806   1895
1807   ntfs
1808   
1809   northern summer
1810   initial adoption
1811   countries
1812   dst inherits
1813   dst
1814   filesystem
1815   2007
1816   willett
1817   us retailing and manufacturing interests
1818   st
1819   advantages
1820   time zone differences
1821   early goal
1822   one reason
1823   times
1824   regions
1825   kingsford charcoal
1826   clock shifts
1827   many enactments
1828   one
1829   gregorian calendar
1830   autumn and two hours
1831   two
1832   dst
1833   clocks
1834   iceland
1835   e

2193   1949
2194   skeletal remain
2195   two
2196   sense
2197   falsify
2198   recent work
2199   generally accepted concept
2200   greatest number
2201   many
2202   survey
2203   contrast
2204   30 %
2205   clustering
2206   today
2207   today
2208   2000
2209   brazilian child
2210   racism
2211   suggest
2212   populations
2213   population geneticist sewall wright
2214   phylogenetic analysis
2215   initial hypotheses
2216   east asians
2217   populations
2218   races
2219   significant number
2220   700.000
2221   thousands
2222   diagnosis
2223   concept
2224   eduardo bonilla-silva
2225   physical anthropologists
2226   adversely
2227   mass incarceration
2228   racial discrimination
2229   france
2230   y chromosomes
2231   5 %
2232   roughly 28–37 %
2233   93 %
2234   employs
2235   international epidemiological data
2236   cranial measurements
2237   different parts
2238   distinction
2239   1964
2240   another way
2241   blumenbach
2242   good arguments
2243   word
2244  

2606   elevator cab
2607   germany
2608   low-volume hours
2609   double deck elevators
2610   make
2611   citation
2612   israeli
2613   hydraulic crane
2614   four
2615   action
2616   city transport
2617   passenger cabs
2618   passenger cabs
2619   unique design characteristics
2620   residential
2621   end
2622   anywhere up to 60 %
2623   may
2624   300,000
2625   passenger elevators
2626   less expensive installations
2627   1
2628   action
2629   steam driven devices
2630   direction lanterns
2631   morning
2632   dumbwaiters
2633   hydraulic freight elevators
2634   first elevator shaft
2635   citation
2636   shaft
2637   single bulkhead cylinders
2638   machine-room-less elevators
2639   `` shaft
2640   neapolitan architect
2641   barrel
2642   weight
2643   london
2644   method
2645   environmental concerns
2646   belt elevators
2647   team's earliest exit
2648   1189
2649   eight
2650   21 september 1949
2651   1 may
2652   1872
2653   2001 and 2006
2654   february 2012
265

3020   dial-up access
3021   mahmud ghaznavi
3022   citation
3023   citation
3024   technological advancements
3025   30 december 2007
3026   first
3027   nepal
3028   h.p.
3029   private fm stations
3030   18 december 1970
3031   state
3032   1.16%
3033   radio
3034   2006
3035   handicrafts
3036   1948
3037   muslims
3038   first five-year plan
3039   extreme variation
3040   others
3041   senate
3042   side
3043   imperial era
3044   haruspex
3045   security
3046   edict
3047   principate
3048   camp
3049   constantine
3050   roman women
3051   valerian's first religious edict
3052   famous tirade
3053   human sacrifice
3054   strong connections
3055   same divine agencies
3056   vergil
3057   return
3058   di immortales
3059   arvals
3060   traditional roman practice
3061   ruins
3062   abolition
3063   ordinary romans
3064   public festivals
3065   end
3066   product
3067   customary offers
3068   return
3069   rome's hegemony
3070   relationship
3071   veneration
3072   cases
307

3444   dialect
3445   position
3446   german dialectology
3447   latin
3448   middle ages
3449   dialect
3450   1860s
3451   sardinia
3452   19th century
3453   language—"everybody
3454   italian "avere
3455   german
3456   caribbean coast
3457   italy
3458   british and american english
3459   german
3460   northern germany
3461   more general term
3462   world war i
3463   significance
3464   latin
3465   yiddish
3466   1967
3467   italian "avere
3468   political factors
3469   italians
3470   ion bărbuţă
3471   english
3472   english
3473   german
3474   red clothing
3475   christian countries
3476   red
3477   1921
3478   seven
3479   karl marx
3480   red ochre
3481   christian tradition
3482   16th century
3483   one to two percent
3484   1960s
3485   danger
3486   red ochre
3487   2004
3488   cincinnati red stockings
3489   coccus
3490   roman general
3491   red
3492   1960
3493   ray
3494   harvard university
3495   communist party
3496   1950s and 1960s
3497   culture
3498   pa

#  准确率测试

In [14]:
with open('training.json') as json_data:
    train = json.load(json_data)
    
case_count = 0
empty_count = 0
wrong_count = 0
# test = [test[17]]
for test_case in train:
    question = test_case['question']
    docid = test_case['docid']
    correct_answer = test_case['text']

    # Convert doc into one string, then tokenize sentences
    corpus = ''
    for para in documents[docid]['text']:
        corpus += para + ' '

    # sentence as a document
    raw_docs = nltk.sent_tokenize(corpus)
    

    # TFIDF
#     doc_term_freqs = {}
#     for (id, raw_doc) in enumerate(raw_docs):
#         term_freqs = extract_term_freqs(raw_doc)
#         doc_term_freqs[id] = term_freqs
#     M = len(doc_term_freqs)

#     doc_freqs = compute_doc_freqs(doc_term_freqs)

#     vsm_inverted_index = defaultdict(list)
#     for docid, term_freqs in doc_term_freqs.items():
#         N = sum(term_freqs.values())
#         length = 0

#         # find tf*idf values and accumulate sum of squares
#         tfidf_values = []
#         for term, count in term_freqs.items():
#             tfidf = float(count) / N * log(M / float(doc_freqs[term]))
#             tfidf_values.append((term, tfidf))
#             length += tfidf ** 2

#         # normalise documents by length and insert into index
#         length = length ** 0.5
#         for term, tfidf in tfidf_values:
#             # inversion of the indexing, term -> (doc_id, score)
#             vsm_inverted_index[term].append([docid, tfidf / length])

#     for term, docids in vsm_inverted_index.items():
#         docids.sort()

#     terms = extract_term_freqs(question) 
#     results = query_vsm(terms, vsm_inverted_index)
    
    
    tokenized_sentence = []
    for each_sentence in raw_docs:
        filter_stop_word = []
        sentence_as_words = nltk.word_tokenize(each_sentence)
        for each_word in sentence_as_words:
            if each_word not in stop:
                filter_stop_word.append(each_word)
        
        tokenized_sentence.append(filter_stop_word)
        
    bm25Model = bm25.BM25(tokenized_sentence)
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())
    
    query = []
    for word in nltk.word_tokenize(question):
        if word not in stop:
            query.append(word)
        
    scores = bm25Model.get_scores(query,average_idf)
    bm25_dic = Counter()
    
    sentence_id = 0
    for each_score in scores:
        bm25_dic[sentence_id] = each_score
        sentence_id += 1
    results = bm25_dic.most_common(3)




    # Step 2
    # Analyse question type
    qword = get_qword(question)

    # the word after question word, such as 'what value', 'which gender'
    next_token = ''

    qtype = ''

    # dependency parsing
    dep = ''

    # head word
    head = ''

    # head dependency
    head_dep = ''

    # subject, root, object
    nsubj = ''
    ROOT = ''
    dobj = ''

    # yes or no questions have two options
    closed_q_choices = ('', '')

    doc = nlp(question)

    tokens = nltk.word_tokenize(question.lower())

    # get next word
    if qword in tokens:
        if tokens.index(qword) < len(tokens) - 1:
            next_token = tokens[tokens.index(qword) + 1]

    # get structure of sentence
    for token in doc:
        if 'nsubj' in token.dep_:
            nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            dobj = lemmatize(strip_punctuation(token.text))

    # for noun (phrase) questions, get answer dependency
    for chunk in doc.noun_chunks:
        if qword in chunk.text:
            dep = chunk.root.dep_
            head = lemmatize(strip_punctuation(chunk.root.head.text))
            head_dep = chunk.root.head.dep_

    # determine answer type
    if 'stand for' in question or 'abbreviat' in question:
        qtype = 'abrv'

    elif qword in ['who',"who's",'whom','whose']:
        qtype = 'who'

    elif qword == 'when':
        qtype = 'when'

    elif qword in ['where',"where's"]:
        qtype = 'where'

    elif qword in ['how',"how's"]:
        if next_token == 'much':
            qtype = 'MONEY'
        elif next_token == 'many':
            qtype = 'CARDINAL'
        elif next_token == 'long':
            qtype = 'DATE'
        elif next_token in ['far','big','wide','deep','tall','high','fast','heavy']:
            qtype = 'QUANTITY'
        elif next_token in ['old','young']:
            qtype = 'DATE'
        elif next_token in ['often']:
            qtype = 'FREQUENCY'        
        elif next_token in ['does','did','do','have','has','had','should',
                              'can','could','will','would','must']:
            if dobj != '':
                qtype = 'adj'
            else:
                qtype = 'verb'

    elif qword in ['what', "what's", 'which']:
        if 'year'in tokens or \
                'day' in tokens or \
                'month' in tokens or \
                'era' in tokens or \
                'age' in tokens or \
                'century' in tokens or \
                'week' in tokens or \
                'period' in tokens or \
                'dynasty' in tokens:
            qtype = 'DATE'

        elif 'company' in tokens or \
                'organization' in tokens or \
                'organisation' in tokens or \
                'corporation' in tokens or \
                'institution' in tokens or \
                'university' in tokens or \
                'corporation' in tokens or \
                'association' in tokens or \
                'union' in tokens or \
                'agency' in tokens:
            qtype = 'ORG'

        elif 'city' in tokens or \
                'country' in tokens or \
                'state' in tokens or \
                'province' in tokens or \
                'county' in tokens:
            qtype = 'GPE'

        elif 'place' in tokens or \
                'river' in tokens or \
                'mountain' in tokens or \
                'ocean' in tokens or \
                'region' in tokens or \
                'area' in tokens or \
                'sea' in tokens or \
                'lake' in tokens or \
                'continent' in tokens or \
                'location' in tokens or \
                'forest' in tokens or \
                'jungle' in tokens:
            qtype = 'LOC'

        elif 'nationality' in tokens:
            qtype = 'NORP'

        elif 'building' in tokens or \
            'airport' in tokens or \
            'highway' in tokens or \
            'bridge' in tokens or \
            'harbour' in tokens or \
            'harbor' in tokens or \
            'port' in tokens or \
            'dam' in tokens:
            qtype = 'FACILITY'

        elif 'hurricane' in tokens or \
            'battle' in tokens or \
            'war' in tokens:
            qtype = 'EVENT'

        elif 'book' in tokens or \
            'novel' in tokens or \
            'song' in tokens or \
            'music' in tokens or \
            'painting' in tokens:
            qtype = 'WORK_OF_ART'

        elif 'language' in tokens or \
                'speak' in tokens:
            qtype = 'LANGUAGE'

        elif 'percentage' in tokens or 'percent' in tokens:
            qtype = 'PERCENT'

        elif 'value' in tokens or \
                'distance' in tokens or \
                'size' in tokens or \
                'length' in tokens or \
                'depth' in tokens or \
                'height' in tokens or \
                'density' in tokens or \
                'speed' in tokens or \
                'weight' in tokens or \
                'area' in tokens or \
                'temperature' in tokens or \
                'volume' in tokens:
            qtype = 'QUANTITY'

        elif 'number' in tokens:
            qtype = 'CARDINAL'

        elif 'price' in tokens:
            qtype = 'MONEY'

        elif 'name' in tokens:
            qtype = 'NE'

        else:
            # what...do type question
            try:
                tokens.remove(next_token)
            except:
                tokens = tokens
            if 'do' in tokens:
                qtype = 'verb'
            else:
                qtype = 'noun'

    elif qword == 'why':
        qtype = 'why'

    elif qword in CLOSED_QUESTION_WORDS:
        qtype = 'closed'

        # answer is one of the 'or' options in the question
        if 'or' in tokens:
            index = tokens.index('or')
            prev1 = tokens[index - 1]
            next1 = tokens[index + 1]
            tag_tokens = nltk.pos_tag(tokens)

            tag = tag_tokens[index - 1][1]

            # if answer is a noun
            if tag in ['NN', 'NNP', 'NNS', 'NNPS']:
                for chunk in doc.noun_chunks:
                    if prev1 in chunk.text:
                        first = chunk.text
                    if next1 in chunk.text:
                        second = chunk.text
                closed_q_choices = (first, second)
            else:
                closed_q_choices = (prev1, next1)
        else:
            qtype = 'others'

    # re-rank the 20 sentences
    scores = {}
    for id, _ in results:
        sent = raw_docs[id]
        doc = nlp(sent)

        score = get_overlap(sent, question)

        if qtype == 'who':
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    score += 1

        elif qtype == 'when':
            for ent in doc.ents:
                if ent.label_ == 'TIME' or ent.label_ == "DATE":
                    score += 1

        elif qtype == 'where':
            for ent in doc.ents:
                if ent.label_ == 'GPE' or ent.label_ == "LOC":
                    score += 1

        elif qtype in ['LANGUAGE','WORK_OF_ART','EVENT','NORP','FACILITY',
                       'GPE','DATE','TIME','PERCENT','QUANTITY','CARDINAL',
                     'MONEY','PERSON','ORG','LOC','FREQUENCY']:
            for ent in doc.ents:
                if ent.label_ == qtype:
                    score += 1
                    
        elif qtype == 'NE':
            for ent in doc.ents:
                    score += 1

        elif qtype == 'adj':
            for token in doc:
                if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                    score += 1

        elif qtype == 'verb':
            for token in doc:
                if token.dep_ == 'ROOT':
                    score += 1

        elif qtype == 'closed':
            first = closed_q_choices[0]
            second = closed_q_choices[1]

            score += (first in sent) + (second in sent)

        elif qtype == 'why':
            if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:
                score += 1

        scores[id] = score

    rank = {}
    for id, sim in results:
        max_score = scores[max(scores, key=scores.get)]
        if max_score != 0:
            rank[id] = sim * 0.5 + (scores[id] / max_score * 0.5)
        else:
            rank[id] = sim
    
    # sentence with highest rank
    index = max(rank, key=rank.get,default=0)
    sent = raw_docs[index]
    doc = nlp(sent)

    # find sentence structure
    sent_nsubj = ''
    sent_ROOT = ''
    sent_dobj = ''
    for token in doc:
        if 'nsubj' in token.dep_:
            sent_nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            sent_ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            sent_dobj = lemmatize(strip_punctuation(token.text))
            
    # find answer with highest score
    max_score = -1
    answer = ''
    
    if qtype == 'who':
        for np in doc.noun_chunks:
            score = 0
            
            if np in doc.ents:
                for ent in doc.ents:
                    if np.text in ent.text and ent.label_ == 'PERSON':
                            score += 3

            # find NP dependency
            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1

            if np.text.lower() == 'it':
                score = -1
                
            if score > max_score:
                max_score = score
                answer = np.text

    elif qtype == 'when':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'TIME' or ent.label_ == "DATE":
                score += 3
                
            if ent.text not in question:
                score += 1
            
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'where':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'GPE' or ent.label_ == "LOC":
                score += 3

            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text
            
    elif qtype in ['LANGUAGE', 'WORK_OF_ART', 'EVENT', 'NORP', 'FACILITY',
                   'GPE', 'DATE', 'TIME', 'PERCENT', 'QUANTITY', 'CARDINAL',
                   'MONEY', 'PERSON', 'ORG', 'LOC']:
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == qtype:
                score += 3
            
            if ent.text not in question:
                score += 1
                
            if qtype in ['LOC','GPE'] and ent.root.tag_ not in ['NN','NNP','NNS','NNPS']:
                score -= 2
                
            if score > max_score:
                max_score = score
                answer = ent.text
                
                if qtype in ['MONEY']:
                    for token in doc:
                        if token.text == '$':
                            answer = '$ ' + answer
                            
                if qtype in ['PERCENT']:
                    if 'percent' in answer:
                        answer = answer[:answer.index('percent')-1]
                            
                if qtype in ['PERCENT','QUANTITY','CARDINAL','MONEY']:
                    tokens = nltk.word_tokenize(answer)
                    i = 0
                    answer = ''
                    while i < len(tokens):
                        if tokens[i].lower() in ['well','about','around','approximately', 'some','least','close to','than','high as','least']:
                            del tokens[i]
                        else:
                            if i+1 < len(tokens) and tokens[i+1] == "'s":
                                answer += tokens[i]
                            else:
                                answer += tokens[i] + ' '
                            i += 1
                    answer = answer.strip()
                    
    elif qtype == 'NE':
        for ent in doc.ents:
            score = 3
            
            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'abrv':
        abrv = ''
        qdoc = nlp(question)
        for token in qdoc:
            text = token.text
            if len(text) >= 2 and text.isupper() and text.isalpha():
                abrv = text.lower()

        if abrv == '' and 'stand for' in question:
            tokens = question.lower().split(' ')
            abrv = tokens[tokens.index('stand')-1]

        if abrv != '':
            tokens = nltk.word_tokenize(sent)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    phrase = token.lower()
                    initials = phrase[0]

                    while i+k < len(tokens) and tokens[i+k][0].isupper():
                        phrase = phrase + ' ' + tokens[i+k].lower()
                        initials += tokens[i+k][0].lower()
                        k += 1

                    phrase = phrase.strip()
                    if initials == abrv:
                        answer = phrase

        else:
            tokens = nltk.word_tokenize(question)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    initials = token[0].lower()

                    while i + k < len(tokens) and tokens[i + k][0].isupper():
                        initials += tokens[i + k][0].lower()
                        k += 1

                    if len(initials) >= 2:
                        answer = initials

    elif qtype == 'adj':
        for token in doc:
            score = 0
            
            if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                score += 3

            token_dep = token.dep_
            token_head = lemmatize(strip_punctuation(token.head.text))
            token_head_dep = token.head.dep_

            if token_dep == dep:
                score += 1
            if token_head == head:
                score += 1
            if token_head_dep == head_dep:
                score += 1

            if token.text not in question:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if token.text.lower() == 'it':
                score = -1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'verb':
        for token in doc:
            score = 0

            if token.dep_ == 'ROOT':
                score += 1

            if lemmatize(strip_punctuation(token.text)) not in \
                    [lemmatize(strip_punctuation(s)) for s in nltk.word_tokenize(question)]:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'closed':
        first = closed_q_choices[0]
        second = closed_q_choices[1]

        # whether each option appears (and is negates)
        appear1 = False
        appear2 = False
        negate1 = False
        negate2 = False
        neg_count = 0
        tokens = nltk.word_tokenize(raw_docs[id])

        for (index, token) in enumerate(tokens):
            if token == 'not' or "n't" in token:
                neg_count += 1

                if index+1 < len(tokens):
                    if tokens[index+1] == first:
                        negate1 = True
                    if tokens[index+1] == second:
                        negate2 = True

            if token == first:
                appear1 = True
            if token == second:
                appear2 = True

        possible_answer = ''
        if appear1 and not appear2:
            if neg_count % 2 == 1:
                possible_answer = second
            else:
                possible_answer = first

        elif appear2 and not appear1:
            if neg_count % 2 == 0:
                possible_answer = second
            else:
                possible_answer = first

        elif appear1 and appear2:
            if negate1 and not negate2:
                possible_answer = second
            elif negate2 and not negate1:
                possible_answer = first
            else:
                possible_answer = second

        if possible_answer != '':
            score += 5
            if score > max_score:
                max_score = score
                answer = possible_answer

    elif qtype == 'why':

        possible_answer = ''
        score = 0

        if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:

            if 'because of' in sent:
                score += 3
                index = sent.index('because of')
                substr = sent[index+11:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break

            elif 'because' in sent:
                score += 3
                index = sent.index('because')
                substr = sent[index + 8:]
                possible_answer = substr

            elif 'due to' in sent:
                score += 3
                index = sent.index('due to')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'reason' in sent:
                score += 2
                index = sent.index('reason')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    index = substr.find('is')
                    if index != -1:
                        possible_answer = substr[index+3]
                    else:
                        index = substr.find('was')
                        if index != -1:
                            possible_answer = substr[index+4]
                        else:
                            possible_answer = sent[sent.index('reason'):]

            elif 'for' in sent:
                score += 1
                index = sent.index('for')
                substr = sent[index + 4:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'since' in sent:
                score += 1
                index = sent.index('since')
                substr = sent[index + 6:]
                possible_answer = substr

            if possible_answer != '' and score > max_score:
                answer = possible_answer
                max_score = score

    # if answer not found, find noun phrases
    if answer == '':
        for np in doc.noun_chunks:
            score = 0

            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = np.text
                
    a = nltk.word_tokenize(answer)
    if len(a) > 0 and a[0].lower() in stop:
        del a[0]
        answer = ''
        for i in range(len(a)):
            if i+1 < len(a) and a[i+1] == "'s":
                answer += a[i]
            else:
                answer += a[i] + ' '
        
    if correct_answer != answer.strip().lower():
        wrong_count += 1
        print('Support: ',sent)
        print('***')
        print('Question: ',question)
        print('***')
        print('Correct: ',correct_answer)
        print('***')
        print('predicted: ',answer.strip().lower())
        print('\n')
        print('\n')

    case_count += 1
    if answer == '':
        empty_count += 1
    
csvFile.close()
print('Empty: ',empty_count)
print('Wrong: ',wrong_count)
print('Total: ',case_count - 1)

Support:  A change of several tens of micrograms in one kilogram is equivalent to the current uncertainty in the value of the Planck constant in SI units.
***
Question:  A kilogram could be definined as having a Planck constant of what value?
***
Correct:  6966662606895999999♠6.62606896×10−34 j⋅s
***
predicted:  several tens




Support:  The most urgent unit on the list for redefinition is the kilogram, whose value has been fixed for all science (since 1889) by the mass of a small cylinder of platinum–iridium alloy kept in a vault just outside Paris.
***
Question:  What is the shape of the object that establishes the base unit of the kilogram?
***
Correct:  cylinder
***
predicted:  most urgent unit




Support:  In modern terms, if J is the total angular momentum of a system with rotational invariance, and Jz the angular momentum measured along any given direction, these quantities can only take on the values where the uncertainty is given as the standard deviation of the measured val

Support:  In the last years of the nineteenth century, Planck was investigating the problem of black-body radiation first posed by Kirchhoff some forty years earlier.
***
Question:  Which scientist associated the Planck constant with a quantum five years after Planck's recognition?
***
Correct:  einstein
***
predicted:  last years




Support:  For example, green light with a wavelength of 555 nanometres (the approximate wavelength to which human eyes are most sensitive) has a frequency of 7014540000000000000♠540 THz (7014540000000000000♠540×1012 Hz).
***
Question:  What is the frequency of the light to which the human eye is most sensitive?
***
Correct:  7014540000000000000♠540 thz
***
predicted:  example




Support:  These proofs are commonly known as the "ultraviolet catastrophe", a name coined by Paul Ehrenfest in 1911.
***
Question:  What name did Paul Ehrenfest give to the proofs from Einstein and Rayleigh & Jeans?
***
Correct:  the `` ultraviolet catastrophe ''
***
predicted:  

Support:  The result is that green light of wavelength 555 nm has an energy of 7005216000000000000♠216 kJ/mol, a typical energy of everyday life.
***
Question:  How much energy does a green light of wavelength 555 nm contain?
***
Correct:  7005216000000000000♠216 kj/mol
***
predicted:  7005216000000000000




Support:  The correct quantization rules for electrons – in which the energy reduces to the Bohr model equation in the case of the hydrogen atom – were given by Heisenberg's matrix mechanics in 1925 and the Schrödinger wave equation in 1926: the reduced Planck constant remains the fundamental quantum of angular momentum.
***
Question:  What scientist used matrix mechanics to bring electron behavior in line with the Bohr model?
***
Correct:  heisenberg
***
predicted:  electrons




Support:  The Rayleigh–Jeans law makes close predictions for a narrow range of values at one limit of temperatures, but the results diverge more and more strongly as temperatures increase.
***
Question: 

Support:  Solar thermal power stations include the 354 megawatt (MW) Solar Energy Generating Systems power plant in the USA, Solnova Solar Power Station (Spain, 150 MW), Andasol solar power station (Spain, 100 MW), Nevada Solar One (USA, 64 MW), PS20 solar power tower (Spain, 20 MW), and the PS10 solar power tower (Spain, 11 MW).
***
Question:  Where is the PS20 solar power tower located?
***
Correct:  spain
***
predicted:  usa




Support:  Solar electricity consumption increased by 58 percent, to 93 terawatt-hours (TWh).
***
Question:  In 2012, solar electricity consumption increased by what percentage?
***
Correct:  58 percent
***
predicted:  58




Support:  Use of wind power in 2012 increased by 18.1 percent, to 521.3 TWh.
***
Question:  Use of wind power in 2012 increased by what percentage?
***
Correct:  18.1 percent
***
predicted:  18.1




Support:  Based on REN21's 2014 report, renewables contributed 19 percent to our energy consumption and 22 percent to our electricity gener

Support:  Innovation has helped, but the main driver of reduced costs has been market expansion.
***
Question:  What is the main driver of reduced costs?
***
Correct:  market expansion
***
predicted:  innovation




Support:  Geothermal power capacity grew from around 1 GW in 1975 to almost 10 GW in 2008.
***
Question:  What was the geothermal capacity in 1975?
***
Correct:  1 gw
***
predicted:  geothermal power capacity




Support:  A 2011 IEA report said: "A portfolio of renewable energy technologies is becoming cost-competitive in an increasingly broad range of circumstances, in some cases providing investment opportunities without the need for specific economic support," and added that "cost reductions in critical technologies, such as wind and solar, are set to continue."
***
Question:  What group stated that "cost reductions in critical technologies, such as wind and solar, are set to continue?"
***
Correct:  iea
***
predicted:  2011 iea report




Support:  As of 2011[update], 

Support:  Special praise of this film by the influential British critic Dilys Powell was highly significant to Spielberg's career.
***
Question:  What British critic helped boost Steven Spielberg's career?
***
Correct:  dilys powell
***
predicted:  special praise




Support:  This theme is arguably the most autobiographical aspect of Spielberg's films, since Spielberg himself was affected by his parents' divorce as a child and by the absence of his father.
***
Question:  Which of Spielberg's parents liked sci-fi?
***
Correct:  his father
***
predicted:  theme




Support:  However, much to the surprise of many, Spielberg did not get a Best Director nomination.
***
Question:  When did Spielberg and Irving get back together?
***
Correct:  1984
***
predicted:  spielberg




Support:  It was set for release on April 25, 2014, with Anne Hathaway and Chris Hemsworth set to star, but Spielberg postponed production indefinitely in January 2013, just before it had been set to begin.
***
Questi

Support:  Munich depicts Avner as a man away from his wife and newborn daughter.
***
Question:  What issue did Spielberg address in his movie Munich?
***
Correct:  terrorism
***
predicted:  avner




Support:  Spielberg was born in Cincinnati, Ohio, to an Orthodox Jewish family.
***
Question:  Where was Steven Spielberg born?
***
Correct:  cincinnati , ohio
***
predicted:  cincinnati




Support:  In 2011, Spielberg launched Falling Skies, a science fiction television series, on the TNT network.
***
Question:  Which channel aired 'Falling Skies'?
***
Correct:  tnt
***
predicted:  spielberg




Support:  It was released just four days after The Adventures of Tintin, on December 25, 2011.
***
Question:  When did Adventures of Tintin debut?
***
Correct:  october 22 , 2011
***
predicted:  four days




Support:  After two forays into more serious dramatic films, Spielberg then directed the third Indiana Jones film, 1989's Indiana Jones and the Last Crusade.
***
Question:  When did the firs

Support:  Spielberg followed with War Horse, shot in England in the summer of 2010.
***
Question:  Who distributed 'War Horse'?
***
Correct:  disney
***
predicted:  spielberg




Support:  Spielberg played many of LucasArts adventure games, including the first Monkey Island games.
***
Question:  What movie was Spielberg working on when he first played Pong?
***
Correct:  jaws
***
predicted:  lucasarts




Support:  Spielberg then revisited his Close Encounters project and, with financial backing from Columbia Pictures, released Close Encounters: The Special Edition in 1980.
***
Question:  How many Oscars did Close Encounters win?
***
Correct:  two
***
predicted:  spielberg




Support:  Spielberg has filmed and is currently in post-production on an adaptation of Roald Dahl's celebrated children's story The BFG.
***
Question:  Where was 'Lincoln' filmed?
***
Correct:  richmond , virginia
***
predicted:  spielberg




Support:  Since playing Pong while filming Jaws in 1974, Spielberg has

Support:  On July 15, 2006, Spielberg was also awarded the Gold Hugo Lifetime Achievement Award at the Summer Gala of the Chicago International Film Festival, and also was awarded a Kennedy Center honour on December 3.
***
Question:  Where was Spielberg honored on Dec 3, 2006?
***
Correct:  kennedy center
***
predicted:  july 15, 2006




Support:  Years later, Spielberg recalled to a magazine interviewer, "My dad's still-camera was broken, so I asked the scoutmaster if I could tell a story with my father's movie camera.
***
Question:  What movie did Spielberg collaborate with Cruise for?
***
Correct:  minority report
***
predicted:  magazine interviewer




Support:  Bridge of Spies received positive reviews from critics, and was nominated for six Academy Awards, including Best Picture.
***
Question:  How many Academy Awards did the film "Jaws" win?
***
Correct:  three
***
predicted:  six




Support:  He was later given the opportunity to make a short film for theatrical release, the

Support:  According to Warren Buckland, these themes are portrayed through the use of low height camera tracking shots, which have become one of Spielberg's directing trademarks.
***
Question:  When did Spielberg announce what would become 'Interstellar'?
***
Correct:  june 2006
***
predicted:  warren buckland




Support:  Spielberg received a score of 47, meaning 47% of the US believes he is influential.
***
Question:  How much of the US believes Spielberg is influential?
***
Correct:  47 %
***
predicted:  47




Support:  Spielberg was born in Cincinnati, Ohio, to an Orthodox Jewish family.
***
Question:  Where was Spielberg born?
***
Correct:  cincinnati , ohio
***
predicted:  cincinnati




Support:  ":248 Jaws made Spielberg a household name and one of America's youngest multi-millionaires, allowing him a great deal of autonomy for his future projects.
***
Question:  What was the film "Jaws" nominated for?
***
Correct:  best picture
***
predicted:  248 jaws




Support:  He has b

Support:  Two assumptions underpinned the British approach to HAA fire; first, aimed fire was the primary method and this was enabled by predicting gun data from visually tracking the target and having its height.
***
Question:  What was the primary method for HAA fire?
***
Correct:  aimed fire
***
predicted:  two assumptions




Support:  Rheinmetall in Germany developed an automatic 20 mm in the 1920s and Oerlikon in Switzerland had acquired the patent to an automatic 20 mm gun designed in Germany during World War I. Germany introduced the rapid-fire 2 cm FlaK 30 and later in the decade it was redesigned by Mauser-Werke and became the 2 cm FlaK 38.
***
Question:  Which company redesigned the rapid fire 2 cm FlaK 30?
***
Correct:  mauser-werke
***
predicted:  oerlikon




Support:  Firing an RPG at steep angles poses a danger to the user, because the backblast from firing reflects off the ground.
***
Question:  An RPG fired at a steep angle has what reflecting off the ground?
***
Corr

Support:  The SAMs launched by individuals are known in the United States as the Man-Portable Air Defence Systems (MANPADS).
***
Question:  What are the SAMs called that are launched by individuals in the US?
***
Correct:  manpads
***
predicted:  sams




Support:  The British adopted "effective ceiling", meaning the altitude at which a gun could deliver a series of shells against a moving target; this could be constrained by maximum fuse running time as well as the gun's capability.
***
Question:  What term is used to describe the altitude for a gun to shoot shells against a target that is moving?
***
Correct:  `` effective ceiling ''
***
predicted:  british




Support:  The British recognised the need for anti-aircraft capability a few weeks before World War I broke out; on 8 July 1914, the New York Times reported that the British government had decided to 'dot the coasts of the British Isles with a series of towers, each armed with two quick-firing guns of special design,' while 'a

Support:  Since most attacks were at night, searchlights were soon used, and acoustic methods of detection and locating were developed.
***
Question:  What was used because of night attacks?
***
Correct:  searchlights
***
predicted:  attacks




Support:  Another aspect of anti-aircraft defence was the use of barrage balloons to act as physical obstacle initially to bomber aircraft over cities and later for ground attack aircraft over the Normandy invasion fleets.
***
Question:  What acted as a physical obstacle to anti-aircraft defence?
***
Correct:  barrage balloons
***
predicted:  another aspect




Support:  The French Brocq system was electrical, the operator entered the target range and had displays at guns; it was used with their 75 mm.
***
Question:  Which system had electrical tracking?
***
Correct:  french brocq
***
predicted:  french brocq system




Support:  Krupp's designs included adaptations of their 65 mm 9-pounder, a 75 mm 12-pounder, and even a 105 mm gun.
***
Questi

Support:  If current developments continue, some[who?]
***
Question:  What is the second purpose of air defence?
***
Correct:  destroy them
***
predicted:  current developments




Support:  Also available to the Americans at the start of the war was the 120 mm M1 gun stratosphere gun, which was the most powerful AA gun with an impressive 60,000 ft (18 km) altitude capability.
***
Question:  What was the altitude range in feet of the stratosphere gun?
***
Correct:  60,000 ft
***
predicted:  americans




Support:  However, the performance of both 3.7 and 4.5-in guns was limited by their standard fuse No 199, with a 30-second running time, although a new mechanical time fuse giving 43 seconds was nearing readiness.
***
Question:  How long of a running time did the Number 199 fuse have?
***
Correct:  30-second running time
***
predicted:  4.5-in




Support:  Second, that the target would maintain a steady course, speed and height.
***
Question:  Another assumption was that the target wo

Support:  The interceptor aircraft (or simply interceptor) is a type of fighter aircraft designed specifically to intercept and destroy enemy aircraft, particularly bombers, usually relying on high speed and altitude capabilities.
***
Question:  What kind of aircraft is used to intercept and destroy other aircraft?
***
Correct:  the interceptor aircraft
***
predicted:  interceptor aircraft




Support:  The critical issue is to hit a target moving in three-dimensional space; an attack must not only match these three coordinates, but must do so at the time the target is at that position.
***
Question:  What two things must be considered in regards to the projectile and target?
***
Correct:  speed and direction
***
predicted:  critical issue




Support:  If an attacker is able to penetrate this layer, then the next layers would come from the surface-to-air missiles carried by the carrier's escorts; the area-defence missiles, such as the RIM-67 Standard, with a range of up to 100 nmi, an

Support:  Area air defence, the air defence of a specific area or location, (as opposed to point defence), have historically been operated by both armies (Anti-Aircraft Command in the British Army, for instance) and Air Forces (the United States Air Force's CIM-10 Bomarc).
***
Question:  Armies as well as what group have operated area air defences?
***
Correct:  air forces
***
predicted:  anti-aircraft command




Support:  In Britain and some other armies, the single artillery branch has been responsible for both home and overseas ground-based air defence, although there was divided responsibility with the Royal Navy for air defence of the British Isles in World War I.
***
Question:  Which group is heavily responsible for passive air defence?
***
Correct:  ground forces
***
predicted:  britain




Support:  The US started an upgrade of their defences using the Nike Ajax missile, and soon the larger anti-aircraft guns disappeared.
***
Question:  The missile started being used more ofte

Support:  The policy was developed by the Clinton Administration in 1999.
***
Question:  Which presidential administration developed Safe Harbor policy?
***
Correct:  the clinton administration
***
predicted:  clinton administration




Support:  The whooping crane population by 1941 was estimated at about only 16 birds still in the wild.
***
Question:  What was the estimated population of the whooping crane in 1941?
***
Correct:  16 birds
***
predicted:  whooping crane population




Support:  The provision of the law in Section 4 that establishes critical habitat is a regulatory link between habitat protection and recovery goals, requiring the identification and protection of all lands, water and air necessary to recover endangered species.
***
Question:  What section of the Endangered Species Act establishes critical habitat regulations?
***
Correct:  section 4
***
predicted:  provision




Support:  The Secretary may also provide reasonable and necessary costs incurred for the care

Support:  This development was opposed by the Swazi National Council (liqoqo).
***
Question:  What is another way to refer to the Swazi National Council?
***
Correct:  liqoqo
***
predicted:  development




Support:  Anglican, Protestant and indigenous African churches, including African Zionist, constitute the majority of the Christians (40%), followed by Roman Catholicism at 20% of the population.
***
Question:  What amount of Swazi Christians are Roman Catholic?
***
Correct:  20 %
***
predicted:  anglican, protestant and indigenous african churches




Support:  Low agricultural productivity in the SNLs, repeated droughts, the devastating effect of HIV/AIDS and an overly large and inefficient government sector are likely contributing factors.
***
Question:  How large in square kilometers is Swaziland?
***
Correct:  17,364 km2
***
predicted:  low agricultural productivity




Support:  In 2004, the Swaziland government acknowledged for the first time that it suffered an AIDS crisis, 

Support:  Swaziland's currency, the lilangeni, is pegged to the South African rand.
***
Question:  What form of currency is the lilangeni fixed to?
***
Correct:  south african rand
***
predicted:  swaziland's currency




Support:  In "umchwasho", all young girls were placed in a female age-regiment.
***
Question:  Until what date was Swaziland bound by umchwasho?
***
Correct:  19 august 2005
***
predicted:  young girls




Support:  A campus of Limkokwing University of Creative Technology can be found at Sidvwashini, a suburb of the capital Mbabane.
***
Question:  What is the capital of Swaziland?
***
Correct:  mbabane
***
predicted:  campus




Support:  In 1903, after British victory in the Anglo-Boer war, Swaziland became a British protectorate.
***
Question:  What 1903 conflict involving the British caused Swaziland to become a protectorate?
***
Correct:  anglo-boer war
***
predicted:  british victory




Support:  On the positive side, the external debt burden has declined marked

Support:  Attempts to organize a film festival that had begun in 1954 within the framework of the Golrizan Festival, bore fruits in the form of the Sepas Festival in 1969.
***
Question:  What Iranian film festival was created in 1973?
***
Correct:  tehran world festival
***
predicted:  attempts




Support:  Following the premature death of Alexander, Iran came under the control of the Hellenistic Seleucid Empire.
***
Question:  What sea did the Achaemenid Empire control the majority of the coastal regions of?
***
Correct:  the black sea
***
predicted:  alexander




Support:  The Assembly of Experts elects and dismisses the Supreme Leader on the basis of qualifications and popular esteem.
***
Question:  How often does the Assembly of Experts meet?
***
Correct:  one week annually
***
predicted:  assembly




Support:  Since the 1979 Revolution, to overcome foreign embargoes, Iran has developed its own military industry, produced its own tanks, armored personnel carriers, guided missile

Support:  Iran has leading manufacturing industries in the fields of car-manufacture and transportation, construction materials, home appliances, food and agricultural goods, armaments, pharmaceuticals, information technology, power and petrochemicals in the Middle East.
***
Question:  What type of materials is Iran a leading manufacturer of in the Middle East?
***
Correct:  construction materials
***
predicted:  leading manufacturing industries




Support:  As a result of Ottoman hostilities across the border, a large amount of the Assyrians of Iran were massacred by the Ottoman armies, notably in and around Urmia.
***
Question:  What does Iran border with to its north?
***
Correct:  the caspian sea
***
predicted:  result




Support:  Iran reached the pinnacle of its power during the Achaemenid Empire founded by Cyrus the Great in 550 BC, which at its greatest extent comprised major portions of the ancient world, stretching from parts of the Balkans (Thrace-Macedonia, Bulgaria-Paeon

Support:  Together with their neighboring arch-rival, the Roman-Byzantines, they made up the world's two most dominant powers at the time, for over four centuries.
***
Question:  What was the first world goverment the world had seen at the time it existed?
***
Correct:  the achaemenid empire
***
predicted:  neighboring arch-rival




Support:  After the Revolution of 1979, as the new government imposed new laws and standards, a new age in Iranian cinema emerged, starting with Viva... by Khosrow Sinai and followed by many other directors, such as Abbas Kiarostami and Jafar Panahi.
***
Question:  What was the name of Sinai's film that ushered in Iran's new era of film after the 1979 Revolution?
***
Correct:  viva ...
***
predicted:  revolution of 1979




Support:  Attempts by the Jimmy Carter administration to negotiate for the release of the hostages, and a failed rescue attempt, helped force Carter out of office and brought Ronald Reagan to power.
***
Question:  Who won the 1980 US Pr

Support:  In August 2014, Maryam Mirzakhani became the first-ever woman, as well as the first-ever Iranian, to receive the Fields Medal, the highest prize in mathematics.
***
Question:  In 2014, Maryam Mirzakhani became the first women ever and first Iranian to win what prestigious mathematics award? 
***
Correct:  the fields medal
***
predicted:  fields medal




Support:  According to many of them, the number of ethnic Azerbaijanis in Iran comprises between 21.6–30% of the total population, with the majority holding it on 25%.cd In any case, the largest population of Azerbaijanis in the world live in Iran.
***
Question:  What is the percentage range of Iran's total population is seemingly comprised of Azerbaijanis?
***
Correct:  between 21.6–30 %
***
predicted:  21.6–30 %




Support:  The Persian Campaign commenced furthermore during World War I in Northwestern Iran after an Ottoman invasion, as part of the Middle Eastern Theatre of World War I.
***
Question:  When did the Iran-Iraq

Support:  Attempts to organize a film festival that had begun in 1954 within the framework of the Golrizan Festival, bore fruits in the form of the Sepas Festival in 1969.
***
Question:  What Iranian film festival in 1954 was the progenitor of future film festivals in 1969 and 1973?
***
Correct:  the golrizan festival
***
predicted:  attempts




Support:  However, nationalized industries such as the bonyads have often been managed badly, making them ineffective and uncompetitive with years.
***
Question:  What Iranian nationalized industry has been noncompetitive and managed badly? 
***
Correct:  the bonyads
***
predicted:  nationalized industries




Support:  The empire at its peak ruled over 44% of the world's population, the highest such figure for any empire in history.
***
Question:  Iran has the highest population of what group in the world?
***
Correct:  azerbaijanis
***
predicted:  world's population




Support:  The coastal plains of the Persian Gulf and Gulf of Oman in sou

KeyboardInterrupt: 