In [1]:
import nltk
import json
import spacy
from nltk.corpus import stopwords
from math import log
from collections import defaultdict, Counter
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer

# Variables

In [2]:
OPEN_QUESTION_WORDS = ['what','who','whose','whom','where','when','why','how',
                       'which',"what's","who's","where's","how's"]
CLOSED_QUESTION_WORDS = ['is','are','am','was','were','do','does,','did','can',
                         'could','will','would','shall','should','have','has',
                         'had']

# Stop words
stop = set(stopwords.words('english'))

lmtz = WordNetLemmatizer()

with open('testing.json') as json_data:
    test = json.load(json_data)

with open('documents.json') as json_data:
    documents = json.load(json_data)

# Spacy toolkit
nlp = spacy.load('en_core_web_sm')

punc = set(punctuation)

In [3]:
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punc)

In [4]:
def lemmatize(token):
    lemma = lmtz.lemmatize(token, 'v')
    if lemma == token:
        lemma = lmtz.lemmatize(token, 'n')
    return lemma


def extract_term_freqs(doc):
    tfs = {}
    for token in nltk.word_tokenize(doc):
        lemma = lemmatize(token.lower())
        if lemma not in stop and lemma.isalpha():
            tfs[lemma] = tfs.get(lemma, 0) + 1
    return tfs


def compute_doc_freqs(doc_term_freqs):
    dfs = Counter()
    for tfs in doc_term_freqs.values():
        for term in tfs.keys():
            dfs[term] += 1
    return dfs


def query_vsm(query, index, k=20):
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator.most_common(k)


# Find the question word
def get_qword(question):
    tokens = nltk.word_tokenize(question.lower())
    for token in tokens:
        if token in OPEN_QUESTION_WORDS:
            return token
    for token in tokens:
        if token in CLOSED_QUESTION_WORDS:
            return token
    return 'others'

In [5]:
# length of longest same sequences of keywords
def get_overlap(sent1, sent2):
    tokens1 = []
    tokens2 = []

    for token in nltk.word_tokenize(strip_punctuation(sent1.lower())):
        lemma = lemmatize(token)
        if lemma not in stop:
            tokens1.append(lemma)

    for token in nltk.word_tokenize(strip_punctuation(sent2.lower())):
        lemma = lemmatize(token)
        if lemma not in stop:
            tokens2.append(lemma)

    max = 0
    for i in range(len(tokens1)):
        for j in range(len(tokens2)):

            if tokens1[i] == tokens2[j]:
                length = 1

                ii = i + 1
                jj = j + 1
                while ii < len(tokens1) and jj < len(tokens2) and \
                        tokens1[ii] == tokens2[jj]:
                    ii += 1
                    jj += 1
                    length += 1

                if length > max:
                    max = length

    return max

# Write to test file

In [10]:
file = open('high.csv', 'w')
file.write('id,answer\n')

case_count = 0
# test = [test[17]]
for test_case in test:
    question = test_case['question']
    docid = test_case['docid']

    # Convert doc into one string, then tokenize sentences
    corpus = ''
    for para in documents[docid]['text']:
        corpus += para + ' '

    # sentence as a document
    raw_docs = nltk.sent_tokenize(corpus)

    # TFIDF
    doc_term_freqs = {}
    for (id, raw_doc) in enumerate(raw_docs):
        term_freqs = extract_term_freqs(raw_doc)
        doc_term_freqs[id] = term_freqs
    M = len(doc_term_freqs)

    doc_freqs = compute_doc_freqs(doc_term_freqs)

    vsm_inverted_index = defaultdict(list)
    for docid, term_freqs in doc_term_freqs.items():
        N = sum(term_freqs.values())
        length = 0

        # find tf*idf values and accumulate sum of squares
        tfidf_values = []
        for term, count in term_freqs.items():
            tfidf = float(count) / N * log(M / float(doc_freqs[term]))
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2

        # normalise documents by length and insert into index
        length = length ** 0.5
        for term, tfidf in tfidf_values:
            # inversion of the indexing, term -> (doc_id, score)
            vsm_inverted_index[term].append([docid, tfidf / length])

    for term, docids in vsm_inverted_index.items():
        docids.sort()

    terms = extract_term_freqs(question)
    results = query_vsm(terms, vsm_inverted_index)


    # Step 2
    # Analyse question type
    qword = get_qword(question)

    # the word after question word, such as 'what value', 'which gender'
    next_token = ''

    qtype = ''

    # dependency parsing
    dep = ''

    # head word
    head = ''

    # head dependency
    head_dep = ''

    # subject, root, object
    nsubj = ''
    ROOT = ''
    dobj = ''

    # yes or no questions have two options
    closed_q_choices = ('', '')

    doc = nlp(question)

    tokens = nltk.word_tokenize(question.lower())

    # get next word
    if qword in tokens:
        if tokens.index(qword) < len(tokens) - 1:
            next_token = tokens[tokens.index(qword) + 1]

    # get structure of sentence
    for token in doc:
        if 'nsubj' in token.dep_:
            nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            dobj = lemmatize(strip_punctuation(token.text))

    # for noun (phrase) questions, get answer dependency
    for chunk in doc.noun_chunks:
        if qword in chunk.text:
            dep = chunk.root.dep_
            head = lemmatize(strip_punctuation(chunk.root.head.text))
            head_dep = chunk.root.head.dep_

    # determine answer type
    if 'stand for' in question or 'abbreviat' in question:
        qtype = 'abrv'

    elif qword in ['who',"who's",'whom','whose']:
        qtype = 'who'

    elif qword == 'when':
        qtype = 'when'

    elif qword in ['where',"where's"]:
        qtype = 'where'

    elif qword in ['how',"how's"]:
        if next_token == 'much':
            qtype = 'MONEY'
        elif next_token == 'many':
            qtype = 'CARDINAL'
        elif next_token == 'long':
            qtype = 'DATE'
        elif next_token in ['far','big','wide','deep','tall','high','fast','heavy']:
            qtype = 'QUANTITY'
        elif next_token in ['old','young']:
            qtype = 'DATE'
        elif next_token in ['does','did','do','have','has','had','should',
                              'can','could','will','would','must']:
            if dobj != '':
                qtype = 'adj'
            else:
                qtype = 'verb'

    elif qword in ['what', "what's", 'which']:

        if 'year'in tokens or \
                'day' in tokens or \
                'month' in tokens or \
                'era' in tokens or \
                'age' in tokens or \
                'century' in tokens or \
                'week' in tokens or \
                'period' in tokens or \
                'dynasty' in tokens:
            qtype = 'DATE'

        elif 'company' in tokens or \
                'organization' in tokens or \
                'organisation' in tokens or \
                'corporation' in tokens or \
                'institution' in tokens or \
                'university' in tokens or \
                'corporation' in tokens or \
                'association' in tokens or \
                'union' in tokens or \
                'agency' in tokens:
            qtype = 'ORG'

        elif 'city' in tokens or \
                'country' in tokens or \
                'state' in tokens or \
                'province' in tokens or \
                'county' in tokens:
            qtype = 'GPE'

        elif 'place' in tokens or \
                'river' in tokens or \
                'mountain' in tokens or \
                'ocean' in tokens or \
                'region' in tokens or \
                'area' in tokens or \
                'sea' in tokens or \
                'lake' in tokens or \
                'continent' in tokens or \
                'location' in tokens or \
                'forest' in tokens or \
                'jungle' in tokens:
            qtype = 'LOC'

        elif 'nationality' in tokens:
            qtype = 'NORP'

        elif 'building' in tokens or \
            'airport' in tokens or \
            'highway' in tokens or \
            'bridge' in tokens or \
            'harbour' in tokens or \
            'harbor' in tokens or \
            'port' in tokens or \
            'dam' in tokens:
            qtype = 'FACILITY'

        elif 'hurricane' in tokens or \
            'battle' in tokens or \
            'war' in tokens:
            qtype = 'EVENT'

        elif 'book' in tokens or \
            'novel' in tokens or \
            'song' in tokens or \
            'music' in tokens or \
            'painting' in tokens:
            qtype = 'WORK_OF_ART'

        elif 'language' in tokens or \
                'speak' in tokens:
            qtype = 'LANGUAGE'

        elif 'percentage' in tokens or 'percent' in tokens:
            qtype = 'PERCENT'

        elif 'value' in tokens or \
                'distance' in tokens or \
                'size' in tokens or \
                'length' in tokens or \
                'depth' in tokens or \
                'height' in tokens or \
                'density' in tokens or \
                'speed' in tokens or \
                'weight' in tokens or \
                'area' in tokens or \
                'temperature' in tokens or \
                'volume' in tokens:
            qtype = 'QUANTITY'

        elif 'number' in tokens:
            qtype = 'CARDINAL'

        elif 'price' in tokens:
            qtype = 'MONEY'

        else:
            # what...do type question
            tokens.remove(next_token)
            if 'do' in tokens:
                qtype = 'verb'
            else:
                qtype = 'noun'

    elif qword == 'why':
        qtype = 'why'

    elif qword in CLOSED_QUESTION_WORDS:
        qtype = 'closed'

        # answer is one of the 'or' options in the question
        if 'or' in tokens:
            index = tokens.index('or')
            prev1 = tokens[index - 1]
            next1 = tokens[index + 1]
            tag_tokens = nltk.pos_tag(tokens)

            tag = tag_tokens[index - 1][1]

            # if answer is a noun
            if tag in ['NN', 'NNP', 'NNS', 'NNPS']:
                for chunk in doc.noun_chunks:
                    if prev1 in chunk.text:
                        first = chunk.text
                    if next1 in chunk.text:
                        second = chunk.text
                closed_q_choices = (first, second)
            else:
                closed_q_choices = (prev1, next1)
        else:
            qtype = 'others'

    # re-rank the 20 sentences
    scores = {}
    for id, _ in results:
        sent = raw_docs[id]
        doc = nlp(sent)

        score = get_overlap(sent, question)

        if qtype == 'who':
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    score += 1

        elif qtype == 'when':
            for ent in doc.ents:
                if ent.label_ == 'TIME' or ent.label_ == "DATE":
                    score += 1

        elif qtype == 'where':
            for ent in doc.ents:
                if ent.label_ == 'GPE' or ent.label_ == "LOC":
                    score += 1

        elif qtype in ['LANGUAGE','WORK_OF_ART','EVENT','NORP','FACILITY',
                       'GPE','DATE','TIME','PERCENT','QUANTITY','CARDINAL',
                     'MONEY','PERSON','ORG','LOC']:
            for ent in doc.ents:
                if ent.label_ == qtype:
                    score += 1

        elif qtype == 'adj':
            for token in doc:
                if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                    score += 1

        elif qtype == 'verb':
            for token in doc:
                if token.dep_ == 'ROOT':
                    score += 1

        elif qtype == 'closed':
            first = closed_q_choices[0]
            second = closed_q_choices[1]

            score += (first in sent) + (second in sent)

        elif qtype == 'why':
            if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:
                score += 1

        scores[id] = score

    rank = {}
    for id, sim in results:
        max_score = scores[max(scores, key=scores.get)]
        if max_score != 0:
            rank[id] = sim * 0.5 + (scores[id] / max_score * 0.5)
        else:
            rank[id] = sim
    
    # sentence with highest rank
    index = max(rank, key=rank.get)
    sent = raw_docs[index]
    doc = nlp(sent)

    # find sentence structure
    sent_nsubj = ''
    sent_ROOT = ''
    sent_dobj = ''
    for token in doc:
        if 'nsubj' in token.dep_:
            sent_nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            sent_ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            sent_dobj = lemmatize(strip_punctuation(token.text))
            
    # find answer with highest score
    max_score = -1
    answer = ''
    
    if qtype == 'who':
        for np in doc.noun_chunks:
            score = 0
            
            if np in doc.ents:
                for ent in doc.ents:
                    if np.text in ent.text and ent.label_ == 'PERSON':
                            score += 3

            # find NP dependency
            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1

            if np.text.lower() == 'it':
                score = -1
                
            if score > max_score:
                max_score = score
                answer = np.text

    elif qtype == 'when':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'TIME' or ent.label_ == "DATE":
                score += 3
                
            if ent.text not in question:
                score += 1
            
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'where':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'GPE' or ent.label_ == "LOC":
                score += 3

            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text
            
    elif qtype in ['LANGUAGE', 'WORK_OF_ART', 'EVENT', 'NORP', 'FACILITY',
                   'GPE', 'DATE', 'TIME', 'PERCENT', 'QUANTITY', 'CARDINAL',
                   'MONEY', 'PERSON', 'ORG', 'LOC']:
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == qtype:
                score += 3
            
            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'abrv':
        abrv = ''
        qdoc = nlp(question)
        for token in qdoc:
            text = token.text
            if len(text) >= 2 and text.isupper() and text.isalpha():
                abrv = text.lower()

        if abrv == '' and 'stand for' in question:
            tokens = question.lower().split(' ')
            abrv = tokens[tokens.index('stand')-1]

        if abrv != '':
            tokens = nltk.word_tokenize(sent)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    phrase = token.lower()
                    initials = phrase[0]

                    while i+k < len(tokens) and tokens[i+k][0].isupper():
                        phrase = phrase + ' ' + tokens[i+k].lower()
                        initials += tokens[i+k][0].lower()
                        k += 1

                    phrase = phrase.strip()
                    if initials == abrv:
                        answer = phrase

        else:
            tokens = nltk.word_tokenize(question)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    initials = token[0].lower()

                    while i + k < len(tokens) and tokens[i + k][0].isupper():
                        initials += tokens[i + k][0].lower()
                        k += 1

                    if len(initials) >= 2:
                        answer = initials

    elif qtype == 'adj':
        for token in doc:
            score = 0
            
            if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                score += 3

            token_dep = token.dep_
            token_head = lemmatize(strip_punctuation(token.head.text))
            token_head_dep = token.head.dep_

            if token_dep == dep:
                score += 1
            if token_head == head:
                score += 1
            if token_head_dep == head_dep:
                score += 1

            if token.text not in question:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if token.text.lower() == 'it':
                score = -1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'verb':
        for token in doc:
            score = 0

            if token.dep_ == 'ROOT':
                score += 1

            if lemmatize(strip_punctuation(token.text)) not in \
                    [lemmatize(strip_punctuation(s)) for s in nltk.word_tokenize(question)]:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'closed':
        first = closed_q_choices[0]
        second = closed_q_choices[1]

        # whether each option appears (and is negates)
        appear1 = False
        appear2 = False
        negate1 = False
        negate2 = False
        neg_count = 0
        tokens = nltk.word_tokenize(raw_docs[id])

        for (index, token) in enumerate(tokens):
            if token == 'not' or "n't" in token:
                neg_count += 1

                if index+1 < len(tokens):
                    if tokens[index+1] == first:
                        negate1 = True
                    if tokens[index+1] == second:
                        negate2 = True

            if token == first:
                appear1 = True
            if token == second:
                appear2 = True

        possible_answer = ''
        if appear1 and not appear2:
            if neg_count % 2 == 1:
                possible_answer = second
            else:
                possible_answer = first

        elif appear2 and not appear1:
            if neg_count % 2 == 0:
                possible_answer = second
            else:
                possible_answer = first

        elif appear1 and appear2:
            if negate1 and not negate2:
                possible_answer = second
            elif negate2 and not negate1:
                possible_answer = first
            else:
                possible_answer = second

        if possible_answer != '':
            score += 5
            if score > max_score:
                max_score = score
                answer = possible_answer

    elif qtype == 'why':

        possible_answer = ''
        score = 0

        if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:

            if 'because of' in sent:
                score += 3
                index = sent.index('because of')
                substr = sent[index+11:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break

            elif 'because' in sent:
                score += 3
                index = sent.index('because')
                substr = sent[index + 8:]
                possible_answer = substr

            elif 'due to' in sent:
                score += 3
                index = sent.index('due to')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'reason' in sent:
                score += 2
                index = sent.index('reason')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    index = substr.find('is')
                    if index != -1:
                        possible_answer = substr[index+3]
                    else:
                        index = substr.find('was')
                        if index != -1:
                            possible_answer = substr[index+4]
                        else:
                            possible_answer = sent[sent.index('reason'):]

            elif 'for' in sent:
                score += 1
                index = sent.index('for')
                substr = sent[index + 4:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'since' in sent:
                score += 1
                index = sent.index('since')
                substr = sent[index + 6:]
                possible_answer = substr

            if possible_answer != '' and score > max_score:
                answer = possible_answer
                max_score = score

    # if answer not found, find noun phrases
    if answer == '':
        for np in doc.noun_chunks:
            score = 0

            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1

            if np.text.lower() == 'it':
                score = 0

            if score > max_score:
                max_score = score
                answer = np.text

    file.write(str(case_count))
    file.write(',')
    file.write(strip_punctuation(answer).strip().lower())
    file.write('\n')
    print(case_count,' ',answer)
    case_count += 1

file.close()

0   a combination
1   addition
2   the browser's layout engine
3   Its Opera-mini version
4   late 2004
5   Windows
6   1995
7   Most browsers
8   Marc Andreessen
9   The first web browser
10   competition
11   dominance
12   internet relay chat
13   January
14   Every major web browser
15   January 2003
16   January 2009
17   file transfer protocol
18   Google
19   the case
20   rich user interfaces
21   2002
22   August 2011
23   All major web browsers
24   Chrome's user-base
25   the development
26   December 2011
27   January 2003
28   Netscape
29   Apple's Safari
30   The rapid development
31   The prefix
32   the Mozilla Foundation
33   private networks
34   the Mac
35   a comparison
36   1994
37   a user interface
38   addition
39   Major browsers
40   Information resources
41   live bookmarks
42   a more traditional feed reader
43   Bookmarks
44   year
45   The prefix
46   browser software
47   the File Transfer Protocol
48   Microsoft Corp
49   Mobile Safari
50   Most web brow

436   1066
437   British Gas
438   West Quay
439   2015
440   2
441   parts
442   Eastleigh
443   The Princess Alexandra Dock
444   The vast majority
445   classical concerts
446   The Queen Mary
447   Southampton
448   early May
449   Uni-link
450   cent
451   The third ferry
452   the University of Southampton
453   the 13th century
454   provide
455   1966
456   Hampshire
457   West Quay
458   headquarters
459   The city
460   additional police stations
461   Local train services
462   two
463   UK
464   Solent
465   P&O Cruises
466   27 June 1640
467   The college
468   840
469   the centuries
470   1233
471   University Hospital Southampton NHS Foundation Trust
472   the biggest operator
473   4.2
474   1959
475   354
476   government figures
477   The M27
478   The Talking Heads
479   Hanover Buildings
480   The city
481   December 2007
482   the 12th century
483   traffic congestion
484   Southampton
485   2004
486   1233
487   This built-up area
488   three fire stations
489   

858   100 kPa
859   1843
860   the winds
861   This occultation
862   over 50%
863   The focus
864   The first scientifically useful observation
865   25 August 1989
866   That credit
867   his discovery
868   1821
869   only 0.25%
870   roughly 40
871   The upper-level clouds
872   the Sun
873   The first and so far only object
874   Dark spots
875   Adams
876   17
877   the Le Verrier Ring
878   The discovery
879   Neptune
880   The current most widely accepted explanation
881   Nahuatl
882   2019
883   1:1
884   the penultimate known planet
885   Uranus
886   Neptune
887   2006
888   23
889   High-altitude clouds
890   The first and so far only object
891   14
892   Jupiter
893   The most heavily populated resonance
894   the Le Verrier Ring
895   Sun
896   Voyager 2's arrival
897   high-altitude cloud bands
898   Neptune's atmospheric methane content
899   the months
900   The discovery
901   the evening
902   December 1612
903   These altitudes
904   vortex structures
905   Neptun

1240   1846
1241   many jurisdictions
1242   Other arguments
1243   This definition
1244   three
1245   The term
1246   Infringement
1247   a specific technological problem
1248   these lines
1249   1883
1250   TRIPs
1251   copyright
1252   Copyright infringement
1253   The Recording Industry Association
1254   referring
1255   Stephan Kinsella
1256   the United States
1257   the context
1258   an equitable right
1259   the context
1260   Other recent developments
1261   1791
1262   May 2011
1263   1998
1264   Trade secret misappropriation
1265   2001
1266   October 1845
1267   civil law
1268   1883
1269   one year's
1270   Some critics
1271   17 The stated objective
1272   the Motion Picture Association of America
1273   UK
1274   Trademark infringement
1275   millions
1276   high-tech fields
1277   native cultures
1278   Some critics
1279   intellectual-property rights
1280   native cultures
1281   Berne
1282   the detriment
1283   Infringement
1284   the assumption
1285   He
1286   

1669   the Qing
1670   Li
1671   The remaining Banners
1672   1681
1673   This multi-ethnic force
1674   The Manchus
1675   Qianlong
1676   The terms
1677   1683
1678   Beijing
1679   these military reforms
1680   The Qing dynasty
1681   Qing
1682   The Suiyuan Shidan
1683   1661
1684   1782
1685   1637
1686   full control
1687   The Qing dynasty
1688   Nurhaci
1689   new but not exactly modern Chinese armies
1690   the late 19th century
1691   addition
1692   The emperors
1693   1641
1694   Dorgon's controversial July 1645 edict
1695   Manchu Generals
1696   November
1697   These units
1698   The early Manchu rulers
1699   Qīng Cháo
1700   pro-Japanese Koreans
1701   1870
1702   Yaqub Beg
1703   Wu
1704   the short story form
1705   The move
1706   Manchuria
1707   1711
1708   Guangxu
1709   too few ethnic Manchus
1710   China
1711   The remaining Banners
1712   300 million
1713   The early Manchu rulers
1714   the 1850s
1715   Ningguta
1716   The chief
1717   Jiangyin
1718   1648
171

2075   March 2003
2076   Britain
2077   Queen Elizabeth
2078   World War II
2079   the basement
2080   BBC television
2081   Britain
2082   English
2083   Britain
2084   late 1930
2085   Britain
2086   The BBC Television Service
2087   the other UK nations
2088   the BBC
2089   1967
2090   two
2091   a few years
2092   The BBC Television Service
2093   a few years
2094   The studio
2095   nearly 1000
2096   the pioneering BBC television series
2097   drama
2098   UK
2099   September 1939
2100   late 1930
2101   30
2102   the commissioning
2103   a 40 kilometres
2104   between December 2004
2105   The different remit
2106   BBC Television
2107   the existing VHF 405-line system
2108   World War II
2109   The BBC Television department
2110   daily
2111   a mechanical camera
2112   The BBC
2113   1967
2114   late 1930
2115   Queen Elizabeth
2116   £85 million
2117   June 1932
2118   two
2119   [original research
2120   a DVD
2121   David Attenborough
2122   a series
2123   1967
2124   the

2448   all parties
2449   other cases
2450   the development
2451   First
2452   citation
2453   Al-Waqidi
2454   Each article
2455   international law
2456   binding greenhouse gas emission limits
2457   Brazil
2458   two
2459   Some treaties
2460   A long treaty
2461   any subsequent Act
2462   compacts
2463   state legislation
2464   one
2465   A treaty
2466   Modern treaties
2467   The signatures
2468   The Brazilian federal constitution
2469   entered
2470   the state
2471   political boundaries
2472   unaccepting
2473   international law
2474   The treaty
2475   Congress
2476   the official legal procedures
2477   A party
2478   one
2479   Prior to 1871
2480   the United Nations
2481   a government
2482   Mabo
2483   the UN
2484   case
2485   a preamble
2486   example
2487   The Brazilian federal constitution
2488   a verb
2489   European colonization
2490   Treaties
2491   citation
2492   The contracting parties' full names
2493   The division
2494   its obligations
2495   The S

2830   Mac OS X
2831   Apple's market share
2832   most PCs
2833   the Mac's market share
2834   retrofits
2835   the use
2836   The Air
2837   Apple
2838   Intel
2839   the original
2840   the same year
2841   response
2842   matters
2843   Macintosh
2844   May
2845   well
2846   John Sculley
2847   only one
2848   May
2849   Steve Jobs
2850   1998
2851   October
2852   The Macintosh
2853   the Macintosh II
2854   $2.5 million
2855   Steve Jobs
2856   May 1990
2857   its new iMac
2858   US$1.5 million
2859   all 39 advertising pages
2860   An estimated 100,000
2861   Jobs
2862   2001
2863   John Dvorak
2864   1997
2865   40
2866   the Macintosh platform
2867   August
2868   1989
2869   The second generation
2870   change
2871   the final Macintosh design
2872   mice
2873   Apple
2874   Apple
2875   early 2001
2876   the primary authors
2877   The installed base
2878   The iMac
2879   Finally
2880   The Macintosh's minimal memory
2881   ClarisWorks
2882   2000
2883   a complete office 

3218   around the 7th century AD
3219   His wife
3220   the new plebeian nobility
3221   The customary offers
3222   The fulfillment
3223   household cult
3224   popular belief deities
3225   Some public rituals
3226   the Graeco-Roman world
3227   Roman poets
3228   The rites
3229   the gods
3230   Vesta
3231   lenient
3232   Prodigies
3233   Funeral and commemorative rites
3234   the clitellates
3235   locomotion
3236   Septa
3237   Ragworms' jaws
3238   65 million years ago
3239   Ragworms
3240   The fluid
3241   The brain
3242   "hairs
3243   the clitellates
3244   find
3245   circulates
3246   3
3247   The small shelly fossil Cloudina
3248   North America
3249   Most annelids
3250   found
3251   The frontmost section
3252   Some polychaetes
3253   Cladistic research
3254   The frontmost section
3255   two
3256   Respiratory pigment
3257   two
3258   The difference
3259   the one-cell deep epidermis
3260   mesothelium
3261   two
3262   the sea-floor
3263   mesothelium
3264   Anneli

#  准确率测试

In [11]:
with open('training.json') as json_data:
    train = json.load(json_data)
    
case_count = 0

for train_case in train:
    question = train_case['question']
    docid = train_case['docid']
    correct_answer = train_case['text']

    # Convert doc into one string, then tokenize sentences
    corpus = ''
    for para in documents[docid]['text']:
        corpus += para + ' '

    # sentence as a document
    raw_docs = nltk.sent_tokenize(corpus)

    # TFIDF
    doc_term_freqs = {}
    for (id, raw_doc) in enumerate(raw_docs):
        term_freqs = extract_term_freqs(raw_doc)
        doc_term_freqs[id] = term_freqs
    M = len(doc_term_freqs)

    doc_freqs = compute_doc_freqs(doc_term_freqs)

    vsm_inverted_index = defaultdict(list)
    for docid, term_freqs in doc_term_freqs.items():
        N = sum(term_freqs.values())
        length = 0

        # find tf*idf values and accumulate sum of squares
        tfidf_values = []
        for term, count in term_freqs.items():
            tfidf = float(count) / N * log(M / float(doc_freqs[term]))
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2

        # normalise documents by length and insert into index
        length = length ** 0.5
        for term, tfidf in tfidf_values:
            # inversion of the indexing, term -> (doc_id, score)
            vsm_inverted_index[term].append([docid, tfidf / length])

    for term, docids in vsm_inverted_index.items():
        docids.sort()

    terms = extract_term_freqs(question)
    results = query_vsm(terms, vsm_inverted_index)


    # Step 2
    # Analyse question type
    qword = get_qword(question)

    # the word after question word, such as 'what value', 'which gender'
    next_token = ''

    qtype = ''

    # dependency parsing
    dep = ''

    # head word
    head = ''

    # head dependency
    head_dep = ''

    # subject, root, object
    nsubj = ''
    ROOT = ''
    dobj = ''

    # yes or no questions have two options
    closed_q_choices = ('', '')

    doc = nlp(question)

    tokens = nltk.word_tokenize(question.lower())

    # get next word
    if qword in tokens:
        if tokens.index(qword) < len(tokens) - 1:
            next_token = tokens[tokens.index(qword) + 1]

    # get structure of sentence
    for token in doc:
        if 'nsubj' in token.dep_:
            nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            dobj = lemmatize(strip_punctuation(token.text))

    # for noun (phrase) questions, get answer dependency
    for chunk in doc.noun_chunks:
        if qword in chunk.text:
            dep = chunk.root.dep_
            head = lemmatize(strip_punctuation(chunk.root.head.text))
            head_dep = chunk.root.head.dep_

    # determine answer type
    if 'stand for' in question or 'abbreviat' in question:
        qtype = 'abrv'

    elif qword in ['who',"who's",'whom','whose']:
        qtype = 'who'

    elif qword == 'when':
        qtype = 'when'

    elif qword in ['where',"where's"]:
        qtype = 'where'

    elif qword in ['how',"how's"]:
        if next_token == 'much':
            qtype = 'MONEY'
        elif next_token == 'many':
            qtype = 'CARDINAL'
        elif next_token == 'long':
            qtype = 'DATE'
        elif next_token in ['far','big','wide','deep','tall','high','fast','heavy']:
            qtype = 'QUANTITY'
        elif next_token in ['old','young']:
            qtype = 'DATE'
        elif next_token in ['does','did','do','have','has','had','should',
                              'can','could','will','would','must']:
            if dobj != '':
                qtype = 'adj'
            else:
                qtype = 'verb'

    elif qword in ['what', "what's", 'which']:

        if 'year'in tokens or \
                'day' in tokens or \
                'month' in tokens or \
                'era' in tokens or \
                'age' in tokens or \
                'century' in tokens or \
                'week' in tokens or \
                'period' in tokens or \
                'dynasty' in tokens:
            qtype = 'DATE'

        elif 'company' in tokens or \
                'organization' in tokens or \
                'organisation' in tokens or \
                'corporation' in tokens or \
                'institution' in tokens or \
                'university' in tokens or \
                'corporation' in tokens or \
                'association' in tokens or \
                'union' in tokens or \
                'agency' in tokens:
            qtype = 'ORG'

        elif 'city' in tokens or \
                'country' in tokens or \
                'state' in tokens or \
                'province' in tokens or \
                'county' in tokens:
            qtype = 'GPE'

        elif 'place' in tokens or \
                'river' in tokens or \
                'mountain' in tokens or \
                'ocean' in tokens or \
                'region' in tokens or \
                'area' in tokens or \
                'sea' in tokens or \
                'lake' in tokens or \
                'continent' in tokens or \
                'location' in tokens or \
                'forest' in tokens or \
                'jungle' in tokens:
            qtype = 'LOC'

        elif 'nationality' in tokens:
            qtype = 'NORP'

        elif 'building' in tokens or \
            'airport' in tokens or \
            'highway' in tokens or \
            'bridge' in tokens or \
            'harbour' in tokens or \
            'harbor' in tokens or \
            'port' in tokens or \
            'dam' in tokens:
            qtype = 'FACILITY'

        elif 'hurricane' in tokens or \
            'battle' in tokens or \
            'war' in tokens:
            qtype = 'EVENT'

        elif 'book' in tokens or \
            'novel' in tokens or \
            'song' in tokens or \
            'music' in tokens or \
            'painting' in tokens:
            qtype = 'WORK_OF_ART'

        elif 'language' in tokens or \
                'speak' in tokens:
            qtype = 'LANGUAGE'

        elif 'percentage' in tokens or 'percent' in tokens:
            qtype = 'PERCENT'

        elif 'value' in tokens or \
                'distance' in tokens or \
                'size' in tokens or \
                'length' in tokens or \
                'depth' in tokens or \
                'height' in tokens or \
                'density' in tokens or \
                'speed' in tokens or \
                'weight' in tokens or \
                'area' in tokens or \
                'temperature' in tokens or \
                'volume' in tokens:
            qtype = 'QUANTITY'

        elif 'number' in tokens:
            qtype = 'CARDINAL'

        elif 'price' in tokens:
            qtype = 'MONEY'

        else:
            # what...do type question
            tokens.remove(next_token)
            if 'do' in tokens:
                qtype = 'verb'
            else:
                qtype = 'noun'

    elif qword == 'why':
        qtype = 'why'

    elif qword in CLOSED_QUESTION_WORDS:
        qtype = 'closed'

        # answer is one of the 'or' options in the question
        if 'or' in tokens:
            index = tokens.index('or')
            prev1 = tokens[index - 1]
            next1 = tokens[index + 1]
            tag_tokens = nltk.pos_tag(tokens)

            tag = tag_tokens[index - 1][1]

            # if answer is a noun
            if tag in ['NN', 'NNP', 'NNS', 'NNPS']:
                for chunk in doc.noun_chunks:
                    if prev1 in chunk.text:
                        first = chunk.text
                    if next1 in chunk.text:
                        second = chunk.text
                closed_q_choices = (first, second)
            else:
                closed_q_choices = (prev1, next1)
        else:
            qtype = 'others'

    # re-rank the 20 sentences
    scores = {}
    for id, _ in results:
        sent = raw_docs[id]
        doc = nlp(sent)

        score = get_overlap(sent, question)

        if qtype == 'who':
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    score += 1

        elif qtype == 'when':
            for ent in doc.ents:
                if ent.label_ == 'TIME' or ent.label_ == "DATE":
                    score += 1

        elif qtype == 'where':
            for ent in doc.ents:
                if ent.label_ == 'GPE' or ent.label_ == "LOC":
                    score += 1

        elif qtype in ['LANGUAGE','WORK_OF_ART','EVENT','NORP','FACILITY',
                       'GPE','DATE','TIME','PERCENT','QUANTITY','CARDINAL',
                     'MONEY','PERSON','ORG','LOC']:
            for ent in doc.ents:
                if ent.label_ == qtype:
                    score += 1

        elif qtype == 'adj':
            for token in doc:
                if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                    score += 1

        elif qtype == 'verb':
            for token in doc:
                if token.dep_ == 'ROOT':
                    score += 1

        elif qtype == 'closed':
            first = closed_q_choices[0]
            second = closed_q_choices[1]

            score += (first in sent) + (second in sent)

        elif qtype == 'why':
            if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:
                score += 1

        scores[id] = score

    rank = {}
    for id, sim in results:
        max_score = scores[max(scores, key=scores.get)]
        if max_score != 0:
            rank[id] = sim * 0.5 + (scores[id] / max_score * 0.5)
        else:
            rank[id] = sim
    
    # sentence with highest rank
    index = max(rank, key=rank.get)
    sent = raw_docs[index]
    doc = nlp(sent)

    
    # find sentence structure
    sent_nsubj = ''
    sent_ROOT = ''
    sent_dobj = ''
    for token in doc:
        if 'nsubj' in token.dep_:
            sent_nsubj = lemmatize(strip_punctuation(token.text))
        if token.dep_ == 'ROOT':
            sent_ROOT = lemmatize(strip_punctuation(token.text))
        if 'dobj' in token.dep_:
            sent_dobj = lemmatize(strip_punctuation(token.text))
            
    # find answer with highest score
    max_score = -1
    answer = ''
    
    if qtype == 'who':
        for np in doc.noun_chunks:
            score = 0
            
            if np in doc.ents:
                for ent in doc.ents:
                    if np.text in ent.text and ent.label_ == 'PERSON':
                            score += 3

            # find NP dependency
            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1

            if np.text.lower() == 'it':
                score = -1
                
            if score > max_score:
                max_score = score
                answer = np.text

    elif qtype == 'when':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'TIME' or ent.label_ == "DATE":
                score += 3
                
            if ent.text not in question:
                score += 1
            
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'where':
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == 'GPE' or ent.label_ == "LOC":
                score += 3

            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text
            
    elif qtype in ['LANGUAGE', 'WORK_OF_ART', 'EVENT', 'NORP', 'FACILITY',
                   'GPE', 'DATE', 'TIME', 'PERCENT', 'QUANTITY', 'CARDINAL',
                   'MONEY', 'PERSON', 'ORG', 'LOC']:
        for ent in doc.ents:
            score = 0
            
            if ent.label_ == qtype:
                score += 3
            
            if ent.text not in question:
                score += 1
                
            if score > max_score:
                max_score = score
                answer = ent.text

    elif qtype == 'abrv':
        abrv = ''
        qdoc = nlp(question)
        for token in qdoc:
            text = token.text
            if len(text) >= 2 and text.isupper() and text.isalpha():
                abrv = text.lower()

        if abrv == '' and 'stand for' in question:
            tokens = question.lower().split(' ')
            abrv = tokens[tokens.index('stand')-1]

        if abrv != '':
            tokens = nltk.word_tokenize(sent)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    phrase = token.lower()
                    initials = phrase[0]

                    while i+k < len(tokens) and tokens[i+k][0].isupper():
                        phrase = phrase + ' ' + tokens[i+k].lower()
                        initials += tokens[i+k][0].lower()
                        k += 1

                    phrase = phrase.strip()
                    if initials == abrv:
                        answer = phrase

        else:
            tokens = nltk.word_tokenize(question)
            for (i, token) in enumerate(tokens):
                if token[0].isupper():
                    k = 1
                    initials = token[0].lower()

                    while i + k < len(tokens) and tokens[i + k][0].isupper():
                        initials += tokens[i + k][0].lower()
                        k += 1

                    if len(initials) >= 2:
                        answer = initials

    elif qtype == 'adj':
        for token in doc:
            score = 0
            
            if 'advmod' in token.dep_ or 'acomp' in token.dep_:
                score += 3

            token_dep = token.dep_
            token_head = lemmatize(strip_punctuation(token.head.text))
            token_head_dep = token.head.dep_

            if token_dep == dep:
                score += 1
            if token_head == head:
                score += 1
            if token_head_dep == head_dep:
                score += 1

            if token.text not in question:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if token.text.lower() == 'it':
                score = -1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'verb':
        for token in doc:
            score = 0

            if token.dep_ == 'ROOT':
                score += 1

            if lemmatize(strip_punctuation(token.text)) not in \
                    [lemmatize(strip_punctuation(s)) for s in nltk.word_tokenize(question)]:
                score += 1

            if strip_punctuation(token.text).strip().lower() not in stop:
                score += 1

            if score > max_score:
                max_score = score
                answer = token.text

    elif qtype == 'closed':
        first = closed_q_choices[0]
        second = closed_q_choices[1]

        # whether each option appears (and is negates)
        appear1 = False
        appear2 = False
        negate1 = False
        negate2 = False
        neg_count = 0
        tokens = nltk.word_tokenize(raw_docs[id])

        for (index, token) in enumerate(tokens):
            if token == 'not' or "n't" in token:
                neg_count += 1

                if index+1 < len(tokens):
                    if tokens[index+1] == first:
                        negate1 = True
                    if tokens[index+1] == second:
                        negate2 = True

            if token == first:
                appear1 = True
            if token == second:
                appear2 = True

        possible_answer = ''
        if appear1 and not appear2:
            if neg_count % 2 == 1:
                possible_answer = second
            else:
                possible_answer = first

        elif appear2 and not appear1:
            if neg_count % 2 == 0:
                possible_answer = second
            else:
                possible_answer = first

        elif appear1 and appear2:
            if negate1 and not negate2:
                possible_answer = second
            elif negate2 and not negate1:
                possible_answer = first
            else:
                possible_answer = second

        if possible_answer != '':
            score += 5
            if score > max_score:
                max_score = score
                answer = possible_answer

    elif qtype == 'why':

        possible_answer = ''
        score = 0

        if 'reason' in sent or 'because' in sent or 'due to' in sent or 'since' in sent or 'for' in sent:

            if 'because of' in sent:
                score += 3
                index = sent.index('because of')
                substr = sent[index+11:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break

            elif 'because' in sent:
                score += 3
                index = sent.index('because')
                substr = sent[index + 8:]
                possible_answer = substr

            elif 'due to' in sent:
                score += 3
                index = sent.index('due to')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'reason' in sent:
                score += 2
                index = sent.index('reason')
                substr = sent[index+7:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    index = substr.find('is')
                    if index != -1:
                        possible_answer = substr[index+3]
                    else:
                        index = substr.find('was')
                        if index != -1:
                            possible_answer = substr[index+4]
                        else:
                            possible_answer = sent[sent.index('reason'):]

            elif 'for' in sent:
                score += 1
                index = sent.index('for')
                substr = sent[index + 4:]
                span = nlp(substr)
                for chunk in span.noun_chunks:
                    possible_answer = chunk.text
                    break
                if possible_answer == '':
                    possible_answer = substr

            elif 'since' in sent:
                score += 1
                index = sent.index('since')
                substr = sent[index + 6:]
                possible_answer = substr

            if possible_answer != '' and score > max_score:
                answer = possible_answer
                max_score = score

    # if answer not found, find noun phrases
    if answer == '':
        for np in doc.noun_chunks:
            score = 0

            np_dep = np.root.dep_
            np_head = lemmatize(strip_punctuation(np.root.head.text))
            np_head_dep = np.root.head.dep_

            if np_dep == dep:
                score += 1
            if np_head == head:
                score += 1
            if np_head_dep == head_dep:
                score += 1

            if np.text not in question:
                score += 1

            if strip_punctuation(np.text).strip().lower() not in stop:
                score += 1

            if np.text.lower() == 'it':
                score = 0

            if score > max_score:
                max_score = score
                answer = np.text
    if answer != correct_answer:
        print(case_count)
        print('Question: ',question)
        print('Predicted: ',strip_punctuation(answer).strip().lower())
        print('Correct Answer: ',correct_answer)
#         print('Support sentence: ')
#         for id, _ in results:
#             sent = raw_docs[id]   
#             print(sent+'\n')
        print('\n')
    
    case_count += 1


0
Question:  A kilogram could be definined as having a Planck constant of what value?
Predicted:  several tens
Correct Answer:  6966662606895999999♠6.62606896×10−34 j⋅s


1
Question:  What is the shape of the object that establishes the base unit of the kilogram?
Predicted:  the planck constant
Correct Answer:  cylinder


2
Question:  What example is given as another paired relationship of uncertainly related to standard deviation?
Predicted:  modern terms
Correct Answer:  time vs. energy


3
Question:  What does the Planck Constant refer to?
Predicted:  r2
Correct Answer:  quantum of action


5
Question:  What scientist first studied black body radiation?
Predicted:  max planck
Correct Answer:  kirchhoff


6
Question:  Who helped to give the correct quantization rules for electrons in 1926?
Predicted:  the energy
Correct Answer:  schrödinger


8
Question:  What is required to exist in classical statistics mechanics?
Predicted:  classical statistical mechanics
Correct Answer:  h


9
Qu

63
Question:  What alloy is the base unit of the kilogram made from?
Predicted:  the planck constant
Correct Answer:  platinum–iridium


65
Question:  Who modeled the atom in 1913, challenging Rutherford's model?
Predicted:  niels bohr
Correct Answer:  niels bohr


67
Question:  What is maximized as a result of a black object absorbing all the light that hits it?
Predicted:  the object
Correct Answer:  thermal light emission


68
Question:  What is the energy of a photon?
Predicted:  possible new definitions
Correct Answer:  6981358000000000000♠3.58×10−19 j


69
Question:  What color is hotter than "red hot"?
Predicted:  the colour
Correct Answer:  white hot


70
Question:  Planck studied what problem posed originally by Kirchhoff?
Predicted:  the last years
Correct Answer:  black-body radiation


71
Question:  What is the name for the amount of energy transfered by a wave in a given time?
Predicted:  the energy
Correct Answer:  intensity


72
Question:  Einstein's paper on the photoel

132
Question:  The PV industry has seen drops in module prices since what year?
Predicted:  since 2008
Correct Answer:  2008


133
Question:  As of 2012, what accounts for about half of new nameplate electrical capacity?
Predicted:  renewable energy accounts
Correct Answer:  renewable energy


134
Question:  What is the most widely used form of renewable energy?
Predicted:  16 percent
Correct Answer:  hydroelectricity


135
Question:  In Spain, wind power accounts for what percentage of electricity generated?
Predicted:  approximately 19
Correct Answer:  9 %


136
Question:  Ban Ki-moon states that renewable energy has the ability to lift the poorest nations to new levels of prosperity?
Predicted:  united nations secretarygeneral ban kimoon
Correct Answer:  renewable energy


138
Question:  How much was the total investment in renewable energy in 2012?
Predicted:  244 billion
Correct Answer:  $ 244 billion


139
Question:  What states that EU Member States must ensure that the origin o

210
Question:  What rating did 'Temple of Doom' receive?
Predicted:  spite
Correct Answer:  pg


211
Question:  When will 'The BFG' be in most theaters?
Predicted:  may 2016
Correct Answer:  july 1 , 2016


212
Question:  Who made Spielberg a knight?
Predicted:  spielberg
Correct Answer:  jacques chirac




ValueError: max() arg is an empty sequence