# Project

Team Name:     XiaoJiLingGui

Kaggle Login:  siqiguo

Student Name:  Siqi Guo

Student ID:    743053

Kaggle Login: yifan66

Student Name: Yifan Wang

Student ID: 784386

Python version used: 2.7

In [133]:
import nltk
import json, string
from math import log
from collections import defaultdict, Counter
import csv
from nltk.corpus import wordnet as wn
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger

stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() 
sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')

DOCFNAME = 'project_files/documents.json'

WHAT = 'what'
WHO = 'who'
HOW = 'how'
WHERE = 'where'
WHEN = 'when'
WHICH = 'which'
NAME = 'name'
WHY = 'why'
WHOM = 'whom'
BINARY = 'binary'
POLAR = 'polar'

PUNCTUATION = string.punctuation.translate(None, '$%')

WHAT_CD_WORDS = ['year', 'date', 'percentage', 'value', 'margin']

NER = 0

POS = 1

NOT_WORDS = ['not', "n't"]

REASON_WORDS = ['becauce', 'since', 'due to', 'thanks to', 'owing to', 
                'on account of', 'result', 'for']

HOW_SUBCLASS = {'many': [[], ['CD', 'FW']], 
                'long': [['DURATION'], ['CD', 'FW']], 
                'much': [['MONEY'], ['$', 'CD', 'FW', 'NN']], 
                'far': [[], ['CD', 'FW']], 
                'tall': [[], ['CD', 'FW']], 
                'rich': [['MONEY'], ['$', 'CD', 'FW']], 
                'large': [[], ['CD', 'FW']], 
                }

QUERY_CLASS = {WHAT: [[], ['NN', 'FW']],
               # To optimise, not include WHO - 'ORGANIZATION' in the tag
               WHO: [['PERSON'], ['NN', 'FW']], 
               WHERE: [['LOCATION'], ['NN', 'FW']],
               WHEN: [['DATE', 'TIME'], ['CD', 'NN', 'FW']],
               WHICH: [['PERSON', 'LOCATION', 'DATE', 'TIME', 'ORGANIZATION'], ['NN', 'FW']],
               NAME: [['PERSON', 'LOCATION', 'ORGANIZATION'], ['NN', 'FW']],
               #'why': reason, find "reason word" 
               WHOM: [['PERSON'], ['NN', 'FW']],
               
               # LOCATION, sometimes 'CD' also included as street number, unit number 
               # what: if and not in query terms, then 
              
              }

inverted_index_dict = {} # Store the processed documents' inverted index to improve the effiency

In [134]:
ner_dir = 'stanford-ner-2018-02-27/'
ner_jarfile = ner_dir + 'stanford-ner.jar'
ner_modelfile = ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_filename=ner_modelfile, path_to_jar=ner_jarfile)

pos_dir = 'stanford-postagger-2018-02-27/'
pos_modelfile = pos_dir + 'models/english-bidirectional-distsim.tagger'
pos_jarfile = pos_dir + 'stanford-postagger.jar'
pos_tagger = StanfordPOSTagger(model_filename=pos_modelfile, path_to_jar=pos_jarfile)

In [135]:
def lemmatize_word(word):
    word = word.lower()
#     print word
    adj_word = get_adj(word)
    if adj_word != None:
        return adj_word
    lemma_n = lemmatizer.lemmatize(word, 'n')
    lemma_v = lemmatizer.lemmatize(word, 'v')
    
    # If both change, return the shorter one
    # If only one change, return the changed one
    # If neither change, return the original word
    
    if lemma_n != word and lemma_v != word:    
        if len(lemma_n) < len(lemma_v):
            return lemma_n
        else:
            return lemma_v
    elif lemma_n == word and lemma_v == word:
        return word
    elif lemma_n != word:
        return lemma_n
    elif lemma_v != word:
        return lemma_v

In [136]:
def get_adj(word):
    for syn in wn.synsets(word):
        syn_split = syn.name().split('.')
        if syn_split[0] == word and syn_split[1] == 'r':
            for lemmas in syn.lemmas(): # all possible lemmas
                if lemmas.name() == word:
                    if lemmas.pertainyms() == []:
                        return None
                    return lemmas.pertainyms()[0].name()


In [137]:
# lemmatize_word('lakes')

Select the most relevant paragraph

In [138]:
def extract_query(query):
    tokenized = nltk.word_tokenize(query)
    tokenized = [x.lower() for x in tokenized]
    q_type = None
    
    if WHAT in tokenized:
        q_type = WHAT
                
    elif WHO in tokenized:
        q_type = WHO
        
    elif HOW in tokenized: 
        q_type = HOW
        # Since HOW queries have many answer types, furthrer analysis required
#         how_index = tokenized.index(HOW)
            
#         how_sub = tokenized[how_index + 1]
        
#         if how_sub in HOW_SUBCLASS.keys():
#             tokenized.pop(how_index + 1)
#             q_type = how_sub
#         else:

    elif WHERE in tokenized:
        q_type = WHERE
     
    elif WHEN in tokenized:
        # Use SUTime
        q_type = WHEN
    
    elif WHICH in tokenized:
        q_type = WHICH
        
    elif WHY in tokenized:
        q_type = WHY
    
    elif WHOM in tokenized:
        q_type = WHOM
    
    elif NAME in tokenized:
        q_type = NAME
    else:
        if 'or' in tokenized:
            q_type = BINARY
        else:
            q_type = POLAR
        
    terms = []
    for token in tokenized:
        if token not in stopwords and token not in PUNCTUATION: 
            terms.append(lemmatize_word(token))
            
    return terms, q_type

In [139]:
def select_document(fname, docid):
    with open(fname) as json_data:
        infile = json.load(json_data)
        for doc in infile:
            if doc['docid'] == docid:
                text = doc['text']
    return text
        

def parse_paragraphs(text):
    identifier = 0
    for para in text:
        yield (identifier, para)
        identifier += 1

def extract_term_freqs(para):
    tfs = Counter()
    for token in nltk.word_tokenize(para):
        if token not in stopwords and token not in PUNCTUATION: # 'in' and 'not in' operations are much faster over sets that lists
            tfs[lemmatize_word(token)] += 1
    return tfs

def compute_para_freqs(para_term_freqs):
    pfs = Counter()
    for tfs in para_term_freqs.values():
        for term in tfs.keys():
            pfs[term] += 1
    return pfs

def get_index(para_term_freqs, para_freqs, M):
    vsm_inverted_index = defaultdict(list)
    for pid, term_freqs in para_term_freqs.items():
        N = sum(term_freqs.values())
        length = 0

        # find tf*idf values and accumulate sum of squares 
        tfidf_values = []
        for term, count in term_freqs.items():
            tfidf = float(count) / N * log(M / float(para_freqs[term]))
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2

        # normalise documents by length and insert into index
        length = length ** 0.5
        for term, tfidf in tfidf_values:
            # note the inversion of the indexing, to be term -> (doc_id, score)
            vsm_inverted_index[term].append([pid, tfidf / length])

    # ensure posting lists are in sorted order (less important here cf above)
    for term, pids in vsm_inverted_index.items():
        pids.sort()
    return vsm_inverted_index
    
def query_vsm(query, index, k=4): # Only return the most revelant paragraph
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for pid, weight in postings:
            accumulator[pid] += weight
    return accumulator.most_common(k)

def get_freq_dict(raw_paras):
    para_term_freqs = {}
    for pid, para in raw_paras:
        term_freqs = extract_term_freqs(para)
        para_term_freqs[pid] = term_freqs
    M = len(para_term_freqs)
    return para_term_freqs, M


In [140]:
def get_para(docid, query):
    text = select_document(DOCFNAME, docid)
    raw_paras = list(parse_paragraphs(text))
    if len(raw_paras) == 1: # If there's only one paragraph in the document
        return 0, raw_paras[0][1]
    if docid not in inverted_index_dict.keys(): # check whether the document has been processed before
        para_term_freqs, M = get_freq_dict(raw_paras)
        
        para_freqs = compute_para_freqs(para_term_freqs)
        inverted_index = get_index(para_term_freqs, para_freqs, M)
        inverted_index_dict[docid] = inverted_index
    else:
        inverted_index = inverted_index_dict[docid]
    results = query_vsm(query, inverted_index)
    if results == []:
        return 0, raw_paras[0][1]
    else:
        return results[0][0], raw_paras[results[0][0]][1]

In [141]:
def get_sent(result_para, query):
    sentences = sent_segmenter.tokenize(result_para)
    raw_sents = list(parse_paragraphs(sentences))
    if len(raw_sents) == 1: # If there's only one sentence in the paragraph
        return raw_sents[0][1]
    sent_term_freqs, M = get_freq_dict(raw_sents)
    sent_freqs = compute_para_freqs(sent_term_freqs)
    results = query_vsm(query, get_index(sent_term_freqs, sent_freqs, M))
    if results == []:
        return raw_sents[0][1]
    else:
        return raw_sents[results[0][0]][1]

In [142]:
def get_ans(my_sent, query, q_type):
    
    ans = ''
    
    if q_type == WHAT:
        ans = get_what_ans(my_sent, query)
    elif q_type == WHO:
        ans = get_who_ans(my_sent, query)
    elif q_type == HOW:  
        ans = get_how_ans(my_sent, query)
    elif q_type == WHERE:
        ans = get_where_ans(my_sent, query)
    elif q_type == WHEN:
        ans = get_when_ans(my_sent, query)
    elif q_type == WHICH:
        ans = get_which_ans(my_sent, query)
    elif q_type == WHY:
        ans = get_why_ans(my_sent, query)
    elif q_type == WHOM:
        ans = get_whom_ans(my_sent, query)
    elif q_type == NAME:
        ans = get_name_ans(my_sent, query)
    elif q_type == BINARY:
        ans = get_binary_ans(my_sent, query)
    elif q_type == POLAR:
        ans = get_polar_ans(my_sent, query)  
        
    if ans == '':
        return my_sent
    else:
        return ans



In [143]:
def get_what_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
    
    pos_tags = QUERY_CLASS[WHAT][POS][:]
#     print pos_tags
    
    # special case: "what color"
    if 'color' in query:
        pos_tags.append('JJ')

    # special case: what for an answer include numbers
    for word in query:
#         print pos_tags
        if word in WHAT_CD_WORDS:
            pos_tags.append('CD')
            pos_tags.remove('NN')
            break
    
    ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)

    if 'percentage' in query and '%' in sent_splt_lemma:
        ans_candidate.append(sent_splt_lemma.index('%'))
    
    return get_ans_string(sent_splt, ans_candidate)
    
def get_who_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    ner_candidate = tag_sent(sent_splt, sent_splt_lemma, query, NER)
    
    ner_tags = QUERY_CLASS[WHO][NER][:]
    
    ans_candidate = get_ans_candidate(ner_candidate, ner_tags, NER)
    
    if ans_candidate == []:
        pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
        pos_tags = QUERY_CLASS[WHO][POS][:]
        ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)
        
    return get_ans_string(sent_splt, ans_candidate)
    
def get_where_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    ner_candidate = tag_sent(sent_splt, sent_splt_lemma, query, NER)
    
    ner_tags = QUERY_CLASS[WHERE][NER][:]
    
    ans_candidate = get_ans_candidate(ner_candidate, ner_tags, NER)
    
    if ans_candidate == []:
        pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
        pos_tags = QUERY_CLASS[WHERE][POS][:]
        ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)
        
    return get_ans_string(sent_splt, ans_candidate)
    
def get_when_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    ner_candidate = tag_sent(sent_splt, sent_splt_lemma, query, NER)
    
    ner_tags = QUERY_CLASS[WHEN][NER][:]
    
    ans_candidate = get_ans_candidate(ner_candidate, ner_tags, NER)
    
    if ans_candidate == []:
        pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
        pos_tags = QUERY_CLASS[WHEN][POS][:]
        ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)
        
    return get_ans_string(sent_splt, ans_candidate)
    
def get_which_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    ner_candidate = tag_sent(sent_splt, sent_splt_lemma, query, NER)
    
    ner_tags = QUERY_CLASS[WHICH][NER][:]
    
    ans_candidate = get_ans_candidate(ner_candidate, ner_tags, NER)
    
    if ans_candidate == []:
        pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
        pos_tags = QUERY_CLASS[WHICH][POS][:]
        ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)
        
    return get_ans_string(sent_splt, ans_candidate)

    
def get_why_ans(my_sent, query):
    
    for word in REASON_WORDS:
        if word in my_sent:
            my_sent = my_sent.split(word,1)[1] 
            
    sent_splt = nltk.word_tokenize(my_sent)
    
    # special case - "on one's account" / "on this account"
    if ('account' in sent_splt and 'on' in sent_splt 
        and (sent_splt.index('account') == sent_splt.index('on')+2 or 
             sent_splt.index('account') == sent_splt.index('on')+3)):
        
        sent_splt.remove('account')
        sent_splt.remove('on')
        
    
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    ans_candidate = []
    for i in range(len(sent_splt_lemma)):
        word = sent_splt_lemma[i]
        if (word not in query and word not in stopwords and 
            word not in PUNCTUATION):
            ans_candidate.append(i)

    return get_ans_string(sent_splt, ans_candidate)
    
def get_whom_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)

    if pos_candidate.has_key('IN'):
        prep_index = pos_candidate['IN'].sort()[0]
        sent_splt = sent_splt[prep_index+1:]
        sent_splt_lemma = sent_splt_lemma[prep_index+1:]
        pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)

    ner_candidate = tag_sent(sent_splt, sent_splt_lemma, query, NER)
    
    ner_tags = QUERY_CLASS[WHOM][NER][:]
    
    ans_candidate = get_ans_candidate(ner_candidate, ner_tags, NER)
    
    if ans_candidate == []:
        pos_tags = QUERY_CLASS[WHOM][POS][:]
        ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)
        
    return get_ans_string(sent_splt, ans_candidate)
    
def get_name_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
    
    pos_tags = QUERY_CLASS[NAME][POS][:]
    
    ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS)

    return get_ans_string(sent_splt, ans_candidate)
    
    
def get_binary_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    print sent_splt
    if 'true' in query:
        for word in NOT_WORDS:
            if word in sent_splt:
                return 'false'
        return 'true'
    
    pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
    
    ans_candidate = get_ans_candidate(pos_candidate, [], POS)

    return get_ans_string(sent_splt, ans_candidate)


def get_polar_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
    
    for word in NOT_WORDS:
        if word in sent_splt:
            return 'no'
        return 'yes'

In [144]:
def get_how_ans(my_sent, query):
    sent_splt = nltk.word_tokenize(my_sent)
    sent_splt_lemma = [lemmatize_word(x) for x in sent_splt]
        
#     print sent_splt, sent_splt_lemma
    
    word_after_how = None
    
    for word in HOW_SUBCLASS.keys():
        if word in query:
            word_after_how = word
            break
    
    if word_after_how:
                
        ner_candidate = tag_sent(sent_splt, sent_splt_lemma, query, NER)
        ner_tags = HOW_SUBCLASS[word_after_how][NER]
        ans_candidate = get_ans_candidate(ner_candidate, ner_tags, NER)
        
        
        if ans_candidate == []:

            pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
#             print("POS_candidate")
#             print pos_candidate
            
            pos_tags = HOW_SUBCLASS[word_after_how][POS]
#             print("POS TAGS")
#             print pos_tags
            
            ans_candidate = get_ans_candidate(pos_candidate, pos_tags, POS) 
            
    else:
        pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)

        if pos_candidate.has_key('IN'):
            prep_index = pos_candidate['IN'].sort()[0]
            sent_splt = sent_splt[prep_index+1:]
            sent_splt_lemma = sent_splt_lemma[prep_index+1:]
            pos_candidate = tag_sent(sent_splt, sent_splt_lemma, query, POS)
        
        ans_candidate = get_ans_candidate(pos_candidate, [], POS)
    
#     print ans_candidate
    return get_ans_string(sent_splt, ans_candidate)

def get_ans_candidate(candidate, tags, flag):
    ans = []
    
    for tag in tags:
#         print tag
        if candidate.has_key(tag):
#             print 'yes'
            ans += candidate[tag]
            # flag is NER (=0), break, get the words with one tag
            # flag is POS (=1), not break, get words with all tags
            if not flag:
                break
                
    # flag is NER (=0), if no ans, return []
    # flag is POS (=1), if no ans, return all words regradless of tags
    if flag and ans == []:
        for lst in candidate.values():
            ans += lst
        
    return ans

In [145]:
def output_result(final_ans):
    with open('names.csv', 'w') as csvfile:
        fieldnames = ['id', 'answer']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for each_dict in final_ans:
            writer.writerow({'id': each_dict['id'], 'answer': each_dict['answer']})
    csvfile.close()

def output_result_train(final_ans):
    with open('names.csv', 'w') as csvfile:
        fieldnames = ['f_score', 'ans', 'my_ans', 'prec', 'recall', 'Q', 'para', 'docid']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for each_dict in final_ans:
            writer.writerow({'f_score': each_dict['f_score'],
                             'ans': each_dict['ans'], 
                             'my_ans': each_dict['my_ans'], 
                             'prec': each_dict['prec'], 
                             'recall': each_dict['recall'], 
                             'Q': each_dict['Q'], 
                             'para': each_dict['para'], 
                             'docid': each_dict['docid']})
    csvfile.close()

In [146]:
def tag_sent(sent_splt, sent_splt_lemma, query, tag_type):
    
    if tag_type == NER:
        tagger = ner_tagger
    else:
        tagger = pos_tagger
    
    tagged = tagger.tag(sent_splt)
    
#     print tagged
    
    candidates = defaultdict(list)
    
    for i in range(len(sent_splt_lemma)):
        word = sent_splt_lemma[i]
#         print word
        if word not in query and word not in stopwords and word not in PUNCTUATION:
#                 print("yes")
                if tag_type == NER:
                    candidates[tagged[i][1]].append(i)
                else:
                    candidates[tagged[i][1][:2]].append(i)
    return candidates
   
def get_ans_string(sent_splt, ans_candidate):
    words = set([sent_splt[i].lower() for i in ans_candidate])
    return ' '.join(words)

**Below: get accuracy**

In [None]:
# def get_para_accuracy(fname):
#     counter = float(0)
#     with open(fname) as json_data:
#         infile = json.load(json_data)
#         length = len(infile)
#         for dic in infile:
#             question = dic['question']
#             ans = dic['text']
#             para = dic['answer_paragraph']
#             docid = dic['docid']    
            
#             query, q_type = extract_query(question)
            
# #             print question[:20], query
            
#             para_ord, result = get_para(docid, query)
            
#             if para_ord == para:
#                 counter += 1

#     return counter/length

# develfname = 'project_files/mytest.json' # Need to change later
# acc = get_para_accuracy(develfname)
# print acc

In [None]:
def get_accuracy_train(fname):
    counter = float(0)
    final_ans = []
    with open(fname) as json_data:
        infile = json.load(json_data)
        length = len(infile)
        for dic in infile:
            
            # training and devel
            question = dic['question']
            ans = dic['text']
            para = dic['answer_paragraph']
            docid = dic['docid']    
            
            query, q_type = extract_query(question)            
            my_para_ord, my_para = get_para(docid, query)
                        
            # my_sent is raw sent in the paragraph
            my_sent = get_sent(my_para, query)
            my_ans = get_ans(my_sent, query, q_type)
            
            # calculate precision, recall and accuracy 
            tp = 0.0
            fp = 0.0
            fn = 0.0
            
            for word in ans:
                if word in my_ans:
                    tp += 1
                else:
                    fn += 1
            for word in my_ans:
                if word not in ans:
                    fp += 1
            
            # prec = TP / (TP + FP) 
            prec = tp / (tp+fp)
            # recall = TP / (TP + FN) 
            recall = tp / (tp+fn)
            if prec+recall == 0:
                f_score = 0
            else:
                f_score = 2*prec*recall / (prec+recall)
            # end
            
            rst_dict = {'f_score': f_score,
                        'ans': ans, 
                        'my_ans': my_ans, 
                        'prec': prec, 
                        'recall': recall, 
                        'Q': question, 
                        'para': para, 
                        'docid': docid}
            
            print("F_SCORE: "+ str(f_score) + ' [ANS] ' + ans + ' [MY_ANS] ' + my_ans + ' [Q] ' + question)
            
            final_ans.append(rst_dict)
    output_result_train(final_ans)
    

develfname = 'project_files/devel_some.json' # Need to change later
acc = get_accuracy_train(develfname)

F_SCORE: 0.740740740741 [ANS] june 16 , 1911 [MY_ANS] 1911 four 16 [Q] On what date did the companies that became the Computing-Tabulating-Recording Company get consolidated?
F_SCORE: 1.0 [ANS] 5 % [MY_ANS] % 5 [Q] What percentage of its desktop PCs does IBM plan to install Open Client on to?
F_SCORE: 0.727272727273 [ANS] 1946 [MY_ANS] 1964 18 1946 [Q] What year did IBM hire its first black salesman?
F_SCORE: 0.122448979592 [ANS] spss [MY_ANS] use agreement weather licensing data channel [Q] IBM made an acquisition in 2009, name it.
F_SCORE: 0.769230769231 [ANS] universal product code [MY_ANS] laboratories research [Q] This IBM invention is known by the acronym UPC, what is the full name?
F_SCORE: 0.545454545455 [ANS] second largest [MY_ANS] terms ibm capitalization revenue u.s. nineteenth market [Q] In 2012 Fortune ranked the largest US firms by number employees, what was IBMs rank?
F_SCORE: 0.666666666667 [ANS] 4,100 gallons [MY_ANS] six 4,100 two one [Q] How many gallons of liquid c

In [None]:
# def get_final_result(fname):
#     counter = float(0)
#     final_ans = []
#     with open(fname) as json_data:
#         infile = json.load(json_data)
#         length = len(infile)
#         for dic in infile:  
            
#             #testing
#             question = dic['question']
#             docid = dic['docid'] 
#             # question id
#             qid = dic['id']
            
#             query, q_type = extract_query(question)
                        
#             my_para_ord, my_para = get_para(docid, query)
                        
#             # my_sent is raw sent in the paragraph
#             my_sent = get_sent(my_para, query)
            
#             my_ans = get_ans(my_sent, query, q_type)
            
#             print qid
            
#             final_ans.append({'id': qid, 'answer': my_ans})
            
#     output_result(final_ans)
    
# develfname = 'project_files/testing.json' # Need to change later
# get_final_result(develfname)