# Project

Team Name:     XiaoJiLingGui

Kaggle Login:  siqiguo

Student Name:  Siqi Guo

Student ID:    743053

Kaggle Login: yifan66

Student Name: Yifan Wang

Student ID: 784386

Python version used: 2.7

In [22]:
import nltk
import requests, tarfile
import json, string
from math import log
from collections import defaultdict, Counter


stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() 
sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')

DOCFNAME = 'project_files/documents.json'

WHAT = 'what'
WHO = 'who'
HOW = 'how'
WHERE = 'where'
WHEN = 'when'
WHICH = 'which'
NAME = 'name'
WHY = 'why'
WHOM = 'whom'
BINARY = 'binary'
POLAR = 'polar'

HOW_SUBCLASS = {'many': [[], ['CD']], 
                'long': [['DURATION'], ['CD']], 
                'much': [['MONEY'], ['CD', 'FW', 'NN']], 
                'far': [[], ['CD']], 
                'tall': [[], ['CD']], 
                'rich': [['MONEY'], ['CD']], 
                'large': [[], ['CD']], 
                }

QUERY_CLASS = {WHAT: [[], ['NN']], 
               WHO: [['PERSON', 'ORGANIZATION'], ['NN']], 
               WHERE: [['LOCATION'], ['NN']],
               WHEN: [['DATE', 'TIME'], ['CD', 'NN']],
               WHICH: [['PERSON', 'LOCATION', 'DATE', 'TIME', 'ORGANIZATION'], ['NN']],
               NAME: [['PERSON', 'LOCATION', 'ORGANIZATION'], ['NN']],
               #'why': reason, find "reason word" 
               WHOM: [['PERSON', 'ORGANIZATION'], ['NN']], 
               
               # LOCATION, sometimes 'CD' also included as street number, unit number 
               # what: if and not in query terms, then 
              
              }

inverted_index_dict = {} # Store the processed documents' inverted index to improve the effiency

In [23]:
def lemmatize_word(word):
    word = word.lower()
    lemma_n = lemmatizer.lemmatize(word,'n')
    lemma_v = lemmatizer.lemmatize(word,'v')
    
    # If both change, return the shorter one
    # If only one change, return the changed one
    # If neither change, return the original word
    
    if lemma_n != word and lemma_v != word:    
        if len(lemma_n) < len(lemma_v):
            return lemma_n
        else:
            return lemma_v
    elif lemma_n == word and lemma_v == word:
        return word
    elif lemma_n != word:
        return lemma_n
    elif lemma_v != word:
        return lemma_v

Select the most relevant paragraph

In [24]:
def extract_query(query):
    tokenized = nltk.word_tokenize(query)
    tokenized = [x.lower() for x in tokenized]
    q_type = None
    
    if WHAT in tokenized:
        q_type = WHAT
                
    elif WHO in tokenized:
        q_type = WHO
        
    elif HOW in tokenized:  
        # Since HOW queries have many answer types, furthrer analysis required
        how_index = tokenized.index(HOW)
            
        how_sub = tokenized[how_index + 1]
        
        if how_sub in HOW_SUBCLASS.keys():
            tokenized.pop(how_index + 1)
            q_type = how_sub
        else:
            q_type = HOW

    elif WHERE in tokenized:
        q_type = WHERE
     
    elif WHEN in tokenized:
        # Use SUTime
        q_type = WHEN
    
    elif WHICH in tokenized:
        q_type = WHICH
        
    elif WHY in tokenized:
        q_type = WHY
    
    elif WHOM in tokenized:
        q_type = WHOM
    
    elif NAME in tokenized:
        q_type = NAME
    else:
        if 'or' in tokenized:
            q_type = BINARY
        else:
            q_type = POLAR
        
    terms = []
    for token in tokenized:
        if token not in stopwords and token not in string.punctuation: 
            terms.append(lemmatize_word(token))
            
    return terms, q_type 

In [25]:
def select_document(fname, docid):
    with open(fname) as json_data:
        infile = json.load(json_data)
        for doc in infile:
            if doc['docid'] == docid:
                text = doc['text']
    return text

# def extract_query_terms(query, q_type):
#     q_terms = extract_terms(query)
#     # Since HOW queries have many answer types, furthrer analysis required
#     if q_type == 'HOW':
        

def parse_paragraphs(text):
    identifier = 0
    for para in text:
        yield (identifier, para)
        identifier += 1

# def extract_terms(para):
#     terms = set()
#     for token in nltk.word_tokenize(para):
#         if token not in stopwords: # 'in' and 'not in' opera]tions are much faster over sets that lists
#             terms.add(lemmatize_word(token))
#     return terms

def extract_term_freqs(para):
    tfs = Counter()
    for token in nltk.word_tokenize(para):
        if token not in stopwords and token not in string.punctuation: # 'in' and 'not in' operations are much faster over sets that lists
            tfs[lemmatize_word(token)] += 1
    return tfs

def compute_para_freqs(para_term_freqs):
    pfs = Counter()
    for tfs in para_term_freqs.values():
        for term in tfs.keys():
            pfs[term] += 1
    return pfs

def get_index(para_term_freqs, para_freqs, M):
    vsm_inverted_index = defaultdict(list)
    for pid, term_freqs in para_term_freqs.items():
        N = sum(term_freqs.values())
        length = 0

        # find tf*idf values and accumulate sum of squares 
        tfidf_values = []
        for term, count in term_freqs.items():
            tfidf = float(count) / N * log(M / float(para_freqs[term]))
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2

        # normalise documents by length and insert into index
        length = length ** 0.5
        for term, tfidf in tfidf_values:
            # note the inversion of the indexing, to be term -> (doc_id, score)
            vsm_inverted_index[term].append([pid, tfidf / length])

    # ensure posting lists are in sorted order (less important here cf above)
    for term, pids in vsm_inverted_index.items():
        pids.sort()
    return vsm_inverted_index
    
def query_vsm(query, index, k=4): # Only return the most revelant paragraph
    accumulator = Counter()
    for term in query:
        postings = index[term]
        for pid, weight in postings:
            accumulator[pid] += weight
    return accumulator.most_common(k)

def get_freq_dict(raw_paras):
#     paras = {}
#     for pid, p in raw_paras:
#         terms = extract_terms(p)
#         paras[pid] = terms

    para_term_freqs = {}
    for pid, para in raw_paras:
        term_freqs = extract_term_freqs(para)
        para_term_freqs[pid] = term_freqs
    M = len(para_term_freqs)
    return para_term_freqs, M


In [31]:
def get_para(docid, query):
    text = select_document(DOCFNAME, docid)
    raw_paras = list(parse_paragraphs(text))
    if docid not in inverted_index_dict.keys(): # check whether the document has been processed before
#         text = select_document(DOCFNAME, docid)
#         raw_paras = list(parse_paragraphs(text))

        para_term_freqs, M = get_freq_dict(raw_paras)
        para_freqs = compute_para_freqs(para_term_freqs)
        inverted_index = get_index(para_term_freqs, para_freqs, M)
        inverted_index_dict[docid] = inverted_index
    else:
        inverted_index = inverted_index_dict[docid]
    results = query_vsm(query, inverted_index)
    print results
    return raw_paras[results[0][0]][1]

In [27]:
def get_sent(result_para, query):
    sentences = sent_segmenter.tokenize(result_para)
    raw_sents = list(parse_paragraphs(sentences))
    sent_term_freqs, M = get_freq_dict(raw_sents)
    sent_freqs = compute_para_freqs(sent_term_freqs)
    results = query_vsm(query, get_index(sent_term_freqs, sent_freqs, M))
    return raw_sents[results[0][0]][1]

In [32]:
def get_accuracy(fname):
    counter = float(0)
    with open(fname) as json_data:
        infile = json.load(json_data)
        length = len(infile)
        for dic in infile:
            question = dic['question']
            ans = dic['text']
            para = dic['answer_paragraph']
            docid = dic['docid']    
            
            query, q_type = extract_query(question)
            
            print question[:20], query
            
            result = get_para(docid, query)
            
            if result == para:
                counter += 1
    return counter/length

develfname = 'project_files/mytest.json' # Need to change later
acc = get_accuracy(develfname)

What is the literal  [u'literal', u'translation', u'theokotos']
[]


IndexError: list index out of range

In [None]:
# result_para = get_para(0, ['eventually', 'call', 'photon'])
# # print result_para
# sent = get_sent(result_para, ['eventually', 'call', 'photon'])
# print sent