## Question and Answer System
### Submitted By: Anubhav Gupta
### Date: 03/23/2018

In [3]:
import nltk 
import glob
import re
import numpy as np
import os
from nltk.corpus import stopwords

#### Reading documents from the corpus

In [4]:
def read_document_directory(dir_path):
    """Collect the rows of every file matching a glob pattern.

    dir_path is passed straight to glob.glob, so it is a pattern such as
    '2013/*.txt' rather than a bare directory name. Every matching file is
    read with read_single_text_file and the rows are concatenated in the
    order glob yields the files.
    """
    collected_rows = []
    for matched_path in glob.glob(dir_path):
        collected_rows += read_single_text_file(matched_path)
    return collected_rows

def read_single_text_file(file_path):
    """Read a text file and return its lines with non-ASCII characters removed.

    The file is decoded as latin-1. Each returned row keeps its line ending,
    exactly as readlines() would deliver it.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file.
    """
    # The previous loop rebound its loop variable and threw the result away,
    # so nothing was ever stripped; build the cleaned list explicitly.
    # The with-block also guarantees the handle is closed if reading fails.
    with open(file_path, 'r', encoding="latin-1") as handle:
        return [re.sub(r'[^\x00-\x7f]', r'', row) for row in handle]



#### This is a method for a future version of this system that will clean the documents before pushing them to Elasticsearch. Cleaning should improve search quality and relevancy scores.

In [5]:
#This is for a future version.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import sent_tokenize, word_tokenize
from nltk import pos_tag

# POS tag (as looked up on pos_tag output in clean_doc) -> WordNet part of
# speech to hand to the lemmatizer. Tags not listed here fall back to the
# lemmatizer's default.
pos_to_wornet_dict = {
    **dict.fromkeys(('JJ', 'JJR', 'JJS'), wordnet.ADJ),
    **dict.fromkeys(('RB', 'RBR', 'RBS'), wordnet.ADV),
    **dict.fromkeys(('NN', 'NNP', 'NNS', 'NNPS'), wordnet.NOUN),
    **dict.fromkeys(('VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ'), wordnet.VERB),
}

# Matches runs of characters other than letters, digits, '%', '$' and
# whitespace; clean_doc replaces each run with a single space.
regex = r'[^A-Za-z0-9\%\$\s]+'

# English stopwords held in a set for fast membership tests.
stop = set(stopwords.words('english'))

lem = WordNetLemmatizer()

def clean_doc(doc):
    """Normalise a document into a lower-cased, lemmatized, stopword-free string.

    The document is split into sentences. For each sentence, punctuation
    (everything matched by the module-level ``regex``) is replaced by spaces,
    the tokens are POS-tagged, stopwords are dropped and the remaining tokens
    are lemmatized and lower-cased. A '.' token is appended after every
    sentence.

    Args:
        doc: Raw document text.

    Returns:
        The cleaned tokens of all sentences joined by single spaces.
    """
    cleaned_tokens = []
    for sentence in sent_tokenize(doc):
        # Keep only alphanumerics, '%', '$' and whitespace.
        sentence = re.sub(regex, ' ', sentence)

        for token, tag in pos_tag(word_tokenize(sentence)):
            # The stopword list is lower-case, so compare case-insensitively;
            # otherwise capitalised stopwords ("The", "It") slip through and
            # end up in the lower-cased output.
            if token.lower() in stop:
                continue
            wordnet_pos = pos_to_wornet_dict.get(tag)
            if wordnet_pos is None:
                lemma = lem.lemmatize(token)
            else:
                lemma = lem.lemmatize(token, wordnet_pos)
            cleaned_tokens.append(lemma.lower())

        # Sentence boundary marker.
        cleaned_tokens.append('.')
    return ' '.join(cleaned_tokens)
   


#### Putting documents into Elasticsearch for indexing. Elasticsearch provides fast full-text search and a relevancy score based on the Okapi BM25 formula when matching documents against a keyword query. It therefore handles the retrieval of the top N documents from the corpus.

In [6]:
from elasticsearch import Elasticsearch

def getElasticSearchConnection():
    """Return a new Elasticsearch client configured for localhost:9200."""
    return Elasticsearch([{'host': 'localhost', 'port': 9200}])

def createIndexCorpus(rows, force=False):
    """Load ``rows`` into the "docs" index and return the client used.

    Nothing is indexed when the index already exists, unless ``force`` is
    true. Otherwise any existing "docs" index is deleted first and every row
    is stored as one document with ids counting up from 1. Each id is printed
    as a progress indicator.
    """
    client = getElasticSearchConnection()
    if client.indices.exists(index="docs") and not force:
        return client

    # 400/404 are ignored so deleting a missing index is not an error.
    client.indices.delete(index='docs', ignore=[400, 404])
    # Future version: also maintain a 'docs_search' index holding
    # clean_doc(row) for each row.
    for doc_id, row in enumerate(rows, start=1):
        client.index(index='docs', doc_type='docs_type', id=doc_id, body={"text": row})
        print(doc_id)
    return client



#### Finally, the code below needs to be executed only once. This segment populates the ElasticSearch index once in the lifetime of the application.

In [7]:
# One-time processing: read the corpus and populate the search index.

# Glob patterns for the two yearly document directories.
dir_path_2013 = '2013/*.txt'
dir_path_2014 = '2014/*.txt'

# Every row (line) of every text file is treated as one document.
all_rows = read_document_directory(dir_path_2013) + read_document_directory(dir_path_2014)

# No cleaning is applied to the rows yet.

# Load the documents into Elasticsearch; with force=False an existing index
# is left as it is.
es = createIndexCorpus(all_rows, force=False)

#### The following code is a generic cleaning function for all sentences (including questions).

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.chunk import conlltags2tree, tree2conlltags, ne_chunk

# POS tag -> WordNet part of speech used when lemmatizing a token in
# clean_sentence. Tags are written space-separated per WordNet category.
pos_to_wornet_dict = {
    tag: wordnet_pos
    for wordnet_pos, tags in (
        (wordnet.ADJ, 'JJ JJR JJS'),
        (wordnet.ADV, 'RB RBR RBS'),
        (wordnet.NOUN, 'NN NNP NNS NNPS'),
        (wordnet.VERB, 'VB VBG VBD VBN VBP VBZ'),
    )
    for tag in tags.split()
}

# English stopwords as a set for fast membership tests.
stop = set(stopwords.words('english'))
lem = WordNetLemmatizer()

def clean_sentence(sentence, keep_stop=None):
    """Tokenize, tag and lemmatize a sentence, dropping stopwords.

    Args:
        sentence: Raw sentence or question text.
        keep_stop: Optional collection of stopwords that must NOT be removed
            (for example the wh-words when analysing a question). Defaults
            to keeping none.

    Returns:
        A 3-tuple ``(sentence, sentence_lemma, it)``:
        ``sentence`` is the text after punctuation has been replaced by
        spaces; ``sentence_lemma`` is a list of
        ``(lemma, pos_tag, iob_ner_tag, original_token)`` tuples for the
        tokens that survived stopword removal; ``it`` is a one-shot iterator
        over adjacent pairs of those tuples.
    """
    # None instead of a mutable literal as the default argument.
    if keep_stop is None:
        keep_stop = ()

    # Keep only alphanumerics, '%', '$' and whitespace.
    regex = r'[^A-Za-z0-9\%\$\s]+'
    sentence = re.sub(regex, ' ', sentence)

    # Tokenize -> POS tag -> NE chunk -> flatten to (token, pos, iob) triples.
    sentence_pos = pos_tag(word_tokenize(sentence))
    iob_tagged = tree2conlltags(ne_chunk(sentence_pos))

    # Remove stopwords and lemmatize.
    sentence_lemma = []
    for token, pos, iob in iob_tagged:
        lowered = token.lower()
        # The stopword list is lower-case, so compare case-insensitively;
        # otherwise a capitalised stopword ("The", "Is") is never removed.
        # keep_stop is honoured for either spelling of the token.
        if lowered in stop and lowered not in keep_stop and token not in keep_stop:
            continue
        wordnet_pos = pos_to_wornet_dict.get(pos)
        if wordnet_pos is None:
            lemma = lem.lemmatize(token)
        else:
            lemma = lem.lemmatize(token, wordnet_pos)
        sentence_lemma.append((lemma, pos, iob, token))

    # Bigram iterator over the surviving tokens.
    it = nltk.bigrams(sentence_lemma)

    return sentence, sentence_lemma, it


#### This section deals with the following:
1. Determining question type and finding out nltk NER chunker compatible tags
2. Keyword Extraction
3. Document selection and scoring using ElasticSearch to fetch top N documents relevant to the Keywords

In [9]:
############# Document Selection #######################
# Question words recognised when classifying a question.
wh_words = ['what', 'where', 'who', 'how', 'when', 'whose', 'whom', 'why', 'which']

def extract_question_ner_tags(question):
    """Choose the IOB entity tags an answer to ``question`` should contain.

    The first wh-word found in the cleaned question decides the answer type:
    who/whom/whose map to PERSON tags, where maps to GPE tags, and which/what
    is refined by the token that follows it (e.g. "which company" maps to
    ORGANIZATION tags).

    Args:
        question: The question text.

    Returns:
        A list of IOB tag strings. The list is empty when no wh-word is found
        or when the wh-word has no mapping (how, when, why).
    """
    _, sentence_lemma, _ = clean_sentence(question, keep_stop=set(wh_words))
    lower_question = [tag[0].lower() for tag in sentence_lemma]

    # Locate the first wh-word.
    id_word, q_word = 0, 'unknown'
    for idx, word in enumerate(lower_question):
        if word in wh_words:
            id_word, q_word = idx, word
            break

    if q_word in ('who', 'whom', 'whose'):
        return ['B-PERSON', 'I-PERSON']
    if q_word == 'where':
        return ['B-GPE', 'I-GPE']
    if q_word in ('which', 'what'):
        # The wh-word can be the last surviving token ("What is it?" loses
        # "is" and "it" as stopwords); indexing past the end used to raise
        # IndexError. Treat a missing follower as "no specific head noun".
        # The lower-cased token is used so "Which Company" is recognised too.
        follower = lower_question[id_word + 1:id_word + 2]
        next_word = follower[0] if follower else ''
        if next_word in ('company', 'organization', 'firm'):
            return ['B-ORGANIZATION', 'I-ORGANIZATION']
        if next_word in ('person', 'people', 'name', 'man', 'woman'):
            return ['B-PERSON', 'I-PERSON']
        if next_word in ('percentage', 'quantity', 'amount', 'price', 'weight', 'measure'):
            return ['O']
        return ['B-NP', "I-NP", 'O']
    return []
        
        
def extract_keywords(question):
    """Count the content words of a question.

    The question is run through clean_sentence; tokens whose lower-cased
    lemma is a wh-word are skipped. Each remaining token contributes its
    lemma and, when it differs from the lemma, its original surface form.

    Returns:
        A dict mapping each keyword to the number of times it was counted.
    """
    _, tagged_tokens, _ = clean_sentence(question)

    keywords = {}
    for lemma, _pos, _iob, surface in tagged_tokens:
        if lemma.lower() in wh_words:
            continue
        keywords[lemma] = keywords.get(lemma, 0) + 1
        if surface != lemma:
            keywords[surface] = keywords.get(surface, 0) + 1
    return keywords

def findTopNDocs(keywords, n = 1):
    """Run a match query on the "text" field of the "docs" index.

    The keys of ``keywords`` are concatenated, each followed by a space, to
    form the query text. At most ``n`` hits are requested and the raw search
    response is returned.
    """
    query_text = "".join(term + " " for term in keywords.keys())
    client = getElasticSearchConnection()
    match_query = {"query": {"match": {'text': query_text}}}
    return client.search(index="docs", size=n, body=match_query)
    
def flatten_es_to_list(es_type):
    """Return the stored "text" of every hit in a search response, in hit order."""
    return [hit["_source"]["text"] for hit in es_type["hits"]["hits"]]

def findRelevantDocs(question, num):
    """Fetch up to ``num`` documents matching the keywords of ``question``.

    The extracted keywords are printed before the search is issued.

    Returns:
        A 3-tuple of the keyword dict, the raw search response and the list
        of document texts taken from that response.
    """
    keywords = extract_keywords(question)
    print(keywords)
    response = findTopNDocs(keywords, num)
    doc_texts = flatten_es_to_list(response)
    return keywords, response, doc_texts


#### This section deals with the following:
1. Sentence Extraction from the documents
2. Selecting only the sentences which have matching NER tags determined from the question type
3. Scoring the sentences based on the following criteria:
    * The number of words in the candidate sentence that occur adjacently in both the question and the answer candidate (+)
    * The TF-IDF sum of the number of words that matched between question and answer (+)
    * The TF-IDF sum of the number of question content words that did not match in the answer (-)

In [10]:
#########################Answer Selection########################
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
import math
from operator import itemgetter

def flatten_docs_to_sentences(flatten_docs):
    """Split every document into sentences and return them as one flat list."""
    return [
        sentence
        for document in flatten_docs
        for sentence in sent_tokenize(document)
    ]

def score_sentences(sentences, keywords, target_ner_tags):
    """Score and rank candidate answer sentences.

    Only sentences containing at least one token whose IOB tag is in
    ``target_ner_tags`` are considered, and only those with a positive
    matched score are returned.

    Args:
        sentences: Candidate sentence strings.
        keywords: Container of question keywords (tested with ``in``).
        target_ner_tags: IOB tags an answer sentence must contain.

    Returns:
        A list of tuples
        ``(sentence, sentence_lemma, adjacency_score, matched_tf_idf,
        unmatched_tf_idf, matched_tf_idf - unmatched_tf_idf)``
        sorted by ``(adjacency_score, matched_tf_idf)`` in descending order.
    """
    total_sentences = len(sentences)
    results = []
    for sent in sentences:
        sent_text, sent_lemma, bi_iterator = clean_sentence(sent)

        # Don't consider sentences which don't have target NER tags.
        if not any(token[2] in target_ner_tags for token in sent_lemma):
            continue

        # Number of adjacent token pairs whose lemmas are both keywords (+).
        score1 = sum(
            1
            for first, second in bi_iterator
            if first[0] in keywords and second[0] in keywords
        )

        # Occurrences of each lemma within this sentence.
        freq_word = {}
        for token in sent_lemma:
            freq_word[token[0]] = freq_word.get(token[0], 0) + 1

        # Sum a tf*idf weight over the tokens: tokens that are keywords add
        # to the matched total (+), all other tokens to the unmatched total (-).
        # NOTE(review): the idf term uses the in-sentence count where a
        # document frequency would normally appear; kept as-is so existing
        # rankings do not change.
        matched_tf_idf = 0
        unmatched_tf_idf = 0
        for token in sent_lemma:
            word = token[0]
            tf_word = freq_word[word]
            # A word can occur more often in one sentence than there are
            # candidate sentences; the unclamped numerator then goes
            # negative and math.log raises ValueError. Clamping leaves every
            # previously valid value unchanged.
            numerator = max(total_sentences - tf_word, 0) + 0.5
            idf_word = math.log(numerator / (tf_word + 0.5))
            tf_idf_word = tf_word * idf_word
            if word in keywords:
                matched_tf_idf += tf_idf_word
            else:
                unmatched_tf_idf += tf_idf_word

        if matched_tf_idf > 0:
            results.append((sent, sent_lemma, score1, matched_tf_idf,
                            unmatched_tf_idf, matched_tf_idf - unmatched_tf_idf))

    results.sort(key=itemgetter(2, 3), reverse=True)
    return results


#### This method encapsulates the entire Q&A process using the functions defined above

In [11]:
def get_answer_sentence(question):
    """Answer a question with the best-scoring sentence from the corpus.

    Retrieves the top 5 documents for the question, splits them into
    sentences, scores those against the question keywords and expected
    entity tags, and wraps the top sentence in a reply string. A fixed
    fallback reply is returned when no sentence qualifies.
    """
    keywords, _response, doc_texts = findRelevantDocs(question, 5)
    candidates = flatten_docs_to_sentences(doc_texts)
    wanted_tags = extract_question_ner_tags(question)
    ranked = score_sentences(candidates, keywords, wanted_tags)
    if not ranked:
        return "I don't know that one!"
    return "I think the answer might be in: " + ranked[0][0]

#### Test Cases: This section runs the entire system for specific question types. 

In [23]:
def print_question_answer(question):
    """Print the question, then the answer produced for it."""
    # The question is printed before the answer is computed so that output
    # emitted while answering appears after the question line.
    print("Question: ", question)
    reply = get_answer_sentence(question)
    print(reply)

# Proof-of-concept runs, one group per question type.

# CEO test cases
print("POC For CEO Question Type:")
for ceo_question in (
    "Who is the CEO of Google?",
    "Who is the CEO of Facebook?",
    "Who is the CEO of Twitter?",
    "Who is the CEO of Amazon?",
):
    print_question_answer(ceo_question)

# Bankruptcy test cases
print("\n\nPOC For company bankrupt Question Type:")
for bankruptcy_question in (
    "Which companies went bankrupt in month September of year 2008?",
    "Which companies went bankrupt in month may of year 2009?",
    "Which companies went bankrupt in month december of year 2001?",
):
    print_question_answer(bankruptcy_question)

# Descriptive test cases
print("\n\nPOC For description type questions:")
for description_question in (
    "What affects GDP?",
    "What affects interest rates?",
    "What affects inflation index?",
):
    print_question_answer(description_question)


POC For CEO Question Type:
Question:  Who is the CEO of Google?
{'CEO': 1, 'Google': 1}
I think the answer might be in: Larry Page (CEO of Google) Net Worth: $32.3 billion 4.
Question:  Who is the CEO of Facebook?
{'CEO': 1, 'Facebook': 1}
I think the answer might be in: Mark Zuckerberg, CEO of Facebook, when Facebook's IPO flopped, the talk was that the young Zuckerberg wasn't ready to be a strong leader of a major company.
Question:  Who is the CEO of Twitter?
{'CEO': 1, 'Twitter': 1}
I think the answer might be in: In addition to being Twitter's CEO, Evan Williams was its largest shareholder.
Question:  Who is the CEO of Amazon?
{'CEO': 1, 'Amazon': 1}
I think the answer might be in: Prime members can also borrow more than 700,000 books, listen to one million songs and hundreds of playlists, save unlimited photos and watch tens of thousands of movies and TV episodes including the Golden Globe nominated show from Amazon Studios, Transparent, said Jeff Bezos, founder and CEO of Amazo

#### This method can be run in a server environment.

In [None]:
# Interactive loop; can be run in a server/console environment.
def takeQuestion():
    """Prompt the user and return the text they entered."""
    return input("Ask me anything: ")

# Answer five questions, then stop; q counts the questions still to ask.
q = 5
while q > 0:
    question = takeQuestion()
    print(get_answer_sentence(question))
    q = q - 1

#### Again, for a future version, we could use Stanford's NER tagger which is much more robust than the nltk NER tagger. 

In [10]:
#from nltk.tag import StanfordNERTagger
#from nltk.tokenize import word_tokenize

#st = StanfordNERTagger('C:\\Users\\Anubhav\\Desktop\\git\\iems308\\assignment4\\stanford\\stanford-ner-2018-02-27\\classifiers\\english.muc.7class.distsim.crf.ser',
#                       'C:/Users/Anubhav/Desktop/git/iems308/assignment4/stanford/stanford-ner-2018-02-27/stanford-ner.jar',
#                       encoding='utf-8')

#text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

#tokenized_text = word_tokenize(text)
#classified_text = st.tag(tokenized_text)

#print(classified_text)