

```
# Whoosh - Search by Question Only or Question and Answer
```



In [None]:
# receives
# .txt file containing a list of question-answer pairs identified by 'P: ' and 'R: '
# .txt file with a list of questions, one question per line

# produces
# .txt file containing the posed questions and the answers Whoosh retrieved for them, identified by 'P: ' and 'R: '

In [None]:
pip install whoosh

In [None]:
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh import qparser
from whoosh.analysis import LanguageAnalyzer

In [None]:
# file management
def open_file(filename, encoding=None):
    """Read a text file and return its lines.

    Args:
        filename: path of the file to read.
        encoding: text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default.

    Returns:
        A list with one string per line, line endings included.
    """
    # the context manager closes the file even if reading raises
    with open(filename, 'r', encoding=encoding) as read_file:
        return read_file.readlines()

def write_file(filename, content, encoding=None):
    """Write an iterable of strings to a text file, replacing any existing file.

    Args:
        filename: path of the file to write.
        content: iterable of strings; written as-is, no line endings are added.
        encoding: text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default.
    """
    # the context manager flushes and closes the file even if writing raises
    with open(filename, 'w', encoding=encoding) as file_write:
        file_write.writelines(content)

In [None]:
# mount Google Drive at /content/drive (Google Colab only) so that paths under it can be used below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# get question and answer from question-answer pair file
def get_ques_ans(file_path, ques_ans):
    """Return the question or the answer stored in a question-answer pair file.

    Args:
        file_path: path of a file whose first line is the question and whose
            following lines, up to the first blank line, are the answer.
        ques_ans: 0 to get the question, 1 to get the answer.

    Returns:
        The requested text with its line endings kept. An empty string is
        returned for an empty file or for any other value of ``ques_ans``.
    """
    file_content = open_file(file_path)

    # return question
    if ques_ans == 0:
        # an empty file has no question line; return '' instead of raising IndexError
        return file_content[0] if file_content else ''

    # return answer
    if ques_ans == 1:
        answer_lines = []
        for line in file_content[1:]:
            # a blank line marks the end of the answer
            if line == '\n':
                break
            answer_lines.append(line)
        return ''.join(answer_lines)

    return ''

In [None]:
# creates an index and writer objects, to add documents to be searched
def create_index_writer(config, files_paths, files_names, ix_path):
    """Create a Whoosh index and add one document per question-answer file.

    Args:
        config: how the ``question`` field is indexed: 'default', 'lang_pt'
            (Portuguese language analyzer), 'ngram_3' (n-grams of size 2-3)
            or 'ngram_4' (n-grams of size 2-4).
        files_paths: paths of the files holding one question-answer pair each.
        files_names: names of those files, in the same order as ``files_paths``.
        ix_path: directory where the index is stored; created if missing.

    Returns:
        The Whoosh index containing all the documents.

    Raises:
        ValueError: if ``config`` is not one of the supported values.
    """
    import os  # local import: only needed to make sure the index directory exists

    # only the 'question' field differs between configurations
    if config == 'default':
        question_field = TEXT
    elif config == 'lang_pt':
        question_field = TEXT(analyzer=LanguageAnalyzer('pt'))
    elif config == 'ngram_3':
        question_field = NGRAM(minsize=2, maxsize=3)
    elif config == 'ngram_4':
        question_field = NGRAM(minsize=2, maxsize=4)
    else:
        # previously an unknown config failed later with an unbound 'schema'
        raise ValueError(
            "config must be 'default', 'lang_pt', 'ngram_3' or 'ngram_4', got "
            + repr(config)
        )

    schema = Schema(
        title=TEXT(stored=True),
        path=ID,
        question=question_field,
        answer=TEXT(stored=True),
    )

    # create_in expects the index directory to exist already
    os.makedirs(ix_path, exist_ok=True)

    # creates whoosh index
    ix = create_in(ix_path, schema)

    # the writer commits when the block ends normally and cancels on error
    with ix.writer() as writer:
        for i, file_path in enumerate(files_paths):
            # gets question and answer from the file with the question-answer pair
            aux_question = get_ques_ans(file_path, 0)
            aux_answer = get_ques_ans(file_path, 1)

            # adds a document containing a question-answer pair to the index
            writer.add_document(
                title=files_names[i],
                path=file_path,
                question=aux_question,
                answer=aux_answer,
            )

    return ix

In [None]:
# retrieves the answer field of the document most similar to the posed question
def most_similar_doc(question, search_type, ix):
    """Return the stored answer of the document that best matches a question.

    Args:
        question: text of the posed question.
        search_type: 'ques' to search the question field only, 'quesans' to
            search both the question and answer fields.
        ix: Whoosh index to search.

    Returns:
        The ``answer`` field of the top-ranked document, or an empty string
        when no document matches.

    Raises:
        ValueError: if ``search_type`` is not 'ques' or 'quesans'.
    """
    # query terms are OR-ed together (OrGroup with a 0.9 factor)
    og = qparser.OrGroup.factory(0.9)

    # search only by the field question
    if search_type == 'ques':
        parser = qparser.QueryParser('question', ix.schema, group=og)

    # search by both question and answer fields
    elif search_type == 'quesans':
        parser = qparser.MultifieldParser(['question', 'answer'], ix.schema, group=og)

    else:
        # previously an unknown search type failed later with an unbound 'parser'
        raise ValueError("search_type must be 'ques' or 'quesans', got " + repr(search_type))

    query = parser.parse(question)
    answer = ''

    # use the searcher opened by the 'with' block so that it is always closed
    # (the earlier code opened a second searcher here and never closed it)
    with ix.searcher() as searcher:
        results = searcher.search(query, limit=1)

        # the stored field is read while the searcher is still open
        if len(results) > 0:
            answer = results[0]["answer"]

    return answer

In [None]:
# retrieves a file with the posed questions and Whoosh's answers
def scored_document_faqs(config, files_paths, files_names, ix_path, questions_file_path, search_type, save_file_path):
    """Answer every posed question with Whoosh and save the results to a file.

    Args:
        config: index configuration, passed to ``create_index_writer``.
        files_paths: paths of the files holding one question-answer pair each.
        files_names: names of those files, in the same order as ``files_paths``.
        ix_path: path where the index is stored.
        questions_file_path: path of the file with one question per line.
        search_type: 'ques' or 'quesans', passed to ``most_similar_doc``.
        save_file_path: path of the output file. Each answered question is
            written as a 'P: ' line, followed by the retrieved answer and a
            blank line. Questions without a retrieved answer are left out.
    """
    final_file_content = []
    questions = open_file(questions_file_path)

    # creates an index and writer objects, adds documents
    ix = create_index_writer(config, files_paths, files_names, ix_path)

    for ques in questions:
        # double quotes are removed before the text is parsed as a query
        question = ques.replace('"', '').strip()

        # blank lines are not questions
        if not question:
            continue

        # most similar document's answer
        answer = most_similar_doc(question, search_type, ix)

        if answer == '':
            continue

        # always end the question with a newline, so a last line without one
        # is not glued to its answer in the output file
        aux_q = 'P: ' + question + '\n'

        # likewise keep the answer separated from the blank line that follows it
        if not answer.endswith('\n'):
            answer = answer + '\n'

        final_file_content.append(aux_q)
        final_file_content.append(answer)
        final_file_content.append('\n')

        print(aux_q)
        print(answer)
        print('\n')

    write_file(save_file_path, final_file_content)
    print('File with posed questions and respective answers created!')

In [None]:
# divides original domain file in multiple files, one question-answer pair per file
def divide_faqs(domain_file_path, save_domain_path):
    """Split a domain file into files holding one question-answer pair each.

    The domain file marks questions with 'P: ' and answers with 'R: ' at the
    start of a line. An answer runs until the next blank line. The pairs are
    saved as FAQ1.txt, FAQ2.txt, ... inside ``save_domain_path``.

    Args:
        domain_file_path: path of the file with all the question-answer pairs.
        save_domain_path: directory where the individual files are saved.

    Returns:
        A tuple ``(files_paths, files_names)`` with the paths and the names of
        the files written, in the order they were created.
    """
    import os  # local import: used only to build the output paths

    ques_ans_pair = []
    files_paths = []
    files_names = []

    faqs_list = open_file(domain_file_path)

    for i, line in enumerate(faqs_list):
        # the markers only count at the start of a line; a plain substring test
        # also matched text such as 'P: ' inside an answer, which duplicated
        # lines or started spurious files (a leading BOM or indent is ignored)
        line_start = line.lstrip('\ufeff \t')

        if line_start.startswith('P: '):
            ques_ans_pair.append(line)

        elif line_start.startswith('R: '):
            # the answer is every line from the marker up to the next blank line
            for answer_line in faqs_list[i:]:
                if answer_line == '\n':
                    break
                ques_ans_pair.append(answer_line)

            aux_file_name = 'FAQ' + str(len(files_names) + 1) + '.txt'
            # join also works when save_domain_path has no trailing separator
            aux_file_path = os.path.join(save_domain_path, aux_file_name)
            files_paths.append(aux_file_path)
            files_names.append(aux_file_name)

            write_file(aux_file_path, ques_ans_pair)
            ques_ans_pair = []

    return files_paths, files_names

In [None]:
# NOTES

# domain_file_path - path to the file containing the domain
# must be a file containing question-answer pairs identified with 'P: ' and 'R: ', respectively
# FAQs                      
# P: question1             
# R: answer1           
# \n                        
# P: question2              
# R: answer2                
# \n                       
# must be a .txt file

# save_domain_path - path of the directory where the files, each containing one question-answer pair, are to be saved
# ix_path - path where the index object is to be saved

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved
# must be a .txt file

# config can be: 
# 'default' - default whoosh configuration
# 'lang_pt' - portuguese language analyzer - converts words to lower-case, removes Portuguese stopwords, and converts words to their stem, following Portuguese rules
# 'ngram_3' - added n-gram filter(2-3)
# 'ngram_4' - added n-gram filter(2-4)

# search_type can be:
# 'ques' - search by question field only
# 'quesans' - search by both question and answer fields

In [None]:
# NOTE: every value below is a placeholder and must be replaced before running this cell
# (the meaning of each setting is described in the NOTES above)
domain_file_path = 'domain_file_path'
save_domain_path = 'save_domain_path'

ix_path = 'ix_path'

questions_file_path = 'questions_file_path'
save_file_path = 'save_file_path'

config = 'default'  # 'default' or 'lang_pt' or 'ngram_3' or 'ngram_4'
search_type = 'ques'  # 'ques' or 'quesans'

# splits the domain into one file per question-answer pair, then indexes the
# pairs and writes the answers retrieved for the posed questions
files_paths, files_names = divide_faqs(domain_file_path, save_domain_path)
scored_document_faqs(config, files_paths, files_names, ix_path, questions_file_path, search_type, save_file_path)