

```
# Whoosh - Highlights
```



In [None]:
# Inputs:
#   - a list of .txt files containing raw text
#   - a .txt file with a list of questions, one question per line

# Output:
#   - a .txt file containing the posed questions and the corresponding Whoosh answers, prefixed with 'P: ' and 'R: ' respectively

In [1]:
pip install whoosh

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 2.7 MB/s 
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


In [2]:
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh import qparser
from whoosh.analysis import LanguageAnalyzer
from whoosh import highlight

In [3]:
# file management
def open_file(filename):
    """Read a text file and return its lines.

    Args:
        filename: path of the file to read.

    Returns:
        list of str: one entry per line, each keeping its line terminator
        (exactly what ``readlines`` returns).
    """
    # the context manager closes the handle even if reading raises
    with open(filename, 'r') as read_file:
        return read_file.readlines()

def write_file(filename, content):
    """Write a sequence of strings to a text file, replacing any previous content.

    Args:
        filename: path of the file to (over)write.
        content: iterable of strings; they are written back to back, so each
            string must carry its own line terminator where one is wanted.
    """
    # the context manager flushes and closes the handle even if writing raises
    with open(filename, 'w') as file_write:
        file_write.writelines(content)

In [None]:
# mount google drive
# NOTE: `google.colab` is only importable inside Google Colab; skip this cell when running elsewhere.
# The Drive is mounted at the '/content/drive' mount point passed below.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# get content of file
def get_content(file_path):
    """Return the whole text of a file as a single string.

    Args:
        file_path: path of the text file to read.

    Returns:
        str: the file's content, line terminators included ('' for an empty file).
    """
    # read in one call instead of concatenating line by line, which re-copied
    # the accumulated string on every iteration
    with open(file_path, 'r') as text_file:
        return text_file.read()

In [5]:
# creates an index and writer objects, to add documents to be searched
def create_index_writer(config, files_paths, files_names, ix_path):
    """Build a Whoosh index over a collection of text files.

    Args:
        config: how the `content` field is indexed; one of
            'default' (plain TEXT field),
            'lang_pt' (TEXT field with LanguageAnalyzer('pt')),
            'ngram_3' (NGRAM field, sizes 2 to 3),
            'ngram_4' (NGRAM field, sizes 2 to 4).
        files_paths: list of paths of the files to index.
        files_names: list of titles, position-matched with `files_paths`.
        ix_path: directory in which the index is created (made if missing).

    Returns:
        The Whoosh index containing one document per file.

    Raises:
        ValueError: if the two lists differ in length or `config` is unknown.
    """
    import os  # local import: only needed to make sure the index directory exists

    if len(files_paths) != len(files_names):
        raise ValueError(
            'files_paths and files_names must have the same length '
            '(got %d and %d)' % (len(files_paths), len(files_names)))

    # only the `content` field differs between configurations
    if config == 'default':
        content_field = TEXT(stored = True)
    elif config == 'lang_pt':
        content_field = TEXT(analyzer=LanguageAnalyzer('pt'), stored = True)
    elif config == 'ngram_3':
        content_field = NGRAM(minsize = 2, maxsize = 3, stored = True)
    elif config == 'ngram_4':
        content_field = NGRAM(minsize = 2, maxsize = 4, stored = True)
    else:
        # previously fell through and failed later with an unbound `schema`
        raise ValueError(
            "unknown config %r; expected 'default', 'lang_pt', 'ngram_3' or 'ngram_4'"
            % (config,))

    schema = Schema(title = TEXT(stored = True),
                    path = ID,
                    content = content_field)

    # create_in expects the target directory to exist already
    os.makedirs(ix_path, exist_ok=True)
    ix = create_in(ix_path, schema)

    # as a context manager the writer commits on success and cancels on error
    with ix.writer() as writer:
        for name, path in zip(files_names, files_paths):
            # one document per text file, holding the file's full content
            writer.add_document(title = name, path = path, content = get_content(path))

    return ix

In [6]:
# retrieves the highlights of the document most similar to the posed question
def most_similar_doc(ix, question):
    """Return the highlighted excerpts of the best-matching document for a question.

    Args:
        ix: Whoosh index whose schema has a stored `content` field.
        question: the question text, parsed as a query against `content`.

    Returns:
        str: the highlights of the top hit with blank lines ('\\n\\n') collapsed
        to single newlines, or '' when no document matches the question.
    """
    # OR-group: a document does not need to contain every query term; per the
    # Whoosh docs the 0.9 factor favours documents matching more of the terms
    og = qparser.OrGroup.factory(0.9)
    parser = qparser.QueryParser('content', ix.schema, group=og)
    query = parser.parse(question)

    # use the searcher opened by the `with` so it is closed on exit (the original
    # opened a second searcher here and never closed it)
    with ix.searcher() as searcher:
        results = searcher.search(query, limit = 1)

        # nothing matched: results[0] would raise IndexError
        if results.is_empty():
            return ''

        # NullFormatter leaves the matched terms unmarked in the excerpt
        results.formatter = highlight.NullFormatter()
        # highlights must be read while the searcher is still open
        return results[0].highlights('content').replace('\n\n', '\n')

In [7]:
# retrieves a file with the posed questions and Whoosh's answers
def scored_document(config, files_paths, files_names, ix_path, questions_file_path, save_file_path):
    """Answer every question in a file and save the question/answer pairs.

    Builds the index with `create_index_writer`, looks up each question with
    `most_similar_doc`, and writes the result to `save_file_path`: each question
    on a line prefixed with 'P: ', its answer prefixed with 'R: ', and a blank
    separator after each pair.

    Args:
        config: index configuration, forwarded to `create_index_writer`.
        files_paths: list of paths of the files to index.
        files_names: list of titles, position-matched with `files_paths`.
        ix_path: path where the index is stored.
        questions_file_path: text file with one question per line.
        save_file_path: text file to write the questions and answers to.
    """
    ix = create_index_writer(config, files_paths, files_names, ix_path)

    final_file_content = []
    for raw_question in open_file(questions_file_path):
        question = raw_question.strip()

        # a blank line is not a question; skip it rather than run an empty query
        if not question:
            continue

        # most similar document's answer
        answer = most_similar_doc(ix, question)

        # add the newline explicitly: the file's last line may lack one, which
        # used to glue 'P: ...' and 'R: ...' together on a single line
        final_file_content.append('P: ' + question + '\n')
        final_file_content.append('R: ' + answer)
        final_file_content.append('\n\n')

    write_file(save_file_path, final_file_content)
    print('File with posed questions and respective answers created!')

In [None]:
# NOTES

# files_paths - list with paths of files in domain
# files_names - list with names of files in domain
# position of each list must match

# must be a collection of files containing raw text
# Text
# Paragraph1Line1
# Paragraph1Line2
# \n                       
# Paragraph2Line1
# Paragraph2Line2
# \n                        
# must be a .txt file                     

# ix_path - path of the directory where the Whoosh index is to be saved

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved
# must be a .txt file

# config can be: 
# 'default' - default whoosh configuration
# 'lang_pt' - portuguese language analyzer - converts words to lower-case, removes Portuguese stopwords, and converts words to their stem, following Portuguese rules
# 'ngram_3' - content indexed with an NGRAM field, n-gram sizes 2 to 3
# 'ngram_4' - content indexed with an NGRAM field, n-gram sizes 2 to 4

In [None]:
# TEMPLATE CELL: replace every right-hand side below with a real value before running.
# As written, each assignment has only a commented-out example after the '=',
# so this cell is a SyntaxError until it is filled in.
files_paths = # ['file_path_1', 'file_path_2']
files_names = # ['files_name_1', 'files_name_2']

ix_path = # 'ix_path'
questions_file_path = # 'questions_file_path'
save_file_path = # 'save_file_path'

config = # 'default' or 'lang_pt' or 'ngram_3' or 'ngram_4'

# builds the index, answers every question and writes the result to save_file_path
scored_document(config, files_paths, files_names, ix_path, questions_file_path, save_file_path)