

```
# BERT + Whoosh
```



In [None]:
# Input:
# - a list of .txt files containing raw text
# - a .txt file with a list of questions, one question per line

# Output:
# - a .txt file containing the posed questions and BERT's respective answers, identified by 'P: ' and 'R: '

In [None]:
pip install transformers

In [None]:
pip install whoosh

In [3]:
import transformers
from transformers import pipeline
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh import qparser

In [4]:
# file management
def open_file(filename):
    """Read a text file and return its lines.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str
        The lines of the file as returned by ``readlines``; each line keeps
        its trailing newline when the file has one.
    """
    # the context manager closes the handle even if reading raises
    with open(filename, 'r') as read_file:
        return read_file.readlines()

def write_file(filename, content):
    """Write a sequence of strings to a text file, replacing its contents.

    Parameters
    ----------
    filename : str
        Path of the file to write; it is opened in ``'w'`` mode.
    content : iterable of str
        Strings written one after another; no separators are added.
    """
    # the context manager flushes and closes the handle even if writing raises
    with open(filename, 'w') as file_write:
        file_write.writelines(content)

In [5]:
# Mount Google Drive at /content/drive. This uses the google.colab package,
# so the cell is meant to be run in Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# BERT - question answering pipeline
def bert_model_qa(model_name, pipeline_name):
    """Build a transformers pipeline for a given task and model.

    Parameters
    ----------
    model_name : str
        Model identifier, passed to ``pipeline`` as ``model``.
    pipeline_name : str
        Task name, passed to ``pipeline`` as its first argument.

    Returns
    -------
    The object returned by ``pipeline``.
    """
    return pipeline(pipeline_name, model=model_name)

def get_answer_qa(context, question, pipe_ques_answering):
    """Ask the question-answering pipeline one question about one context.

    Parameters
    ----------
    context : str
        Text in which the answer is looked for.
    question : str
        Question to answer.
    pipe_ques_answering : callable
        Pipeline object; it is called with ``question`` and ``context``
        as keyword arguments.

    Returns
    -------
    Whatever the pipeline returns, unchanged.
    """
    return pipe_ques_answering(question=question, context=context)

# Build the question-answering pipeline once; the run cells further down
# pass this object to bert_answers_score.
pipe_ques_answering = bert_model_qa(
    model_name='pierreguillou/bert-base-cased-squad-v1.1-portuguese',
    pipeline_name="question-answering",
)
print('Question Answering Model Downloaded')

In [7]:
# context management
def format_context(context_list):
    """Concatenate the pieces of a context into a single string.

    Parameters
    ----------
    context_list : iterable of str
        Pieces of text, e.g. the lines returned by ``open_file``.

    Returns
    -------
    str
        All pieces joined in order with nothing inserted between them;
        ``''`` for an empty input.
    """
    # join builds the result in one pass instead of re-copying the growing
    # string on every iteration
    return ''.join(context_list)

In [8]:
# retrieve answer from BERT-QA
def get_bert_answer(context_file_path, question, pipe_ques_answering):
    """Answer a question using the text of one file as the context.

    Parameters
    ----------
    context_file_path : str
        Path of the text file whose content is used as context.
    question : str
        Question to answer.
    pipe_ques_answering : callable
        Question-answering pipeline, forwarded to ``get_answer_qa``.

    Returns
    -------
    The result of ``get_answer_qa`` for that context and question.
    """
    # read the file and merge its lines into a single context string
    document_text = format_context(open_file(context_file_path))
    return get_answer_qa(document_text, question, pipe_ques_answering)

In [9]:
# creates an index and writer objects, to add documents to be searched
def create_index_writer(files_paths, files_names, ix_path):
    """Create a Whoosh index with one document per text file.

    Parameters
    ----------
    files_paths : list of str
        Paths of the text files to index.
    files_names : list of str
        Names of the files; ``files_names[i]`` is indexed as the title of
        the file at ``files_paths[i]``.
    ix_path : str
        Directory passed to ``create_in``, where the index is created.

    Returns
    -------
    The index returned by ``create_in``, after all documents were added.
    """
    # content is indexed as n-grams of size 2 to 3; path and content are
    # stored so they can be read back from a search hit
    schema = Schema(title = TEXT, path = ID(stored = True), content = NGRAM(minsize = 2, maxsize = 3, stored = True))

    # creates whoosh index
    ix = create_in(ix_path, schema)

    # used as a context manager, the writer commits when the block ends
    # normally and cancels if an exception escapes (e.g. a missing file)
    with ix.writer() as writer:
        for i, file_name in enumerate(files_names):
            file_path = files_paths[i]

            # index the text of the file itself, not its path
            file_text = format_context(open_file(file_path))

            writer.add_document(title = file_name, path = file_path, content = file_text)

    return ix

In [10]:
 # retrieves the path of the document most similar to the posed question
def most_similar_doc(ix, question):
    """Return the stored path of the document that best matches a question.

    Parameters
    ----------
    ix : Whoosh index
        Index whose schema has a ``content`` field and a stored ``path``
        field.
    question : str
        Text parsed into a query against the ``content`` field.

    Returns
    -------
    str
        The ``path`` of the top-ranked hit, or ``''`` when nothing matches.
    """
    path = ''

    # terms of the question are combined with an OR group
    og = qparser.OrGroup.factory(0.9)
    parser = qparser.QueryParser('content', ix.schema, group=og)
    query = parser.parse(question)

    # a single searcher, closed when the block exits; the hit is read inside
    # the block, while that searcher is still open
    with ix.searcher() as searcher:
        results = searcher.search(query, limit = 1)

        if len(results) > 0:
            path = results[0]["path"]

    return path

In [24]:
# retrieves a file with the posed questions and BERT's answers
def bert_answers_score(files_paths, files_names, ix_path, questions_file_path, save_file_path, pipe_ques_answering):
    """Answer every question in a file and save questions and answers.

    For each question, the best-matching document is looked up in a Whoosh
    index built from the given files, and the question is answered with
    that document as context.

    Parameters
    ----------
    files_paths : list of str
        Paths of the text files that may contain the answers.
    files_names : list of str
        Names of those files, in the same order as ``files_paths``.
    ix_path : str
        Directory in which the Whoosh index is created.
    questions_file_path : str
        Text file with one question per line; blank lines are skipped.
    save_file_path : str
        File to which the result is written.
    pipe_ques_answering : callable
        Question-answering pipeline.

    Notes
    -----
    The output holds, per question, a ``'P: '`` line with the question and
    an ``'R: '`` line with the answer, followed by an empty line. The answer
    is left empty when no document matches the question.
    """
    questions = open_file(questions_file_path)
    final_file_content = []

    # create whoosh index
    ix = create_index_writer(files_paths, files_names, ix_path)

    for question in questions:
        # drop the line break; the last line of the file may not have one
        aux_q = question.replace('\n', '')

        # a blank line holds no question to ask
        if not aux_q.strip():
            continue

        # path of the most similar document ('' when nothing matches)
        path = most_similar_doc(ix, aux_q)

        if path:
            answer_text = get_bert_answer(path, aux_q, pipe_ques_answering)['answer']
        else:
            # no document matched, so there is no context file to read
            answer_text = ''

        # the newline is added explicitly so that 'P: ' and 'R: ' always end
        # up on separate lines, also for a last question without line break
        final_file_content.append('P: ' + aux_q + '\n')
        final_file_content.append('R: ' + answer_text)
        final_file_content.append('\n')
        final_file_content.append('\n')

    write_file(save_file_path, final_file_content)
    print('File with posed questions and respective answers created!')

In [None]:
# NOTES

# files_paths - list with the paths of the files in the domain
# files_names - list with the names of the files in the domain
# the two lists must be aligned: files_names[i] is the name of the file at files_paths[i]

# must be a collection of files containing raw text
# Text
# Paragraph1Line1
# Paragraph1Line2
# \n                       
# Paragraph2Line1
# Paragraph2Line2
# \n                        
# must be a .txt file                     

# ix_path - path where the index object is to be saved

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved
# must be a .txt file

In [None]:
# TEMPLATE - copy this cell, replace the placeholder values and uncomment the lines.
# It is kept commented out because an assignment without a value is a syntax
# error and would stop the notebook at this cell.

# files_paths = ['file_path_1', 'file_path_2']
# files_names = ['files_name_1', 'files_name_2']

# ix_path = 'ix_path'
# questions_file_path = 'questions_file_path'
# save_file_path = 'save_file_path'

# bert_answers_score(files_paths, files_names, ix_path, questions_file_path, save_file_path, pipe_ques_answering)

In [None]:
# Example run. The paths are relative, so they are resolved against the
# current working directory, and each path equals its file name.
files_names = [
    'MEO_SmartHome.txt',
    'Modem_Huawei_E1750.txt',
    'Placa_ZTE_MF63.txt',
    'Placa_ZTE_MF65.txt',
]
files_paths = list(files_names)

# index directory, questions file and output file
ix_path = '/content/'
questions_file_path = 'Test_AlticeText.txt'
save_file_path = 'Results.txt'

bert_answers_score(
    files_paths,
    files_names,
    ix_path,
    questions_file_path,
    save_file_path,
    pipe_ques_answering,
)