

```
# BERT - Portuguese Cased NLI
```



In [None]:
# receives
# .txt file containing a list of question-answer pairs identified by 'P: ' and 'R: '
# .txt file with a list of questions, one question per line

# produces
# .txt file containing the posed questions and BERT's respective answers, identified by 'P: ' and 'R: '

In [None]:
# Install or upgrade (-U) the sentence-transformers package in the notebook environment.
pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy
import scipy

In [None]:
# file management
def open_file(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line; each line keeps its trailing
        newline, exactly as returned by ``readlines``.
    """
    # The context manager closes the handle even when reading raises.
    with open(filename, 'r') as read_file:
        return read_file.readlines()

def write_file(filename, content):
    """Write a sequence of strings to a text file, replacing its contents.

    Args:
        filename: Path of the file to create or overwrite.
        content: Iterable of strings, written back to back; no line
            separators are added.
    """
    # The context manager flushes and closes the handle even when writing raises.
    with open(filename, 'w') as file_write:
        file_write.writelines(content)

In [None]:
# mount google drive
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): presumably the file paths set at the end of the notebook live under this mount point - confirm.
drive.mount('/content/drive')

In [None]:
# BERT - portuguese cased NLI
def bert_model_pt(model_name):
    """Instantiate a SentenceTransformer for ``model_name`` and return it.

    Prints a confirmation message once the model object has been created.
    """
    loaded_model = SentenceTransformer(model_name)
    print('Portuguese Cased NLI Model Downloaded')
    return loaded_model

def get_vector(question, model):
    """Return whatever ``model.encode`` produces for ``question``."""
    return model.encode(question)

In [None]:
# returns the answer corresponding to the question in position question_index of question-answer pairs file
def get_answer(file_content, question_index):
    """Collect the answer lines that follow a question in the domain file.

    Starting on the line after ``question_index``, lines are gathered until
    a blank line (exactly ``'\\n'``), a line containing ``'P: '``, or the end
    of ``file_content`` is reached.

    Returns:
        The gathered lines concatenated into one string ('' if none).
    """
    answer_lines = []
    cursor = question_index + 1
    total = len(file_content)

    while cursor < total:
        line = file_content[cursor]
        # A blank separator or the next question marks the end of the answer.
        if line == '\n' or 'P: ' in line:
            break
        answer_lines.append(line)
        cursor += 1

    return ''.join(answer_lines)

In [None]:
# computes all questions embeddings, and stores them along with the question position in the domain file
def domain_questions_embeddings(domain_content, model):
    """Embed every question line of the domain file.

    A line counts as a question when it contains ``'P: '``; that marker is
    removed before the text is passed to ``get_vector``.

    Returns:
        A list of ``[embedding, line_index]`` pairs, in file order, where
        ``line_index`` is the question's position in ``domain_content``.
    """
    return [
        [get_vector(line.replace('P: ', ''), model), position]
        for position, line in enumerate(domain_content)
        if 'P: ' in line
    ]

In [None]:
# returns position of question in domain most similar to posed question
def compute_similarity(all_emb, ques_emb):
    """Return the domain position of the question most similar to ``ques_emb``.

    Args:
        all_emb: Sequence of ``[embedding, position]`` entries, as built by
            ``domain_questions_embeddings``.
        ques_emb: Embedding of the posed question.

    Returns:
        The ``position`` stored with the entry whose cosine similarity to
        ``ques_emb`` is highest (the first one wins on ties). Returns 0 when
        ``all_emb`` is empty or no similarity is greater than -1.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.spatial`` has been loaded on every SciPy version.
    from scipy.spatial.distance import cosine

    best_sim = -1
    best_pos = 0

    for entry in all_emb:
        # scipy returns the cosine *distance*; similarity is its complement.
        sim = 1 - cosine(entry[0], ques_emb)

        if sim > best_sim:
            best_sim = sim
            best_pos = entry[1]

    return best_pos

In [None]:
# retrieves a file with the posed questions and BERT's answers
def bert_portuguese(questions_file_path, domain_file_path, save_file_path):
    """Answer every posed question with the answer of its closest domain question.

    Args:
        questions_file_path: Path to a text file with one question per line.
        domain_file_path: Path to a text file of question-answer pairs whose
            question lines contain ``'P: '``.
        save_file_path: Path of the text file to write. For each posed
            question it receives a ``'P: '`` line, the retrieved answer
            text, and a blank separator line.
    """
    questions = open_file(questions_file_path)
    domain_content = open_file(domain_file_path)

    model = bert_model_pt('ricardo-filho/bert-portuguese-cased-nli-assin-assin-2')

    # creates BERT vector representations of all questions in the domain file
    all_emb = domain_questions_embeddings(domain_content, model)

    final_file_content = []
    for question in questions:
        # creates BERT vector representation of the posed question
        ques_emb = get_vector(question, model)

        # position of the domain question most similar to the posed question
        best_match_pos = compute_similarity(all_emb, ques_emb)

        # The last line of the questions file may lack a trailing newline;
        # without one the answer would be glued onto the 'P: ' line.
        question_line = question if question.endswith('\n') else question + '\n'

        final_file_content.append('P: ' + question_line)
        final_file_content.append(get_answer(domain_content, best_match_pos))
        final_file_content.append('\n')

    write_file(save_file_path, final_file_content)
    print('File with posed questions and respective answers created!')

In [None]:
# NOTES

# domain_file_path - path to the file containing the domain
# must be a file containing question-answer pairs identified with 'P: ' and 'R: ', respectively
# FAQs                      
# P: question1             
# R: answer1           
# \n                        
# P: question2              
# R: answer2                
# \n                       
# must be a .txt file

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved
# must be a .txt file

In [None]:
# Replace each placeholder string below with a real .txt file path before running this cell.
domain_file_path = 'domain_file_path'
questions_file_path = 'questions_file_path'
save_file_path = 'save_file_path'

bert_portuguese(questions_file_path, domain_file_path, save_file_path)