

```
# Universal Sentence Encoder Q&A Retrieval
```



In [None]:
# receives
# .txt file containing a list of question-answer pairs identified by 'P: ' and 'R: '
# .txt file with a list of questions, one question per line

# produces
# .txt file containing the posed questions and the respective useQA answers, identified by 'P: ' and 'R: '

In [None]:
!pip install -q "tensorflow-text==2.8.*"
!pip install -q simpleneighbors[annoy]
!pip install -q nltk
!pip install -q tqdm

In [None]:
import json
import nltk
import os
import pprint
import random
import simpleneighbors
import urllib
from IPython.display import HTML, display
from tqdm.notebook import tqdm

import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer

# Download the NLTK 'punkt' resource when this cell runs.
# NOTE(review): no cell visible in this notebook calls into nltk afterwards - confirm this is still needed.
nltk.download('punkt')

In [None]:
# load model
# Loads the multilingual Universal Sentence Encoder QA module (version 3, per the URL) from TF Hub.
# `model` is a module-level name: the functions defined in later cells read
# model.signatures['question_encoder'] and model.signatures['response_encoder'] from it.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3"
model = hub.load(module_url)

In [None]:
# file management
def open_file(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: The lines of the file in order. Each line keeps its
        trailing newline, exactly as produced by ``readlines``.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    # The context manager closes the handle even when reading raises,
    # so a failed read no longer leaks an open file.
    with open(filename, 'r') as read_file:
        return read_file.readlines()

def write_file(filename, content):
    """Write a sequence of strings to a text file, replacing any previous content.

    Args:
        filename: Path of the file to create or overwrite.
        content: Iterable of strings, written back to back. No separators or
            newlines are added, so each string must carry its own line ending.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The context manager flushes and closes the handle even when a write fails.
    with open(filename, 'w') as file_write:
        file_write.writelines(content)

In [None]:
# mount google drive
# Mounts the user's Google Drive at /content/drive.
# The import comes from google.colab, so this cell only runs inside Google Colab.
# NOTE(review): presumably the file paths configured in the last cell are expected to live under this mount - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# sentences - list of (question, question + answer) tuples - from finetuning file
# questions - list of (question, '') tuples - from questions file
def create_sentences(finetuning_file):
    """Parse a question-answer file into (question, question + answer) tuples.

    A line that starts with 'P: ' opens a pair and the next line that starts
    with 'R: ' closes it. Every other line (blank lines, headings) is ignored.

    Args:
        finetuning_file: Path to a text file of 'P: ' / 'R: ' pairs.

    Returns:
        list[tuple[str, str]]: One ``(question_line, question_line + answer_line)``
        tuple per complete pair. Lines are kept verbatim, including their
        trailing newlines.
    """
    sentences = []
    # Initialised up front so an 'R: ' line that appears before any 'P: ' line
    # cannot hit an unbound variable.
    ques = ''

    for line in open_file(finetuning_file):
        # Markers are matched as a line prefix: a substring test would misread
        # an answer such as "R: ... HP: ..." as a question. A leading BOM or
        # indentation is tolerated for the test only; the stored line is untouched.
        marker = line.lstrip('\ufeff').lstrip()

        if marker.startswith('P: '):
            # Starting a new question drops any earlier question that never
            # received an answer, so it cannot leak into this pair's context.
            ques = line

        elif marker.startswith('R: ') and ques:
            # An answer with no pending question is skipped instead of being
            # indexed under an empty question.
            sentences.append((ques, ques + line))
            ques = ''

    return sentences

def create_questions(questions_file):
    """Load the questions to be posed as (question, '') tuples.

    Args:
        questions_file: Path to a text file with one question per line.

    Returns:
        list[tuple[str, str]]: One ``(question_line, '')`` tuple per non-blank
        line, in file order. Question lines are kept verbatim (including any
        trailing newline); the second element is always the empty string.
    """
    # Whitespace-only lines (e.g. a trailing blank line) are not questions;
    # skipping them avoids sending empty queries to the encoder.
    return [(line, '') for line in open_file(questions_file) if line.strip()]

In [None]:
# nearest neighbor function
def display_nearest_neighbors(index, num_results, query_text, answer_text=None):
    """Return the indexed items nearest to a query, concatenated into one string.

    Args:
        index: Object exposing ``nearest(vector, n=...)`` that yields the
            stored items closest to ``vector``.
        num_results: Number of neighbours to request from the index.
        query_text: Text of the query; it is embedded with the model's
            'question_encoder' signature.
        answer_text: Accepted for interface compatibility; not used here.

    Returns:
        str: The items returned by the index joined together in the order
        returned, or '' when the index returns nothing.
    """
    question_encoder = model.signatures['question_encoder']
    encoded = question_encoder(tf.constant([query_text]))
    # A single query was encoded, so its embedding is the first output row.
    query_vector = encoded['outputs'][0]

    neighbours = index.nearest(query_vector, n=num_results)
    return ''.join(neighbours)

In [None]:
# compute embeddings and build simple_neighbors index
def compute_embeddings(sentences, batch_size=1):
    """Embed every (response, context) pair and build a nearest-neighbour index.

    Args:
        sentences: Non-empty sequence of ``(response, context)`` string tuples.
            The response is stored as the index item; both strings are fed to
            the model's 'response_encoder' signature.
        batch_size: Number of pairs encoded per model call. Defaults to 1,
            the value that was previously hard-coded.

    Returns:
        The built ``simpleneighbors.SimpleNeighbors`` index (angular metric)
        containing one entry per input pair.

    Raises:
        ValueError: If ``sentences`` is empty or ``batch_size`` is below 1.
    """
    if not sentences:
        raise ValueError('compute_embeddings needs at least one sentence to index')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    response_encoder = model.signatures['response_encoder']

    # Encode the first pair once, only to learn the embedding dimension the index needs.
    probe = response_encoder(input=tf.constant([sentences[0][0]]), context=tf.constant([sentences[0][1]]))
    index = simpleneighbors.SimpleNeighbors(len(probe['outputs'][0]), metric='angular')

    print('Computing embeddings for %s sentences' % len(sentences))

    # Ceiling division: a final batch shorter than batch_size is still counted.
    num_batches = (len(sentences) + batch_size - 1) // batch_size
    # Slicing by start offset keeps the trailing partial batch, which the
    # zip-of-iterators idiom silently drops.
    batch_starts = range(0, len(sentences), batch_size)

    for start in tqdm(batch_starts, total=num_batches):
        batch = sentences[start:start + batch_size]
        response_batch = [response for response, _ in batch]
        context_batch = [context for _, context in batch]
        encodings = response_encoder(input=tf.constant(response_batch), context=tf.constant(context_batch))

        # Output rows are in the same order as the inputs of the batch.
        for batch_index, response in enumerate(response_batch):
            index.add_one(response, encodings['outputs'][batch_index])

    index.build()
    print('simpleneighbors index for %s sentences built.' % len(sentences))

    return index


In [None]:
# retrieve answer to given question
def get_answer(question, evaluating_file):
    """Return the line that follows ``question`` in the evaluating file.

    Args:
        question: Full text of the line to look for. The comparison is an
            exact string match, so it must include the line's trailing newline.
        evaluating_file: Path to the text file to search.

    Returns:
        str: The line immediately after the matching line, or '' when the
        question is not found or is the last line of the file. If the question
        occurs several times, the line after its last occurrence is returned.
    """
    lines = open_file(evaluating_file)
    answer = ''

    # Pairing each line with its successor means the final line is never
    # looked up past the end of the list (previously an IndexError).
    for current, following in zip(lines, lines[1:]):
        if current == question:
            # No early exit on purpose: the last occurrence wins.
            answer = following

    return answer

In [None]:
def retrieve_results(domain_file_path, questions_file_path, evaluating_file_path, save_file_path):
    """Answer every posed question from the domain file and save the results.

    The domain pairs are embedded and indexed. Each posed question is then
    matched to its nearest indexed question, that question is looked up in the
    evaluating file, and the line that follows it is taken as the answer.

    Args:
        domain_file_path: Path to the 'P: ' / 'R: ' file that is indexed.
        questions_file_path: Path to the file with one posed question per line.
        evaluating_file_path: Path to the file searched for the retrieved
            question; the line after the match is written as the answer.
        save_file_path: Path of the output file. For each question it receives
            a 'P: <question>' line, the answer line (when one was found) and a
            blank separator line.
    """
    sentences = create_sentences(domain_file_path)
    questions = create_questions(questions_file_path)
    print("%s sentences, %s questions extracted from dataset" % (len(sentences), len(questions)))

    index = compute_embeddings(sentences)
    # Only the single closest indexed question is used as the match.
    num_results = 1
    results = []

    for ques in questions:
        similar_question = display_nearest_neighbors(index, num_results, ques[0], ques[1])
        answer = get_answer(similar_question, evaluating_file_path)

        # Line endings are normalised on output: the last line of an input file
        # may lack a newline, which would otherwise glue the question, its
        # answer and the next record onto one line.
        results.append('P: ' + ques[0].rstrip('\r\n') + '\n')
        if answer:
            results.append(answer.rstrip('\r\n') + '\n')
        results.append('\n')

    write_file(save_file_path, results)
    print('File with posed questions and respective answers created!')

In [None]:
# NOTES

# domain_file_path - path to the file containing the domain, i.e. the question-answer pairs that are embedded and indexed for retrieval
# must be a file containing question-answer pairs identified with 'P: ' and 'R: ', respectively
# FAQs                      
# P: question1             
# R: answer1           
# \n                        
# P: question2              
# R: answer2                
# \n                       
# must be a .txt file

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

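# evaluating_file_path - path to the file containing all the questions to be posed and respective answers, to perform evaluation
# NOTE: the answer is found by an exact line match on the retrieved domain question, so that 'P: ' line must appear verbatim in this file, immediately followed by its 'R: ' line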
# must be a file containing question-answer pairs identified with 'P: ' and 'R: ', respectively
# FAQs                      
# P: question1             
# R: answer1           
# \n                        
# P: question2              
# R: answer2                
# \n                       
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved
# must be a .txt file

In [None]:
# Replace each placeholder string with a real path before running this cell.
# The placeholders are valid Python, so a forgotten edit surfaces as a clear
# file-not-found error when the file is opened instead of a SyntaxError.
domain_file_path = 'domain_file_path'
questions_file_path = 'questions_file_path'
evaluating_file_path = 'evaluating_file_path'
save_file_path = 'save_file_path'

retrieve_results(domain_file_path, questions_file_path, evaluating_file_path, save_file_path)