

```
# BERT - Clustering
```



In [None]:
# Inputs:
#   - a .txt file containing a list of question-answer pairs, identified by 'P: ' and 'R: '
#   - a .txt file with a list of questions, one question per line

# Output:
#   - a .txt file containing the posed questions and BERT's respective answers, identified by 'P: ' and 'R: '

In [None]:
pip install transformers

In [None]:
pip install pandas

In [None]:
pip install sklearn

In [None]:
pip install kneed

In [5]:
import transformers
from transformers import pipeline
import numpy
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import kneed
import math
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text  import TfidfVectorizer
from numpy import savetxt
from numpy import loadtxt

In [6]:
# file management
def open_file(filename, encoding='utf-8'):
    """Read a text file and return its lines.

    Args:
        filename: path of the file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            accented characters are decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one string per line; line terminators are kept.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid in ``encoding``.
    """
    # The context manager closes the handle even if readlines() raises.
    with open(filename, 'r', encoding=encoding) as read_file:
        return read_file.readlines()

def write_file(filename, content, encoding='utf-8'):
    """Write an iterable of strings to a text file, replacing any existing content.

    Args:
        filename: path of the file to write (created or truncated).
        content: iterable of strings; they are written back to back, so each
            item must carry its own line terminator if one is wanted.
        encoding: text encoding used for the file. Defaults to UTF-8 so
            accented characters are written the same way on every platform.

    Raises:
        OSError: if the file cannot be opened for writing.
    """
    # The context manager flushes and closes the handle even if writelines() raises.
    with open(filename, 'w', encoding=encoding) as file_write:
        file_write.writelines(content)

In [13]:
# context management
def format_context(context_list):
    """Concatenate the pieces of a context into a single string.

    Args:
        context_list: iterable of strings (question lines, answer text,
            separators) in the order they should appear.

    Returns:
        All items joined with no separator; '' for an empty input.
    """
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string on every iteration.
    return ''.join(context_list)

def get_context(context_questions, context_index):
    """Build a context string from selected entries of ``context_questions``.

    Args:
        context_questions: indexable collection of strings.
        context_index: iterable of indices into ``context_questions``; entries
            are concatenated in the order the indices are given.

    Returns:
        The selected entries joined with no separator; '' if no index is given.

    Raises:
        IndexError: if an index is out of range for ``context_questions``.
    """
    return ''.join(context_questions[index] for index in context_index)

In [None]:
# mount google drive
# Mounts the user's Google Drive at /content/drive so the data files can be
# addressed by path from later cells.
# NOTE(review): google.colab is presumably only importable inside Google Colab,
# so this cell is expected to fail in other environments - confirm before
# running the notebook locally.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# BERT - feature extraction pipeline
def bert_model_fe(model_name, pipeline_name):
    """Create the pipeline used to turn text into feature vectors.

    Args:
        model_name: identifier of the model to load, forwarded as ``model=``.
        pipeline_name: task name, forwarded as the first positional argument
            of ``pipeline`` (the notebook passes "feature-extraction").

    Returns:
        The object returned by ``pipeline(pipeline_name, model=model_name)``.
    """
    return pipeline(pipeline_name, model=model_name)

def get_vector(question, pipe_feat_extraction):
    """Return the vector used to represent ``question``.

    Args:
        question: text to embed.
        pipe_feat_extraction: callable taking the text and returning a nested
            sequence; only element ``[0][0]`` of its result is used.

    Returns:
        ``pipe_feat_extraction(question)[0][0]``.
        NOTE(review): presumably this is the embedding of the first token of
        the first (only) sequence - confirm against the pipeline's output shape.
    """
    features = pipe_feat_extraction(question)
    first_sequence = features[0]
    return first_sequence[0]

# Build the feature-extraction pipeline once, at cell-run time, and keep it in
# a module-level variable so it can be passed to the functions defined below.
# NOTE(review): creating the pipeline presumably downloads the model weights on
# first use, which can be slow - confirm.
pipe_feat_extraction = bert_model_fe('neuralmind/bert-large-portuguese-cased', "feature-extraction")
print('Feature Extraction Model Downloaded')

In [None]:
# BERT - question answering pipeline
def bert_model_qa(model_name, pipeline_name):
    """Create the pipeline used to answer questions from a context.

    Args:
        model_name: identifier of the model to load, forwarded as ``model=``.
        pipeline_name: task name, forwarded as the first positional argument
            of ``pipeline`` (the notebook passes "question-answering").

    Returns:
        The object returned by ``pipeline(pipeline_name, model=model_name)``.
    """
    return pipeline(pipeline_name, model=model_name)

def get_answer_qa(context, question, pipe_ques_answering):
    """Ask the question-answering pipeline for an answer.

    Args:
        context: text in which the answer is searched.
        question: question to answer.
        pipe_ques_answering: callable accepting ``question=`` and ``context=``
            keyword arguments.

    Returns:
        Whatever the pipeline returns, unchanged. The caller in this notebook
        reads its ``'answer'`` key.
    """
    return pipe_ques_answering(context=context, question=question)

# Build the question-answering pipeline once, at cell-run time, and keep it in
# a module-level variable so it can be passed to the functions defined below.
# NOTE(review): creating the pipeline presumably downloads the model weights on
# first use, which can be slow - confirm.
pipe_ques_answering = bert_model_qa('pierreguillou/bert-base-cased-squad-v1.1-portuguese', "question-answering")
print('Question Answering Model Downloaded')

In [9]:
# returns the answer corresponding to the question in position question_index of question-answer pairs file
def get_answer(file_content, question_index):
    """Collect the answer lines that follow a question in the FAQ file.

    Starting at the line after ``question_index``, lines are accumulated until
    a line that is exactly '\\n' or a line containing 'P: ' is met, or the
    file ends.

    Args:
        file_content: list of lines of the question-answer pairs file.
        question_index: position of the question line in ``file_content``.

    Returns:
        The accumulated lines joined into one string (line terminators and any
        'R: ' prefix are kept); '' if nothing follows the question.
    """
    answer_lines = []
    cursor = question_index + 1
    total = len(file_content)

    while cursor < total:
        line = file_content[cursor]
        # A blank line or the next question marks the end of this answer.
        if line == '\n' or 'P: ' in line:
            break
        answer_lines.append(line)
        cursor += 1

    return ''.join(answer_lines)

In [10]:
# returns clusters and respective centroids after performing k-means clustering
def km_cluster(data, max_k=99, random_state=None):
    """Cluster ``data`` with k-means, choosing the number of clusters by the elbow method.

    K-means is fitted for k = 1 .. min(max_k, number of samples) and the
    inertia (sum of squared distances) of each fit is recorded. The elbow of
    that curve, as located by ``kneed.KneeLocator``, is used as the final k.

    Args:
        data: sequence of equally sized numeric vectors, one per sample.
        max_k: largest number of clusters to try. The default of 99 matches
            the previously hard-coded search range.
        random_state: forwarded to ``KMeans`` so runs can be made
            reproducible; ``None`` keeps the library's default behaviour.

    Returns:
        A tuple ``(identified_clusters, centroids)``: the cluster label of
        each sample and the cluster centres of the final model.

    Raises:
        ValueError: if ``data`` is empty or ``max_k`` is smaller than 1.
    """
    n_samples = len(data)
    if n_samples == 0:
        raise ValueError('km_cluster requires at least one sample.')
    if max_k < 1:
        raise ValueError('max_k must be at least 1.')

    # KMeans rejects n_clusters greater than the number of samples, so the
    # search range is capped; otherwise small FAQ files crash the search.
    candidate_ks = list(range(1, min(max_k, n_samples) + 1))

    # find optimal number of clusters
    sum_squared_distances = []
    for num_clusters in candidate_ks:
        kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
        kmeans.fit(data)
        sum_squared_distances.append(kmeans.inertia_)

    elbow = None
    # An elbow needs at least three points on the curve to exist.
    if len(candidate_ks) >= 3:
        best_k = kneed.KneeLocator(candidate_ks, sum_squared_distances, curve="convex", direction="decreasing")
        elbow = best_k.elbow

    if elbow is None:
        # No elbow could be located: fall back to the sqrt(n / 2) rule of
        # thumb instead of passing None to KMeans.
        elbow = max(1, int(round(math.sqrt(n_samples / 2))))

    n_clusters = min(int(elbow), len(candidate_ks))
    print('Creating ' + str(n_clusters) + ' clusters.')

    # perform k-means; fit_predict fits the model and returns the labels in
    # one step, so no separate fit() call is needed
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    identified_clusters = kmeans.fit_predict(data)
    centroids = kmeans.cluster_centers_

    return identified_clusters, centroids

In [11]:
# creates files with clusters and centroids from k-means clustering
def bert_clustering(domain_file_path, clustering_file_path, centroids_file_path, pipe_feat_extraction):
    """Embed every FAQ question, cluster the embeddings and save the result.

    Every line of the domain file containing 'P: ' is treated as a question:
    the marker is removed, the text is embedded with ``get_vector`` and the
    embeddings are clustered with ``km_cluster``.

    Args:
        domain_file_path: path of the question-answer pairs file.
        clustering_file_path: path where the cluster label of each question is
            written (one value per line, in question order).
        centroids_file_path: path where the centroids are written
            (one comma-separated row per cluster).
        pipe_feat_extraction: feature-extraction pipeline passed to ``get_vector``.

    Raises:
        ValueError: if the domain file contains no line marked with 'P: '.
    """
    domain_content = open_file(domain_file_path)

    data = [
        get_vector(line.replace('P: ', ''), pipe_feat_extraction)
        for line in domain_content
        if 'P: ' in line
    ]

    # Fail early with a clear message instead of an obscure error from the
    # clustering step when there is nothing to cluster.
    if not data:
        raise ValueError("No questions marked with 'P: ' were found in " + str(domain_file_path))

    # k-means clustering, returns clusters and centroids
    identified_clusters, centroids = km_cluster(data)

    # save clusters and centroids as comma-delimited text
    savetxt(clustering_file_path, identified_clusters, delimiter=',')
    savetxt(centroids_file_path, centroids, delimiter=',')

    print('Clusters Formed and Centroids Calculated!')

In [14]:
# retrieves a file with the posed questions and BERT's answers
def bert_clus_answers(clustering_file_path, centroids_file_path, domain_file_path, questions_file_path, saving_file_path, pipe_feat_extraction, pipe_ques_answering):
    """Answer every posed question using its closest FAQ cluster as context.

    Steps:
      1. Cluster the FAQ questions (``bert_clustering``), which writes the
         labels and centroids to the given files, then load them back.
      2. For each posed question, embed it and pick the centroid with the
         highest cosine similarity.
      3. Build a context from the question-answer pairs of that cluster and
         ask the question-answering pipeline for an answer.
      4. Write all questions and answers to ``saving_file_path`` as
         'P: <question>' / 'R: <answer>' pairs separated by a blank line.

    Args:
        clustering_file_path: file where the cluster labels are saved/loaded.
        centroids_file_path: file where the centroids are saved/loaded.
        domain_file_path: question-answer pairs file ('P: ' / 'R: ' markers).
        questions_file_path: file with one posed question per line.
        saving_file_path: output file for the questions and retrieved answers.
        pipe_feat_extraction: feature-extraction pipeline used for embeddings.
        pipe_ques_answering: question-answering pipeline; its result must
            expose an ``'answer'`` key.
    """
    questions = open_file(questions_file_path)
    faqs_list = open_file(domain_file_path)

    # creates files with clusters and centroids from k-means clustering
    bert_clustering(domain_file_path, clustering_file_path, centroids_file_path, pipe_feat_extraction)

    # gets files with clusters and centroids from k-means clustering.
    # ndmin keeps the arrays iterable row by row even when the file holds a
    # single label or a single centroid (loadtxt would otherwise squeeze them).
    identified_clusters = loadtxt(clustering_file_path, delimiter=',', ndmin=1)
    centroids = loadtxt(centroids_file_path, delimiter=',', ndmin=2)

    final_file_content = []
    # The context only depends on the chosen cluster, so build it once per cluster.
    context_by_cluster = {}

    for raw_question in questions:
        # Drop the line terminator so the output layout does not depend on
        # whether the last line of the questions file ends with a newline.
        question = raw_question.rstrip('\r\n')
        if not question.strip():
            # Blank lines are not questions.
            continue

        # create question embedding and find the closest centroid
        ques_emb = get_vector(question, pipe_feat_extraction)
        centroid_id = _closest_centroid(ques_emb, centroids)

        if centroid_id not in context_by_cluster:
            context_by_cluster[centroid_id] = _build_cluster_context(faqs_list, identified_clusters, centroid_id)
        str_context = context_by_cluster[centroid_id]

        # get BERT-QA answer, with cluster as context
        if str_context.strip():
            answer_text = get_answer_qa(str_context, question, pipe_ques_answering)['answer']
        else:
            # No context could be assembled for this question; record an
            # empty answer rather than calling the pipeline without context.
            answer_text = ''

        # final file: posed questions and retrieved answers, with 'P: ' and 'R: ' identifiers
        final_file_content.append('P: ' + question + '\n')
        final_file_content.append('R: ' + answer_text)
        final_file_content.append('\n\n')

    write_file(saving_file_path, final_file_content)
    print('File with posed questions and respective answers created!')


def _closest_centroid(ques_emb, centroids):
    """Return the index of the centroid most cosine-similar to ``ques_emb`` (-1 if none)."""
    # Imported here so the submodule is guaranteed to be loaded; a bare
    # ``import scipy`` does not load scipy.spatial on every SciPy version.
    from scipy.spatial.distance import cosine

    best_id = -1
    # Start below any possible cosine so a centroid is still selected when
    # every similarity is negative.
    best_cos = float('-inf')
    for j in range(len(centroids)):
        aux_cos = 1 - cosine(centroids[j], ques_emb)
        # '>=' keeps the later centroid on ties.
        if aux_cos >= best_cos:
            best_cos = aux_cos
            best_id = j
    return best_id


def _build_cluster_context(faqs_list, identified_clusters, centroid_id):
    """Concatenate the question-answer pairs whose question belongs to cluster ``centroid_id``."""
    # Labels are stored as floats in the text file; convert back to ints.
    member_positions = {
        position
        for position, label in enumerate(identified_clusters)
        if int(round(float(label))) == centroid_id
    }

    context = []
    question_pos = 0
    for n, line in enumerate(faqs_list):
        if 'P: ' in line:
            # question_pos counts questions in file order, matching the order
            # in which they were embedded and clustered.
            if question_pos in member_positions:
                context.append(line)
                context.append(get_answer(faqs_list, n))
                context.append('\n')
            question_pos += 1

    return format_context(context)

In [None]:
# NOTES

# domain_file_path - path to the file containing the domain
# must be a file containing question-answer pairs identified with 'P: ' and 'R: ', respectively
# FAQs                      
# P: question1             
# R: answer1           
# \n                        
# P: question2              
# R: answer2                
# \n                       
# must be a .txt file

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved
# must be a .txt file

# clustering_file_path - path to the file where the clustering results are to be saved
# must be a .txt file

# centroids_file_path - path to the file where the centroids are to be saved
# must be a .txt file

In [None]:
# Replace every placeholder string below with a real path before running this
# cell (see the NOTES cell for the expected file formats).
domain_file_path = 'domain_file_path'
questions_file_path = 'questions_file_path'
save_file_path = 'save_file_path'

clustering_file_path = 'clustering_file_path'
centroids_file_path = 'centroids_file_path'

bert_clus_answers(clustering_file_path, centroids_file_path, domain_file_path, questions_file_path, save_file_path, pipe_feat_extraction, pipe_ques_answering)