## Dependencias

In [2]:
%%capture
!pip install huggingface_hub datasets
!pip install langchain
!pip install openai
!pip install nltk
!pip install chromadb
!pip install unstructured
!pip install pdf2image
!pip install pdfminer
!pip install llmsherpa
!pip install tiktoken
!pip install sentence_transformers

In [None]:
import os
from getpass import getpass

# Read the key with getpass so it is not echoed into the cell output, where it
# would be saved with the notebook and leaked when the file is shared.
api_key = getpass("Digite a chave da API da Open AI: ")

# Later cells build the OpenAI clients without passing a key explicitly, so the
# key is exported through the environment.
os.environ['OPENAI_API_KEY'] = api_key

## Carregar dados

In [None]:
from langchain.document_loaders import WebBaseLoader

# Load the page at this URL; `docs` is the input to both text splitters used
# further down.
loader = WebBaseLoader("https://www.pg.unicamp.br/norma/31594/0")
docs = loader.load()

## split RecursiveCharacterTextSplitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# First chunking strategy: recursive character splitter with chunk_size=1024
# and chunk_overlap=32.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32)
texts_RecursiveCharacterTextSplitter = text_splitter.split_documents(docs)

# Keep only the text of each chunk; these strings are what gets embedded and
# stored later.
simple_splited_texts = [x.page_content for x in texts_RecursiveCharacterTextSplitter]

## Split dos textos utilizando nltk

In [None]:
import nltk
# Download the 'punkt' resource (presumably what the NLTK-based splitter below
# relies on -- confirm against the installed nltk version).
nltk.download('punkt')

In [None]:
from langchain.text_splitter import NLTKTextSplitter

# Second chunking strategy. NOTE: this rebinds `text_splitter`, replacing the
# recursive splitter created earlier.
text_splitter = NLTKTextSplitter(language = "portuguese", chunk_size=512, chunk_overlap  = 128)

In [None]:
# Split the loaded documents with the NLTK splitter.
texts_NLTKTextSplitter = text_splitter.split_documents(docs)

# Plain chunk strings; reused for both the ada and the e5 embeddings.
nltk_splited_texts = [x.page_content for x in texts_NLTKTextSplitter]

## split utilizando a API sherpa

In [None]:
from llmsherpa.readers import LayoutPDFReader

# Third chunking strategy: the PDF is parsed by the llmsherpa service at this
# endpoint.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# Despite the name, this is a local file path; the PDF must already be present
# at this location.
pdf_url = "/content/Procuradoria Geral - Normas.pdf"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url)

In [None]:
# One string per chunk reported by the parsed document, via to_context_text().
sherpa_texts = [x.to_context_text() for x in doc.chunks()]

## Inicializar banco

In [None]:
import chromadb

# Persistent client rooted at a Google Drive path (assumes Drive is already
# mounted at /content/drive -- the mount is not shown in this notebook).
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/NeuralmindChatBot/banco")

In [None]:
# One collection per (embedding model, splitter) combination, all using cosine
# distance. get_or_create_collection keeps this cell re-runnable: the client is
# persistent, so the collections outlive the session, and create_collection
# raises when the name already exists.
collection_ada_RecursiveCharacterTextSplitter = chroma_client.get_or_create_collection(
    name="textos_ada_RecursiveCharacterTextSplitter",
    metadata={"hnsw:space": "cosine"}
)

collection_ada_nltk = chroma_client.get_or_create_collection(
    name="textos_ada_nltk",
    metadata={"hnsw:space": "cosine"}
)

collection_ada_sherpa = chroma_client.get_or_create_collection(
    name="textos_ada_sherpa",
    metadata={"hnsw:space": "cosine"}
)

collection_e5_large = chroma_client.get_or_create_collection(
    name="textos_e5_large_nltk",
    metadata={"hnsw:space": "cosine"}
)

In [None]:
# Fetch handles to the four collections by name (presumably for sessions where
# the collections already exist in the persistent store).
collection_ada_RecursiveCharacterTextSplitter = chroma_client.get_collection(name="textos_ada_RecursiveCharacterTextSplitter")

collection_ada_nltk = chroma_client.get_collection(name="textos_ada_nltk")

collection_ada_sherpa = chroma_client.get_collection(name="textos_ada_sherpa")

collection_e5_large = chroma_client.get_collection(name="textos_e5_large_nltk")

## Transformar textos em embeddings

### e5-large

In [None]:
from sentence_transformers import SentenceTransformer
# Local embedding model, identified by its model name 'intfloat/multilingual-e5-large'.
multilingual_e5_large = SentenceTransformer('intfloat/multilingual-e5-large')

In [None]:
# Prefix every NLTK chunk with "passage: "; queries get the matching "query: "
# prefix in CustomRetriver.embed_query.
e5_texts = [f"passage: {text}" for text in nltk_splited_texts]

In [None]:
# Encode all prefixed passages with the e5 model.
embeddings_e5 = multilingual_e5_large.encode(e5_texts)

In [None]:
# Store the e5 embeddings. The stored documents keep the "passage: " prefix,
# which the evaluation functions strip again after retrieval. Ids are the
# positions rendered as strings.
collection_e5_large.add(
    embeddings = embeddings_e5.tolist(),
    documents = e5_texts,
    ids=[str(i) for i in range(len(embeddings_e5))]
)

### Ada - OpenAI

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# The embedding model is selected through the `model` field. `model_kwargs` is
# meant for extra request parameters, so a "model_name" entry placed there does
# not choose the model.
ada_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [None]:
# ada embeddings for the recursive-splitter chunks.
embeddings_ada = ada_model.embed_documents(simple_splited_texts)

In [None]:
# ada embeddings for the NLTK-splitter chunks.
embeddings_ada_nltk = ada_model.embed_documents(nltk_splited_texts)

In [None]:
# ada embeddings for the llmsherpa chunks.
embeddings_ada_sherpa = ada_model.embed_documents(sherpa_texts)

In [None]:
# Store the recursive-splitter chunks with their ada embeddings; ids are the
# chunk positions rendered as strings.
collection_ada_RecursiveCharacterTextSplitter.add(
    embeddings = embeddings_ada,
    documents = simple_splited_texts,
    ids=[str(i) for i in range(len(simple_splited_texts))]
)

In [None]:
# Store the NLTK-splitter chunks with their ada embeddings; ids are the chunk
# positions rendered as strings.
collection_ada_nltk.add(
    embeddings = embeddings_ada_nltk,
    documents = nltk_splited_texts,
    ids=[str(i) for i in range(len(nltk_splited_texts))]
)

In [None]:
# Store the llmsherpa chunks with their ada embeddings; ids are the positions
# rendered as strings.
collection_ada_sherpa.add(
    embeddings = embeddings_ada_sherpa,
    documents = sherpa_texts,
    ids=[str(i) for i in range(len(embeddings_ada_sherpa))]
)

## Retriever

In [None]:
class CustomRetriver:
  """Top-k text retriever over a vector collection.

  Pairs a collection with the embedding model used to embed queries, and hides
  the difference between the two supported model interfaces behind
  `embed_query`.
  """

  def __init__(self, collection, embeddingModel, modelName):
    """Store the collection, the embedding model and the model kind.

    Args:
      collection: object exposing `query(query_embeddings=..., n_results=...)`
        that returns a mapping with a 'documents' entry.
      embeddingModel: model exposing `encode(list_of_texts)` when modelName is
        "e5_large", or `embed_query(text)` when modelName is "open_ia".
      modelName: "e5_large" or "open_ia"; selects how queries are embedded.
    """
    self.collection = collection
    self.modelName = modelName
    self.model = embeddingModel

  def embed_query(self, query):
    """Return the embedding of `query` as computed by the configured model.

    Raises:
      ValueError: if `modelName` is not one of the supported kinds. Without
        this check the method would return None and the failure would only
        surface later, inside the collection query.
    """
    if self.modelName == "e5_large":
      # The query is prefixed with "query: " before encoding.
      encoded = self.model.encode([f"query: {query}"])
      return encoded[0].tolist()
    if self.modelName == "open_ia":
      return self.model.embed_query(query)
    raise ValueError(
      f"Unsupported modelName {self.modelName!r}; expected 'e5_large' or 'open_ia'"
    )

  def query_topK(self, query, n_results):
    """Return the documents of the `n_results` nearest entries for `query`.

    Returns:
      A new list with the documents reported for the single submitted query,
      in the order given by the collection.
    """
    embeddedQuery = self.embed_query(query)

    results = self.collection.query(
      query_embeddings = [embeddedQuery],
      n_results = n_results
    )

    # Only one query is submitted, so only the first result row is relevant.
    return list(results['documents'][0])

### Inicializa os retrievers

In [None]:
# Retriever over the NLTK-split collection, queried with ada embeddings.
ada_retriever_nltk = CustomRetriver(collection_ada_nltk, ada_model, "open_ia")

In [None]:
# Retriever over the llmsherpa collection, queried with ada embeddings.
# NOTE: the variable is spelled "retriver", unlike its siblings.
ada_retriver_sherpa = CustomRetriver(collection_ada_sherpa, ada_model, "open_ia")

In [None]:
# Retriever over the recursive-splitter collection, queried with ada embeddings.
ada_retriever_recursive = CustomRetriver(collection_ada_RecursiveCharacterTextSplitter, ada_model, "open_ia")

In [None]:
# Retriever over the e5 collection; "e5_large" makes queries get the "query: " prefix.
e5_retriever = CustomRetriver(collection_e5_large, multilingual_e5_large, "e5_large")

## Avaliação do sistema de retrieval

### Métricas utilizadas para avaliar o retrieval

In [None]:
def calculate_map(relevant_texts, retrieved_texts):
  """Average precision of one ranked result list.

  A retrieved text is a hit when it contains, case-insensitively, at least one
  of the relevant excerpts. Precision is sampled at every rank where a hit
  occurs and the samples are averaged.

  Args:
    relevant_texts: excerpts that mark a retrieved text as relevant.
    retrieved_texts: retrieved texts, best ranked first.

  Returns:
    The mean of the sampled precisions, or 0 when nothing was a hit.
  """
  hits_so_far = 0
  sampled = []

  for rank, candidate in enumerate(retrieved_texts, start=1):
    is_hit = any(trecho.lower() in candidate.lower() for trecho in relevant_texts)
    if not is_hit:
      continue
    hits_so_far += 1
    sampled.append(hits_so_far / rank)

  if len(sampled) == 0:
    return 0

  return sum(sampled) / len(sampled)

In [None]:
import math

def calculate_ndcg(relevant_texts, retrieved_texts):
    """Normalised discounted cumulative gain of one ranked result list.

    Relevance is binary: a retrieved text scores 1 when it contains,
    case-insensitively, at least one of the relevant excerpts, else 0.
    The ideal DCG is computed from the same relevance values sorted in
    descending order, so the score measures how well the retrieved texts are
    ordered, not how many relevant excerpts were missed.

    Args:
        relevant_texts: excerpts that mark a retrieved text as relevant.
        retrieved_texts: retrieved texts, best ranked first.

    Returns:
        DCG / IDCG, or 0 when no retrieved text is relevant.
    """
    def calculate_dcg(relevances):
        # Rank r (1-based) is discounted by log2(r + 1).
        return sum(rel / math.log2(rank + 1) for rank, rel in enumerate(relevances, start=1))

    def calculate_idcg(relevances):
        return calculate_dcg(sorted(relevances, reverse=True))

    # Both sides are lowercased so the match is case-insensitive, in line with
    # the other retrieval metrics.
    needles = [trecho.lower() for trecho in relevant_texts]
    relevances = []
    for text in retrieved_texts:
        lowered = text.lower()
        relevances.append(1 if any(needle in lowered for needle in needles) else 0)

    dcg = calculate_dcg(relevances)
    idcg = calculate_idcg(relevances)

    if idcg == 0:
        return 0

    return dcg / idcg


In [None]:
def calculate_recall_at_k(relevant_texts, retrieved_texts, k):
    """Fraction of the relevant excerpts found within the top-k retrieved texts.

    An excerpt counts as found when it occurs, case-insensitively, inside at
    least one of the first `k` retrieved texts. Each excerpt is counted once, so
    the result stays within [0, 1] even when several retrieved chunks contain
    the same excerpt (e.g. overlapping chunks).

    Args:
        relevant_texts: excerpts expected to be retrieved.
        retrieved_texts: retrieved texts, best ranked first.
        k: number of top-ranked retrieved texts to inspect.

    Returns:
        found / len(relevant_texts), or 0 when `relevant_texts` is empty.
    """
    if len(relevant_texts) == 0:
        return 0

    top_k = [text.lower() for text in retrieved_texts[:k]]

    # All relevant excerpts take part in the match; only the retrieved list is
    # cut at k.
    found = sum(
        1
        for trecho in relevant_texts
        if any(trecho.lower() in text for text in top_k)
    )

    return found / len(relevant_texts)

In [None]:
def calculate_precision_at_k(relevant_texts, retrieved_texts, k):
    """Fraction of the top-k retrieved texts that are relevant.

    A retrieved text is relevant when it contains, case-insensitively, at least
    one of the relevant excerpts. The denominator is always `k`, so retrieving
    fewer than `k` texts lowers the score.

    Args:
        relevant_texts: excerpts that mark a retrieved text as relevant.
        retrieved_texts: retrieved texts, best ranked first.
        k: number of top-ranked retrieved texts to inspect.

    Returns:
        hits / k, or 0 when `k` is not positive.
    """
    if k <= 0:
        return 0

    # Every relevant excerpt is considered; only the retrieved list is cut at k.
    needles = [trecho.lower() for trecho in relevant_texts]

    hits = 0
    for text in retrieved_texts[:k]:
        lowered = text.lower()
        if any(needle in lowered for needle in needles):
            hits += 1

    return hits / k

In [None]:
def calculate_f1_score(precision, recall):
    """Harmonic mean of `precision` and `recall`.

    Returns 0 when their sum is not positive, which avoids dividing by zero.
    """
    total = precision + recall
    if total > 0:
        return 2 * (precision * recall) / total
    return 0

### Função de avaliação do retrieval

In [None]:
def media(resultado, nome):
  """Format the arithmetic mean of `resultado[nome]` as a "<nome>: <mean>" line.

  The returned string ends with a newline. An empty list raises
  ZeroDivisionError.
  """
  valores = resultado[nome]
  mean_value = sum(valores) / len(valores)
  return "{}: {}\n".format(nome, mean_value)

In [None]:
def salvar_retrieval_results(resultado, nome_exp):
  """Append the mean of each retrieval metric to the retrieval report file.

  Args:
    resultado: mapping with the lists "maps", "ndcg", "precision_at_three",
      "recall_at_three" and "f1_score".
    nome_exp: experiment label written as the heading of the entry.
  """
  metric_names = ("maps", "ndcg", "precision_at_three", "recall_at_three", "f1_score")

  with open("/content/drive/MyDrive/NeuralmindChatBot/analises/retrieval.txt", "a") as file:
    pieces = [f"{nome_exp}: \n\n"]
    for metric in metric_names:
      pieces.append(media(resultado, metric))
    pieces.append("\n\n")

    file.write("".join(pieces))

In [None]:
import csv
import time

def evaluate_retrieval(retrieval_model, dataset_path, model_name, n_results=5, request_delay=22):
    """Run every dataset question through a retriever and score the results.

    Each non-empty CSV row is read as: column 0 = question, column 1 = Python
    literal (e.g. a list of strings) with the excerpts considered relevant.
    Further columns are ignored here.

    Args:
        retrieval_model: object exposing `query_topK(question, n_results)`.
        dataset_path: path of the CSV validation set.
        model_name: when equal to "e5_large", the "passage: " prefix is removed
            from the retrieved texts before scoring.
        n_results: number of texts requested from the retriever per question.
        request_delay: seconds slept before each query (presumably to stay
            under an API rate limit; 0 disables the pause).

    Returns:
        A dict of per-question lists under the keys "maps", "ndcg",
        "precision_at_three", "recall_at_three" and "f1_score".
    """
    import ast  # local import: only needed to parse the relevant-texts column

    top_k = 3  # cut-off used by the precision/recall/F1 entries ("..._at_three")

    maps = []
    ndcg = []
    precision_at_three = []
    recall_at_three = []
    f1_score = []

    # newline='' is what the csv module expects so that quoted fields containing
    # line breaks are parsed correctly.
    with open(dataset_path, 'r', newline='') as file:
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for blank lines; skip them.
                continue

            question = row[0]
            # Parsed with ast.literal_eval instead of eval: the column only
            # needs to hold a literal, and eval would execute arbitrary code
            # found in the file.
            relevant_texts = ast.literal_eval(row[1])

            time.sleep(request_delay)

            retrieved_texts = retrieval_model.query_topK(question, n_results)

            if model_name == "e5_large":
                retrieved_texts = [text.replace("passage: ", "", 1) for text in retrieved_texts]

            # Ranking metrics over the whole retrieved list.
            maps.append(calculate_map(relevant_texts, retrieved_texts))
            ndcg.append(calculate_ndcg(relevant_texts, retrieved_texts))

            # Set metrics over the first `top_k` retrieved texts.
            precision = calculate_precision_at_k(relevant_texts, retrieved_texts, top_k)
            recall = calculate_recall_at_k(relevant_texts, retrieved_texts, top_k)
            precision_at_three.append(precision)
            recall_at_three.append(recall)
            f1_score.append(calculate_f1_score(precision, recall))

    return {
        "maps": maps,
        "ndcg": ndcg,
        "precision_at_three": precision_at_three,
        "recall_at_three": recall_at_three,
        "f1_score": f1_score
    }

### calculando os resultados do retrieval

Os passos a seguir devem ser feitos para todos os retrievers

In [None]:
# Example run for one retriever; repeat with the other retrievers and the
# matching model name.
resultado = evaluate_retrieval(ada_retriever_nltk, "/content/validacaoFinal.csv", "open_ia")

In [None]:
# Append the averaged metrics of this run to the retrieval report file.
salvar_retrieval_results(resultado, "embedding ada_002 com Nltk e dataset final")

## Avaliação das respostas

### Métricas para avaliar respostas

In [None]:
import nltk
# Same download as in the splitting section; repeated here so that this section
# can be run without it (word_tokenize is used below).
nltk.download('punkt')

In [None]:
from nltk import word_tokenize

def calculate_precision_recall_f1_tokens(predicted_answer, ground_truth):
    """Token-overlap precision, recall and F1 between two answers.

    Both strings are lowercased and tokenised; the comparison is made on the
    sets of distinct tokens, so repeated tokens count once.

    Args:
        predicted_answer: answer produced by the model.
        ground_truth: reference answer.

    Returns:
        A (precision, recall, f1) tuple; each value is 0 when its denominator
        would be zero.
    """
    # The texts are Portuguese, so the Portuguese tokenizer model is requested,
    # matching the language passed to the NLTK text splitter.
    predicted_tokens = set(word_tokenize(predicted_answer.lower(), language="portuguese"))
    ground_truth_tokens = set(word_tokenize(ground_truth.lower(), language="portuguese"))

    TP = len(predicted_tokens & ground_truth_tokens)   # tokens present in both
    FP = len(predicted_tokens - ground_truth_tokens)   # predicted only
    FN = len(ground_truth_tokens - predicted_tokens)   # reference only

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    return precision, recall, calculate_f1_score(precision, recall)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
# Cross-encoder used by similarity() below. NOTE: `model` is a module-level
# name that similarity() reads as a global.
model = CrossEncoder('cross-encoder/stsb-roberta-large')

In [None]:
def similarity(predicted, expected):
  """Score the (predicted, expected) pair with the module-level cross-encoder `model`.

  Returns whatever `model.predict` yields for the two-element list.
  """
  return model.predict([predicted, expected])

### Função de avaliação de respostas

#### Instanciando o LLM

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model that generates the answers: gpt-3.5-turbo, max_tokens=250,
# temperature=0.1.
chat = ChatOpenAI(max_tokens=250, model = "gpt-3.5-turbo", temperature=0.1)

In [None]:
from langchain.prompts import PromptTemplate

# Template with two placeholders: {context} receives the retrieved texts and
# {question} the user question. The instructions are written in English but
# request an answer in Portuguese.
prompt = PromptTemplate.from_template(
    """Considering these texts as context: {context}.
    Give me a brilliant answer to the following question: {question}

    Make sure to answer using Portuguese language"""
)

In [None]:
# Pipe the filled-in prompt into the chat model; invoked with a dict holding
# "context" and "question".
chain = prompt | chat

#### Função de avaliação das respostas

In [None]:
import csv
import time

def evaluate_question_answering(chain, retriever, dataset_path, model_name, n_results=10, request_delay=21):
  """Answer every dataset question with the RAG chain and score the answers.

  Each non-empty CSV row is read as: column 0 = question, column 2 = reference
  (human) answer. For each question the retrieved texts are joined with
  newlines and passed to the chain as context.

  Args:
    chain: object exposing `invoke({"context": ..., "question": ...})` whose
      result has a `.content` string.
    retriever: object exposing `query_topK(question, n_results)`.
    dataset_path: path of the CSV validation set.
    model_name: when equal to "e5_large", the "passage: " prefix is removed
      from the retrieved texts before they are used as context.
    n_results: number of texts requested from the retriever per question.
    request_delay: seconds slept after each chain call (presumably to stay
      under an API rate limit; 0 disables the pause).

  Returns:
    A dict of per-question lists under the keys "precision_list",
    "recall_list", "f1_score_list", "cross_encoder_similarity" and "answers".
  """
  predictions = []

  precision_list = []
  recall_list = []
  f1_score_list = []
  cross_encoder_similarity = []

  # newline='' is what the csv module expects so that quoted fields containing
  # line breaks are parsed correctly.
  with open(dataset_path, 'r', newline='') as file:
    for row in csv.reader(file):
      if not row:
        # csv.reader yields [] for blank lines; skip them.
        continue

      question = row[0]
      human_answer = row[2]

      retrieved_texts = retriever.query_topK(question, n_results)

      if model_name == "e5_large":
        retrieved_texts = [text.replace("passage: ", "", 1) for text in retrieved_texts]

      context = "\n".join(retrieved_texts)

      response = chain.invoke({"context": context, "question": question})

      time.sleep(request_delay)

      predicted_answer = response.content

      precision, recall, f1_score = calculate_precision_recall_f1_tokens(predicted_answer, human_answer)

      precision_list.append(precision)
      recall_list.append(recall)
      f1_score_list.append(f1_score)

      predictions.append(predicted_answer)

      cross_encoder_similarity.append(similarity(predicted_answer, human_answer))

  return {
      "precision_list": precision_list,
      "recall_list": recall_list,
      "f1_score_list": f1_score_list,
      "cross_encoder_similarity": cross_encoder_similarity,
      "answers":  predictions
  }

### calculando resultados das respostas

In [None]:
# Example run: gpt-3.5-turbo chain with the ada/NLTK retriever on the final
# validation set.
qa_evaluation = evaluate_question_answering(chain, ada_retriever_nltk, "/content/validacaoFinal.csv", "open_ia")

In [None]:
def salvar_qa_results(resultado, nome_exp):
  """Append the averaged answer-quality metrics to the QA report file.

  Args:
    resultado: mapping with the lists "precision_list", "recall_list",
      "f1_score_list" and "cross_encoder_similarity"; an optional "bleu" entry
      is written as-is when present.
    nome_exp: experiment label written as the heading of the entry.
  """
  with open("/content/drive/MyDrive/NeuralmindChatBot/analises/qa_evaluate.txt", "a") as file:
    res = f"{nome_exp}: \n\n"

    # evaluate_question_answering does not return a "bleu" entry, so the line is
    # only written when the caller supplies one; indexing it unconditionally
    # raises KeyError.
    if "bleu" in resultado:
      res = res + f'bleu: {resultado["bleu"]}\n'

    res = res + media(resultado, "precision_list") + media(resultado, "recall_list") + media(resultado, "f1_score_list") + media(resultado, "cross_encoder_similarity") + "\n\n"

    file.write(res)

In [None]:
# Append the averaged answer metrics of this run to the QA report file.
salvar_qa_results(qa_evaluation, "chat-gpt-3.5-turbo | ada_002 | nltk spliterr | dataset final")

In [None]:
# Dump every generated answer to a text file, each followed by a separator
# line. The file is overwritten on each run ("w" mode).
with open("/content/drive/MyDrive/NeuralmindChatBot/analises/respostasNovas.txt", "w") as file:
    for res in qa_evaluation["answers"]:
      # write(), not writelines(): the argument is one string, which writelines
      # would iterate character by character.
      file.write(res + "\n --------------------------------------- \n\n")