# Teste de embeddings na recuperação de texto com Chroma

# Carregar o dataset

In [None]:
!pip install huggingface_hub datasets
!pip install langchain

In [2]:
from langchain.document_loaders import HuggingFaceDatasetLoader

# Hugging Face dataset identifier and the column holding the passage text.
# `dataset_name` is reused by the later `load_dataset` cell.
dataset_name = "squad_v1_pt"
page_content_column = "context"


# LangChain loader for the same dataset.
# NOTE(review): `loader` is not referenced by any later cell visible in this
# notebook (the data is loaded through `datasets.load_dataset` instead) —
# confirm whether it is still needed.
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the "validation" split of the dataset named by `dataset_name`
# (assigned in the loader cell above, so that cell must run first).
dataset = load_dataset(dataset_name, split="validation")

In [None]:
# Preview the "context" field of the first five examples.
for row_idx in range(5):
    sample = dataset[row_idx]
    print(sample["context"])

In [6]:
from langchain.schema import Document

# Wrap each row's context in a Document, recording the text length
# (in characters) as metadata.
contextos = [
    Document(
        page_content=row["context"],
        metadata={"length": len(row["context"])},
    )
    for row in dataset
]

In [7]:
def remove_duplicates(documents):
    """Drop documents whose ``page_content`` repeats an earlier one.

    Args:
        documents: Iterable of objects exposing a hashable ``page_content``.

    Returns:
        A new list holding the first document seen for each distinct
        ``page_content``, in their original order.
    """
    first_by_content = {}
    for doc in documents:
        # setdefault only stores the first document for a given text, and
        # dicts keep insertion order, so first-seen order is preserved.
        first_by_content.setdefault(doc.page_content, doc)
    return list(first_by_content.values())

In [8]:
# Keep only the first Document for each distinct page_content. Rebinding the
# same name is safe to re-run: deduplicating twice yields the same list.
contextos = remove_duplicates(contextos)

# Fazer o split

In [None]:
!pip install nltk

In [None]:
import nltk

# Download the 'punkt' resource through NLTK's downloader.
# NOTE(review): presumably required by `word_tokenize` in the next cell; newer
# NLTK releases may ask for 'punkt_tab' instead — confirm against the
# installed version.
nltk.download('punkt')

In [59]:
from nltk.tokenize import word_tokenize

def count_words(input_string):
    """Return how many tokens ``word_tokenize`` produces for ``input_string``."""
    return len(word_tokenize(input_string))

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to break each context into chunks before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    # Sizes are measured with `len`, i.e. in characters of the string.
    chunk_size = 512,
    # NOTE(review): an overlap of 1 is effectively no overlap — confirm intended.
    chunk_overlap  = 1,
    length_function = len,
    is_separator_regex = False,
)

In [10]:
# Split the deduplicated Documents into chunks with the splitter configured above.
texts = text_splitter.split_documents(contextos)

In [None]:
# Preview the first five chunks, each followed by an empty line.
for chunk_idx in range(5):
    print(texts[chunk_idx].page_content, end="\n\n")

# Persistir os dados

In [12]:
!pip install chromadb
!pip install sentence_transformers

[K     |████████████████████████████████| 731.7 MB 33 kB/s  eta 0:00:013                | 164.2 MB 1.1 MB/s eta 0:08:19               | 213.9 MB 9.6 MB/s eta 0:00:55��███████                    | 272.3 MB 276 kB/s eta 0:27:40��████████▏                   | 278.5 MB 8.3 MB/s eta 0:00:55███████████████▊                | 358.6 MB 6.1 MB/s eta 0:01:02��█████▎               | 372.5 MB 4.3 MB/s eta 0:01:239 MB 4.3 MB/s eta 0:01:23     |████████████████▋               | 380.1 MB 10.6 MB/s eta 0:00:34 |████████████████▊               | 381.6 MB 10.6 MB/s eta 0:00:33��██████               | 388.0 MB 3.3 MB/s eta 0:01:444 MB 3.3 MB/s eta 0:01:44     |██████████████████▍             | 421.1 MB 5.4 MB/s eta 0:00:58��█▍            | 443.6 MB 12.2 MB/s eta 0:00:24�███████▊            | 452.1 MB 709 kB/s eta 0:06:35�████████▊           | 474.0 MB 6.8 MB/s eta 0:00:39�█████████           | 478.0 MB 6.8 MB/s eta 0:00:38     |█████████████████████▍          | 488.6 MB 6.9 MB/s eta 0:00:36██████████████

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer
from transformers import AutoModel

# Load the pretrained 'neuralmind/bert-base-portuguese-cased' checkpoint and
# the tokenizer published under the same name. Both are reused by the
# embedding and query cells below.
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [13]:
import chromadb

# NOTE(review): chromadb.Client() with no settings is presumably an in-memory
# client — switch to a persistent client if the index must outlive the kernel.
chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection named "textos" already exists on the client.
collection = chroma_client.get_or_create_collection(name="textos")

In [44]:
import torch

tokens = []
embeddings = []
documents = []

# Smoke test: embed only the first five chunks using the model's pooler output.
# Inference only, so autograd tracking is disabled to save memory and time.
with torch.no_grad():
    for i in range(5):
        # Truncate to the model's position limit so an unusually token-dense
        # chunk cannot crash the forward pass.
        input_ids = tokenizer.encode(
            texts[i].page_content,
            return_tensors="pt",
            truncation=True,
            max_length=model.config.max_position_embeddings,
        )
        embeddings.append(model(input_ids).pooler_output.tolist()[0])
        documents.append(texts[i].page_content)


In [None]:
# Sanity check: show the embedding vector computed for the first chunk.
print(embeddings[0])

In [None]:
import torch

# Tokenize every chunk, truncating to the model's position limit so an
# unusually token-dense chunk cannot crash the forward pass.
tokens = [
    tokenizer.encode(
        x.page_content,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    for x in texts
]

# Inference only: disable autograd so no gradient graph is built per chunk.
with torch.no_grad():
    embeddings = [model(x).pooler_output.tolist()[0] for x in tokens]

# Raw chunk text, index-aligned with `embeddings`.
documents = [x.page_content for x in texts]

In [None]:
# Show the first five embedding vectors, each followed by the number of
# embeddings held in the list.
# NOTE(review): len(embeddings) is the list length, not the vector size —
# confirm whether len(embeddings[k]) was intended.
for k in range(5):
    print(embeddings[k], len(embeddings), sep="\n")

In [46]:
# Store every chunk with its embedding; ids are the stringified list
# positions and every entry gets the same placeholder source metadata.
collection.add(
    embeddings=embeddings,
    documents=documents,
    metadatas=[{"source": "my_source"} for _ in embeddings],
    ids=[str(position) for position, _ in enumerate(documents)],
)

# Recuperar textos por query

In [60]:
import torch

queryText = "Qual time da NFL representou o AFC no Super Bowl 50?"

# Encode the question the same way the chunks were encoded, truncating to the
# model's position limit.
input_id = tokenizer.encode(
    queryText,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.max_position_embeddings,
)

# Inference only: disable autograd for the forward pass.
with torch.no_grad():
    queryEmbed = model(input_id).pooler_output.tolist()[0]

# Retrieve the ten stored chunks closest to the query embedding.
results = collection.query(
    query_embeddings = [queryEmbed],
    n_results=10
)



In [61]:
# Print each document returned for the (single) query, followed by an empty line.
for retrieved in results['documents'][0]:
    print(retrieved, end="\n\n")

Game. Eles se juntaram aos Patriots, Dallas Cowboys e Pittsburgh Steelers como um dos quatro times que fizeram oito aparições no Super Bowl.

Os Panteras terminaram a temporada regular com um recorde de 15-1, e o quarterback Cam Newton foi nomeado o Jogador Mais Valioso da NFL (MVP). Eles derrotaram os Arizona Cardinals por 49 a 15 no NFC Championship Game e avançaram para sua segunda apresentação no Super Bowl desde que a franquia foi fundada em 1995. Os Broncos terminaram a temporada regular com um recorde de 12-4 e negaram ao New England Patriots uma chance para defender seu título do Super Bowl XLIX, derrotando-os 20-18 no AFC Championship

Os Broncos assumiram uma liderança inicial no Super Bowl 50 e nunca se arrastaram. Newton foi limitado pela defesa de Denver, que o demitiu sete vezes e forçou-o em três turnovers, incluindo um fumble que eles recuperaram para um touchdown. O linebacker de Denver, Von Miller, foi nomeado MVP do Super Bowl, registrando cinco tackles individuais, 

In [None]:
# Medir a precisão da recuperação