### Load documents list

In [1]:
import os

directory_path = 'data/'
document_list = os.listdir(directory_path)
print(document_list)

['2017-Attention-is-All-You-Need.pdf', '2022-STaR-Self-Taught-Reasoner.pdf', '2023-GPT-4-Technical-Report.pdf', '2023-Lets-Verify-Step-by-Step.pdf', '2023-Sparks-of-AGI.pdf', '2023-Tree-of-Thoughts.pdf']


### Parse documents into lists of text and create its metadata

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
import uuid

documents = []
metadatas = []
ids = []

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 50,
    length_function = len,
    is_separator_regex = False,
)

for document_name in document_list:
    reader = PdfReader(directory_path + document_name)
    for page in reader.pages:
        page_text = page.extract_text()
        chunks = text_splitter.create_documents([page_text])
        for chunk in chunks:
            documents.append(chunk.page_content)
            metadatas.append({'source': document_name, 'page_number': reader.get_page_number(page)})
            ids.append(str(uuid.uuid4()))

### Create DB connection and embeddings function

In [3]:
import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.EphemeralClient()
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

### Create and store documents data as embeddings in vector data base

In [4]:
collection = chroma_client.create_collection(name="pdf_data")
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

### Test querry

In [5]:
results = collection.query(
    query_texts=["attention module"],
    n_results=10
)

results

{'ids': [['7572c2c5-bf5b-4034-988d-f1784250058a',
   '1a516c27-4617-4e46-8329-b0300ad4b0a1',
   '1aa6df85-5897-4d37-9fa7-dfda0ee9d8c0',
   '29cbf84f-6cc8-43c0-a068-2600147174f1',
   '42ec8ba8-84fb-4baf-a498-edcabc6278c4',
   '3767b271-11f3-4b51-bcbd-9f0aeab94a6d',
   '3004e301-e805-40d7-88c5-dec16bc0de36',
   'df22b2d7-13d9-437e-a675-6e66fd6b412f',
   '017569b2-7f5f-448a-a364-d635be0e534c',
   'a38789e9-7f79-471a-99ef-7293ded66c71']],
 'distances': [[0.6938372850418091,
   0.7801573872566223,
   0.8439009189605713,
   0.9090184569358826,
   0.920831561088562,
   0.9480830430984497,
   0.9625264406204224,
   0.962557315826416,
   0.9701042771339417,
   0.972396731376648]],
 'metadatas': [[{'page_number': 2,
    'source': '2017-Attention-is-All-You-Need.pdf'},
   {'page_number': 6, 'source': '2017-Attention-is-All-You-Need.pdf'},
   {'page_number': 9, 'source': '2017-Attention-is-All-You-Need.pdf'},
   {'page_number': 4, 'source': '2017-Attention-is-All-You-Need.pdf'},
   {'page_number':