In [None]:
# Importando bibliotecas necessárias
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from PyPDF2 import PdfReader

# Load environment variables
load_dotenv()

In [None]:
# Load the text content of one or more PDF documents
def laoder_pdf(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources, each passed as-is to
            ``PdfReader``.

    Returns:
        A single string with the text of all pages, in document and page
        order. Pages whose extraction yields nothing (``None`` or an empty
        string) contribute nothing. An empty ``pdf_docs`` returns ``""``.
    """
    page_texts = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            # Extract once per page and reuse the result instead of running
            # the extraction a second time just to test for emptiness.
            text = page.extract_text()
            if text:
                page_texts.append(text)
    # Join once at the end rather than growing a string inside the loop.
    return "".join(page_texts)

In [None]:
# Define how the document text is split into chunks
def split_pdf(doc, chunk_size=256, chunk_overlap=23):
    """Split a text into chunks on newline boundaries.

    Args:
        doc: The text to split, as a single string.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Target number of characters shared between
            consecutive chunks.

    Returns:
        The list of text chunks produced by ``CharacterTextSplitter``.
    """
    pdf_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # The keyword is ``length_function``; a misspelled name is rejected
        # by the splitter's constructor.
        length_function=len,
    )
    # ``split_text`` is the splitter method that takes a raw string.
    return pdf_splitter.split_text(doc)

In [None]:
# Embed the chunks
def embed_pdf_chunks(chunks):
    """Compute an OpenAI embedding vector for each text chunk.

    Args:
        chunks: Iterable of strings to embed.

    Returns:
        A list with one embedding per chunk, in the same order as
        ``chunks``; an empty list when there are no chunks.
    """
    chunk_texts = list(chunks)
    if not chunk_texts:
        # Nothing to embed: skip creating the client and calling the API.
        return []
    embeddings = OpenAIEmbeddings()
    return embeddings.embed_documents(chunk_texts)

In [None]:
# Trial run with a plain-text (TXT) file
loader = TextLoader(
    file_path='state_of_the_union.txt',
    encoding='utf-8',
)
documents = loader.load()

In [None]:
# Check that the document was loaded: show the loaded documents, then their count
print(documents, len(documents), sep="\n")

In [None]:
# Split the loaded documents into smaller pieces
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=23,
    chunk_size=256,
)
texts = text_splitter.split_documents(documents)

In [None]:
# Check how many pieces were generated
print(len(texts))

In [None]:
# Create the embeddings client and embed the page_content of up to the
# first five pieces
embeddings = OpenAIEmbeddings()
doc_vectors = embeddings.embed_documents(
    [chunk.page_content for chunk in texts[:5]]
)

In [None]:
# Connect our embeddings to the PostgreSQL database
import os

from langchain_community.vectorstores.pgvector import PGVector

# Database connection string. It is read from the PGVECTOR_CONNECTION_STRING
# environment variable when set, so credentials can be kept out of the
# source; the literal below is kept as a fallback so existing setups keep
# working.
# NOTE(review): the fallback still embeds a password -- consider removing it
# once the environment variable is configured everywhere.
CONNECTION_STRING = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres:neki@localhost:5432/nekivector_db",
)
# Name of the vector collection
COLLECTION_NAME = 'state_of_the_union_vectors'

# Insert the vectors into the database
db = PGVector.from_documents(
    embedding=embeddings,
    documents=texts,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [None]:
# Define a sample query string
query = "Russia?"

# Print the embedding vector computed for the query.
# NOTE(review): no comparison against the stored vectors happens here;
# only the query's own embedding is printed.
print(embeddings.embed_query(query))