In [1]:
import os, warnings
from dotenv import load_dotenv

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
warnings.filterwarnings("ignore")

load_dotenv('.env')

True

### Document Loader

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader
import os

pdfs = []
for root, dirs, files in os.walk('document_store/pdfs'):
    #print (root, dirs, files)
    for file in files:
        if file.endswith('.pdf'):
            pdfs.append(os.path.join(root, file))

pdfs

['document_store/pdfs\\hillier_lieberman_proof.pdf']

In [3]:
docs = []
# Divide o los documentos por pagina.
for pdf in pdfs:
    loader = PyMuPDFLoader(pdf)
    temp = loader.load()
    docs.extend(temp)

#docs

### Document Chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(docs)
#chunks

In [5]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
#encoding.encode(chunks[0].page_content)
len(encoding.encode(chunks[0].page_content))

101

### Document Vector Embedding

In [6]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


In [7]:
# ollama pull nomic-embed-text
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url='http://localhost:11434')


In [8]:
vector = embeddings.embed_query("Hello World")

In [9]:
index = faiss.IndexFlatL2(len(vector))
index.ntotal, index.d

(0, 768)

In [10]:
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(), #in RAM
    index_to_docstore_id={},
)

In [11]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x207b2c591c0>

In [12]:
# adding document(s) inside vector store
ids = vector_store.add_documents(documents=chunks)

In [13]:
len(ids), vector_store.index.ntotal

(211, 211)

In [14]:
vector_store.index_to_docstore_id

{0: '8d364e2a-6130-4b2a-89c3-973db0789a11',
 1: '4e808ce3-68f1-48ee-b957-5c189c94393b',
 2: '94e32f47-e5a8-4f5d-b1a0-1d3c4ab22fcf',
 3: '2544f3c1-c05e-4cc4-aa45-0b8ffda01b37',
 4: 'c3af0e84-ee24-405d-ac1c-760998d70d65',
 5: '72f9fdf3-be5a-4d17-9705-327e87349ca7',
 6: 'bc72fe46-27a2-43ae-b03f-afa517343332',
 7: 'bcb61644-723e-411f-bb35-8e855ce33695',
 8: 'ffcff323-31d0-45c3-81ae-08d91ec1c23c',
 9: '723c9079-d4bb-4e14-8036-4f4cc44df103',
 10: 'e8416cf3-d864-4d72-84d3-2d0bf0777325',
 11: '65d1d199-197c-4b74-9aa9-1fe5a1b9e9bf',
 12: 'a5456de6-f461-48fa-93bc-4ead4766ec88',
 13: 'b579835f-0014-4758-aa2a-3e87fb5e4508',
 14: '0f259a97-1031-49d8-8193-23eaf808f336',
 15: '8b44485a-b1a4-4659-aa34-5f60271cc3f8',
 16: 'f15c3a3f-bd6e-4445-8930-6625018dce03',
 17: 'd63559e7-2843-43f2-bddf-14e858d902a1',
 18: '7aa62d3b-0303-4a5e-a306-4051a13532b2',
 19: '525dd8d6-453f-4a78-875b-f56a913b3bf8',
 20: '715b15d4-d893-4c1a-a623-04f2e5fee5ee',
 21: '46fed0fe-692f-467b-a3d4-1d4bfaa31a64',
 22: 'd88c11a8-b1e2-

In [15]:
#Retrieval
question = "¿Cuáles son los efectos de la investigación operativa?"
docs = vector_store.search(query=question, k=5, search_type="similarity")

In [16]:
docs

[Document(id='7aa62d3b-0303-4a5e-a306-4051a13532b2', metadata={'producer': 'Adobe Acrobat 8.1', 'creator': 'Adobe Acrobat 8.1 Combine Files', 'creationdate': '2010-05-24T10:07:08-05:00', 'source': 'document_store/pdfs\\hillier_lieberman_proof.pdf', 'file_path': 'document_store/pdfs\\hillier_lieberman_proof.pdf', 'total_pages': 20, 'format': 'PDF 1.6', 'title': 'Introduccion a la investigacion de operaciones', 'author': 'Frederick S. Hillier', 'subject': '', 'keywords': '', 'moddate': '2010-07-07T13:08:58-05:00', 'trapped': '', 'page': 1}, page_content='como validación del modelo. En cierto sentido, la IO involucra la investigación científi ca creativa \nde las propiedades fundamentales de las operaciones. Sin embargo, es más que esto. La IO se ocupa \ntambién de la administración práctica de la organización. Por lo tanto, para tener éxito, también debe \nproporcionar conclusiones claras que el tomador de decisiones pueda usar cuando sea necesario.\nOtra característica de la investigaci

In [17]:
db_name = "operations_research"
vector_store.save_local(db_name)