# 1. Vector Database Generation
This code generates the vector database for a PDF document; in this application, the document is a car manual.
This code should be run only once, since the results are saved to disk.

Authors:
- Luis Bernardo Hernandez Salinas
- Juan R. Terven

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

## Set API key, embedding and vector database path

In [None]:
# Reading the key with [] (not .get) fails fast with a KeyError when the
# OPENAI_API_KEY environment variable is not set.
client = os.environ["OPENAI_API_KEY"]

# Size of the embedding vectors requested from the model; it is also used
# below to name the output directory.
embedding_dimensions = 3072

# OpenAI embedding model
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=embedding_dimensions,
)

# Source PDF and output directory for the vector database.
# NOTE(review): the PDF path is absolute and machine-specific; adjust it
# before running on another computer.
manual_file = r"C:\Users\jrtervens\Dropbox\Projects\SACI\2020-Ford-Mustang-Owners-Manual-version-2_om_EN-US_12_2019.pdf"
vectordb_directory = f"vector_database_mustang_{embedding_dimensions}"

## Load PDF

In [None]:
# Build a PDF loader for the manual configured in `manual_file`.
loader = PyPDFLoader(manual_file)

# Load the PDF pages
pages = loader.load()
# Report how many entries the loader returned.
print(f"The document has {len(pages)} pages")

In [None]:
# Let's check one page. Index 100 is just an arbitrary sample; it is clamped
# to the last page so this cell also works for PDFs with 100 pages or fewer.
pages[min(100, len(pages) - 1)]

## Documents Splitting

In [None]:
# RecursiveCharacterTextSplitter with overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,  # chunk size in characters
    chunk_overlap = 150 # Caracteres de solapamiento entre segmentos consecutivos.
)

# split documents
splits = text_splitter.split_documents(pages)

print(f"Generated {len(splits)} splits")

## Create Vector database

In [None]:
# Create the vector database from the embeddings of the document splits.
#
# Calling Chroma.from_documents against a persist directory that already holds
# data adds every split again (duplicated entries) and re-requests all the
# embeddings. To make this cell safe to re-run, reuse the persisted database
# when the directory already has content. Delete the directory to force a
# full rebuild.
if os.path.isdir(vectordb_directory) and os.listdir(vectordb_directory):
    # Reopen the existing store with the same embedding model.
    vectordb = Chroma(
        persist_directory=vectordb_directory,
        embedding_function=embedding_model,
    )
    print(f"Loaded existing vector database with {vectordb._collection.count()} chunks")
else:
    # First run: embed all splits and persist them to disk.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding_model,
        persist_directory=vectordb_directory,
    )
    # count() reports the number of stored entries (chunks), not collections.
    print(f"Saved {vectordb._collection.count()} chunks in vector database")