In [None]:
import os

import argparse
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# Root resource folder; every other resource path below is derived from it.
RES_PATH = "../res"
CHROMA_PATH = os.path.join(RES_PATH, "CHROMA_DB")
openai_keys_file = os.path.join(RES_PATH, "keys", "openai_key.txt")

# Document folders (names suggest one per topic plus a combined folder —
# TODO confirm against the actual directory layout).
DOCS_PATH = os.path.join(RES_PATH, "docs")
ALL_DOCS = os.path.join(DOCS_PATH, "All_docs")
WELLNESS_PATH = os.path.join(DOCS_PATH, "Bienestar")
PHYSICAL_HEALTH_PATH = os.path.join(DOCS_PATH, "Salud Fisica")
MENTAL_HEALTH_PATH = os.path.join(DOCS_PATH, "Salud Mental")

# NOTE(review): not referenced by any cell visible here.
DATA_PATH = "data"

In [None]:
def extract_keys(file_path):
    """Read ``key:value`` pairs from a text file into a dictionary.

    Each line is split on its FIRST colon only, so a value that itself
    contains ``:`` is kept intact. Whitespace around both the key and the
    value is stripped. Blank lines and lines without a colon are skipped
    instead of aborting the whole read.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        dict: Mapping of key to value. Empty when the file is missing or
        cannot be read; in that case a message is printed rather than an
        exception being raised.
    """
    keys = {}
    try:
        # utf-8-sig reads plain UTF-8 as well and drops a leading BOM, which
        # would otherwise end up glued to the first key.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                key, separator, value = line.strip().partition(':')
                if not separator:
                    # Blank or malformed line: nothing to record.
                    continue
                keys[key.strip()] = value.strip()
    except FileNotFoundError:
        print("El archivo especificado no fue encontrado.")
    except Exception as e:
        # Best-effort by design: report the problem and return what was read.
        print(f"Ocurrió un error al leer el archivo: {e}")
    return keys

# Read the key/value pairs stored in the OpenAI key file.
keys = extract_keys(openai_keys_file)

# Pull out the individual fields; a field absent from the file yields None.
key_name, key_secret = keys.get('name'), keys.get('secret')

In [None]:
def load_documents(docs_path: str):
    """Load the documents found in a directory via ``PyPDFDirectoryLoader``.

    Args:
        docs_path: Directory handed to the loader.

    Returns:
        Whatever the loader's ``load()`` call returns.
    """
    return PyPDFDirectoryLoader(docs_path).load()

In [None]:
%%capture
# Load the documents under ALL_DOCS; the %%capture magic above silences
# whatever this cell would otherwise print.
documents = load_documents(ALL_DOCS)

In [None]:
def split_documents(
    documents: list[Document],
    chunk_size: int = 800,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with ``len`` as the length
    function and literal (non-regex) separators.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length as measured by ``len``.
            Defaults to 800, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 80,
            the value previously hard-coded here.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [None]:
# Split the loaded documents into chunks using split_documents' settings.
chunks = split_documents(documents)

In [None]:
def calculate_chunk_ids(chunks):
    """Tag every chunk with an id of the form ``source:page:index``.

    ``source`` and ``page`` are read from each chunk's ``metadata``
    (``None`` when absent), and ``index`` counts the chunks already seen for
    that same source/page pair, starting at 0. A per-page counter is used, so
    ids stay unique even when chunks of one page are not adjacent in
    ``chunks``. For input where each page's chunks are contiguous the result
    matches plain consecutive numbering.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``chunks`` object; each chunk's ``metadata["id"]`` has been
        set (overwriting any previous value) in place.
    """
    # Next unused index for each "source:page" key encountered so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the id alongside the rest of the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [None]:
def save_to_chroma(chunks: list[Document]):
    """Embed the chunks with OpenAI and store them in the Chroma database.

    Every chunk is first given a deterministic id by ``calculate_chunk_ids``
    and those ids are passed to Chroma. Without explicit ids, each run of
    this function against the same ``CHROMA_PATH`` adds a fresh copy of every
    chunk. With stable ids a re-run is expected to overwrite the existing
    entries instead (NOTE(review): relies on the LangChain Chroma wrapper
    upserting by id — confirm for the installed version).

    Args:
        chunks: Chunks to store. Their ``metadata`` gains an ``"id"`` entry
            as a side effect.
    """
    chunks = calculate_chunk_ids(chunks)
    chunk_ids = [chunk.metadata["id"] for chunk in chunks]

    db = Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(openai_api_key=key_secret),
        ids=chunk_ids,
        persist_directory=CHROMA_PATH,
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}")

In [None]:
# Embed the chunks and write them to the Chroma store at CHROMA_PATH.
save_to_chroma(chunks)