# In [None]:
import os
import traceback
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.document_loaders import PDFMinerLoader
from tqdm import tqdm
import logging
import datetime

# Logger configuration: all records go to one log file per calendar day.
_LOG_DIR = Path("/home/m/Escritorio/RAG/logs")
# basicConfig() opens the log file right away and raises FileNotFoundError
# when the directory is missing, so make sure it exists first.
_LOG_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    # strftime() already returns a str; no extra str() conversion is needed.
    filename=str(_LOG_DIR / f"{datetime.datetime.today().strftime('%Y-%m-%d')}_main.log"),
    level=logging.DEBUG,  # Can be switched to INFO in production
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logging.info("Inicio\n")


# Paths (both are relative, so they resolve against the current working directory)
PDF_FOLDER = Path("data")  # folder scanned for input PDFs by load_pdfs()
CHROMA_PATH = "db"  # directory handed to Chroma as its persist_directory

# Embedding model (local); the name is passed to HuggingFaceEmbeddings(model_name=...)
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def load_pdfs(pdf_folder):
    """Load every PDF found directly inside *pdf_folder*.

    Each file is first read with ``PDFMinerLoader``; if that raises, the same
    file is retried with ``PyMuPDFLoader``. A file that fails with both
    loaders is logged (with traceback) and skipped, so one bad PDF does not
    abort the whole run.

    Args:
        pdf_folder: Directory (``str`` or ``Path``) to scan. Only its direct
            entries are considered; sub-directories are not traversed.

    Returns:
        A flat list with everything the loaders returned, in alphabetical
        file-name order. Empty when nothing could be loaded.

    Raises:
        OSError: If *pdf_folder* cannot be listed (e.g. it does not exist).
    """
    # Cap pdfminer's own logger at WARNING so its lower-level records are
    # dropped even when the root logger runs at DEBUG.
    logging.getLogger("pdfminer").setLevel(logging.WARNING)
    logging.info(f"üì• Cargando PDFs desde '{pdf_folder}'...")
    documents = []
    # os.listdir() returns entries in arbitrary order; sort them so the
    # ingestion order (and the resulting log) is reproducible.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive match so files such as "REPORT.PDF" are not
        # silently ignored.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(pdf_folder, filename)
        try:
            docs = PDFMinerLoader(file_path).load()
        except Exception as e:
            # exc_info=True writes the traceback to the log itself instead of
            # stderr, where it would be detached from the log file.
            logging.warning(f"‚ö†Ô∏è Error con {filename} usando PDFMinerLoader: {e}", exc_info=True)
            logging.info("üîÅ Intentando cargar con PyMuPDFLoader...")
            try:
                docs = PyMuPDFLoader(file_path).load()
            except Exception as fallback_error:
                logging.error(
                    f"‚ùå Fallo tambi√©n con fallback en {filename}: {fallback_error}",
                    exc_info=True,
                )
                continue  # both loaders failed: skip this file
            logging.info(f"‚úîÔ∏è {filename} cargado con PyMuPDFLoader.")
        else:
            logging.info(f"‚úîÔ∏è {filename} cargado con {len(docs)} p√°ginas usando PDFMinerLoader.")
        documents.extend(docs)
    return documents



def split_documents(documents, chunk_size=500, chunk_overlap=50):
    """Split *documents* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split (e.g. the list returned by
            ``load_pdfs``).
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter, or ``[]`` if splitting
        raised. The error and its traceback are logged; nothing is re-raised.
    """
    logging.info("‚úÇÔ∏è Dividiendo documentos en chunks...")
    try:
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        docs = splitter.split_documents(documents)
    except Exception as e:
        # exc_info=True keeps the traceback in the log instead of sending it
        # to stderr, where it would be separated from the log file.
        logging.error(f"‚ùå Error al dividir documentos: {e}", exc_info=True)
        return []
    logging.info(f"‚úÖ Total de chunks: {len(docs)}")
    return docs


def embed_documents(chunks):
    """Build the embeddings object for ``EMBEDDING_MODEL_NAME``.

    Despite its name, this function does not compute any embeddings itself:
    it only instantiates ``HuggingFaceEmbeddings`` and returns it so the
    vector store can use it.

    Args:
        chunks: Unused. Kept in the signature so existing callers keep
            working.

    Returns:
        The ``HuggingFaceEmbeddings`` instance, or ``None`` if it could not
        be created. The error and its traceback are logged; nothing is
        re-raised.
    """
    try:
        logging.info(f"üîç Cargando modelo de embeddings: {EMBEDDING_MODEL_NAME}")
        return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    except Exception as e:
        # exc_info=True keeps the traceback in the log instead of sending it
        # to stderr, where it would be separated from the log file.
        logging.error(f"‚ùå Error cargando modelo de embeddings: {e}", exc_info=True)
        return None


def create_vector_store(chunks, embeddings, persist_path):
    """Create a Chroma vector store from *chunks* and persist it to disk.

    Args:
        chunks: Documents to index (e.g. the output of ``split_documents``).
        embeddings: Embeddings object passed to Chroma as ``embedding``.
        persist_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma store, or ``None`` if creating or persisting it raised.
        The error and its traceback are logged; nothing is re-raised.
    """
    logging.info(f"üß† Creando base vectorial en: {persist_path}")
    try:
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=persist_path,
        )
        # NOTE(review): newer Chroma/LangChain releases reportedly persist
        # automatically and deprecate persist() - confirm against the
        # installed version before removing this call.
        vectordb.persist()
    except Exception as e:
        # exc_info=True keeps the traceback in the log instead of sending it
        # to stderr, where it would be separated from the log file.
        logging.error(f"‚ùå Error creando base vectorial: {e}", exc_info=True)
        return None
    logging.info("‚úÖ Base vectorial creada y guardada.")
    return vectordb


if __name__ == "__main__":
    # Ingestion pipeline: load PDFs -> split into chunks -> build the
    # embeddings object -> create and persist the vector store.
    # Every step that fails stops the run with a non-zero exit status so
    # shells and schedulers can detect it. SystemExit is raised directly
    # because the bare exit() helper is injected by the `site` module for
    # interactive use and is not meant for programs.
    logging.info("üöÄ Iniciando pipeline de ingesta...")

    documents = load_pdfs(PDF_FOLDER)
    if not documents:
        logging.warning("‚ö†Ô∏è No se cargaron documentos. Finalizando.")
        raise SystemExit(1)

    chunks = split_documents(documents)
    if not chunks:
        logging.warning("‚ö†Ô∏è No se generaron chunks. Finalizando.")
        raise SystemExit(1)

    # embed_documents() and create_vector_store() signal failure with None,
    # so test for that explicitly rather than relying on object truthiness.
    embeddings = embed_documents(chunks)
    if embeddings is None:
        logging.warning("‚ö†Ô∏è No se pudo crear embeddings. Finalizando.")
        raise SystemExit(1)

    vectordb = create_vector_store(chunks, embeddings, CHROMA_PATH)
    if vectordb is None:
        logging.warning("‚ö†Ô∏è Fallo al crear la base vectorial.")
        raise SystemExit(1)

    logging.info("üéâ ¬°Ingesta completada correctamente!")
