In [8]:
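# One-time setup: uncomment to install dependencies (%pip targets the running kernel's environment).
# %pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all fastembed langchain-groq
# NOTE: the imports below also use PyMuPDF (`fitz`) and FAISS, which this list does not install (e.g. `pymupdf`, `faiss-cpu`).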

In [3]:
import os
import tempfile
import base64
import fitz
import hashlib
import tiktoken
import io
import json
from pprint import pprint
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain.output_parsers import OutputFixingParser
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

from langchain_community.document_loaders import WebBaseLoader
from pipeline.api_handler import ApiHandler

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [16]:
# Read a PDF file from disk and hand back the document objects the loader produces
def extract_documents_from_file(file_dir):
    """Load a PDF with ``PyMuPDFLoader`` and return the loaded documents.

    Args:
        file_dir: Location of the PDF; it is passed unchanged to
            ``PyMuPDFLoader``. Despite the name, callers in this notebook
            pass the path of a single file, not a directory.

    Returns:
        The result of ``PyMuPDFLoader(file_dir).load()``. Later cells iterate
        over it and read ``page_content`` from each item (presumably one
        document per PDF page -- confirm against the loader's docs).
    """
    return PyMuPDFLoader(file_dir).load()

def get_llm(llm_type, para):
    """Return the LLM instance registered under ``llm_type``.

    A fresh ``ApiHandler`` is built from ``para`` on every call and the model
    is read from ``api.models[<type>]['instance']``.

    Only the requested entry is looked up, so a configuration that lacks an
    unrelated model (e.g. no ``'creative'`` entry) no longer raises
    ``KeyError`` when a different type is requested.

    Args:
        llm_type: One of ``'basic'``, ``'advance'`` or ``'creative'``. Any
            other value falls back to ``'basic'``.
        para: Configuration object forwarded unchanged to ``ApiHandler``.

    Returns:
        The ``'instance'`` value stored for the selected model.

    Raises:
        KeyError: If the selected model (or its ``'instance'`` key) is absent
            from ``ApiHandler(para).models``.
    """
    models = ApiHandler(para).models
    # Unknown types deliberately resolve to the basic model rather than failing.
    selected = llm_type if llm_type in ('basic', 'advance', 'creative') else 'basic'
    return models[selected]['instance']

def get_embedding_models(embedding_model_type, para):
    """Return the embedding model instance built from ``para``.

    Args:
        embedding_model_type: Requested model type. Every value, including
            ``'default'``, currently resolves to the ``'default'`` entry.
        para: Configuration object forwarded unchanged to ``ApiHandler``.

    Returns:
        ``ApiHandler(para).embedding_models['default']['instance']``.
    """
    handler = ApiHandler(para)
    # No per-type selection exists yet: the 'default' entry is always used.
    return handler.embedding_models['default']['instance']
    
def get_db(_documents, embedding_folder, embedding_model, chunk_size=512, chunk_overlap=0):
    """Create or load the FAISS vector store for the specified documents.

    If ``embedding_folder`` already holds both ``index.faiss`` and
    ``index.pkl`` the saved store is loaded and ``_documents`` is ignored.
    Otherwise the documents are split into chunks, embedded, indexed and the
    resulting store is saved to ``embedding_folder``.

    Args:
        _documents: Documents to split and index; only used when no saved
            index is found.
        embedding_folder: Folder the index is loaded from / saved to.
        embedding_model: Embedding model passed to FAISS for both loading and
            building the index.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``
            when a new index is built (default 512, the previous fixed value).
        chunk_overlap: ``chunk_overlap`` given to the splitter (default 0,
            the previous fixed value).

    Returns:
        The loaded or newly created FAISS vector store.

    Note:
        The saved index is keyed by folder only. Changing ``chunk_size`` or
        ``chunk_overlap`` has no effect while an index already exists in
        ``embedding_folder``; remove the folder to rebuild.
    """
    # Define the default filenames used by FAISS when saving
    faiss_path = os.path.join(embedding_folder, "index.faiss")
    pkl_path = os.path.join(embedding_folder, "index.pkl")

    # Check if all necessary files exist to load the embeddings
    if os.path.exists(faiss_path) and os.path.exists(pkl_path):
        # Load existing embeddings
        print("Loading existing embeddings...")
        # SECURITY: allow_dangerous_deserialization=True permits unpickling
        # index.pkl, which can execute arbitrary code. Only point this at
        # folders whose contents were written by this function / are trusted.
        db = FAISS.load_local(
            embedding_folder, embedding_model, allow_dangerous_deserialization=True
        )
    else:
        # Split the documents into chunks
        print("Creating new embeddings...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        texts = text_splitter.split_documents(_documents)
        print(f"length of document chunks generated for get_response_source:{len(texts)}")

        # Create the vector store to use as the index
        db = FAISS.from_documents(texts, embedding_model)
        # Save the embeddings to the specified folder
        db.save_local(embedding_folder)
    return db

In [17]:
# Settings forwarded to ApiHandler by get_llm / get_embedding_models.
para = dict(
    llm_source='openai',  # or 'anthropic'
    temperature=0,
    creative_temperature=0.5,
    # Presumably the files the API keys are read from -- confirm in ApiHandler.
    openai_key_dir=".env",
    anthropic_key_dir=".env",
)

# Model handles used by the cells that follow.
llm = get_llm('basic', para)
embed_model = get_embedding_models('default', para)

In [None]:
# Working folders are resolved relative to the directory the notebook runs in.
dir_path = os.getcwd()
embedding_folder = os.path.join(dir_path, "embedded_content")
inputs_folder = os.path.join(dir_path, "input_files")

# Load the PDF; os.path.join keeps the path portable across platforms.
doc = extract_documents_from_file(os.path.join(inputs_folder, "test.pdf"))
docs = [doc]
# Flatten the per-file lists into one list of documents.
docs_list = [item for sublist in docs for item in sublist]
print(f"len of documents :{len(docs_list)}")

# Compute a hashed ID based on the PDF content
# (MD5 is used only as a content fingerprint that names the cache folder;
# changing the hash would orphan previously saved embeddings.)
doc_content = b"".join([page.page_content.encode('utf-8') for page in docs_list])
file_hash = hashlib.md5(doc_content).hexdigest()
course_id = file_hash
embedding_folder = os.path.join(embedding_folder, course_id)

# Token-based split, used here only to report a chunk count.
# NOTE(review): doc_splits is not passed to get_db below; get_db re-splits
# docs_list with its own character-based splitter, so the count printed here
# need not match the number of chunks actually embedded -- confirm intent.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)
print(f"length of document chunks generated :{len(doc_splits)}")

# Build (or load from embedding_folder) the vector store and expose it as a
# retriever that returns the top 2 matches per query.
vectorstore = get_db(docs_list, embedding_folder, embed_model)
retriever = vectorstore.as_retriever(search_kwargs={"k":2})

len of documents :7
length of document chunks generated :23
Loading existing embeddings...
