In [1]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma

from dotenv import load_dotenv


# Let python-dotenv populate the process environment, then read the Google API key from it
# (the value is None when the variable is not set).
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")


In [2]:
# Chat model client used by the rest of the notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=google_api_key,
)

# Quick smoke test: the returned message is displayed as the cell output.
llm.invoke("Hello, how are you?")


AIMessage(content="I am an AI language model, so I don't have feelings or experiences like humans do. But I am here and ready to assist you with any questions or tasks you may have! How can I help you today? \n", additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-c0be07f2-bd5b-441f-86c4-ff3ab7ac91db-0', usage_metadata={'input_tokens': 7, 'output_tokens': 45, 'total_tokens': 52, 'input_token_details': {'cache_read': 0}})

In [None]:
# Embedding model shared by the index build below and by the vector store opened later.
embedder = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-m3"
)
persist_directory = "persisted_embeddings"

# Build the Chroma index only when no persisted store exists yet. This keeps a
# "Restart Kernel -> Run All" reproducible on a fresh checkout without paying
# the embedding cost again once the store has been written to disk.
if not os.path.isdir(persist_directory):
    # Sorted so the ingestion order does not depend on filesystem listing order.
    pdf_files = sorted(glob.glob("./Data/*.pdf"))

    pages = []
    for pdf_file in pdf_files:
        pages.extend(PyPDFLoader(pdf_file).load_and_split())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=850,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=100,
    )
    chunks = text_splitter.split_documents(pages)

    if chunks:
        # Passing persist_directory makes Chroma write the collection to disk.
        Chroma.from_documents(
            documents=chunks,
            embedding=embedder,
            persist_directory=persist_directory,
        )
        print(f"Indexed {len(chunks)} chunks from {len(pdf_files)} PDF files into {persist_directory!r}")
    else:
        # Nothing to embed: leave the store absent rather than failing the cell.
        print("No PDF content found under ./Data/; index not built")


In [4]:
# Open the persisted Chroma store with the same embedder and expose it as a retriever.
vectordb = Chroma(
    embedding_function=embedder,
    persist_directory=persist_directory,
)
retriever = vectordb.as_retriever()

# Sanity check: print how many entries the underlying collection holds
# (read through the wrapper's private `_collection` attribute).
print(vectordb._collection.count())

471


In [35]:
def reply(user_query):
    """Answer ``user_query`` from the indexed documents and print the result.

    The query is sent to the notebook-level ``retriever``; the page contents of
    the returned documents are embedded in a prompt for the notebook-level
    ``llm``. The model's answer is printed, followed by a ``**References:**``
    section listing the file name of every distinct source document.

    Args:
        user_query: The question to answer, as a string.

    Returns:
        None. The formatted answer is written to stdout.
    """
    # Retrieve relevant documents
    relevant_docs = retriever.invoke(user_query)

    # Format the retrieved documents with content
    retrieved_context = "\n".join([f"{doc.page_content} \n" for doc in relevant_docs])

    # Create the prompt for the LLM
    llm_prompt = f"""
    You are a helpful assistant specialized in answering queries based on specific documents.
    Your role is to provide accurate, detailed answers that show how you used the context to answer.
    Always base your response on the information retrieved.
    
    Context:
    {retrieved_context}
    
    Question: {user_query}
    """

    # Get the response from the language model
    response = llm.invoke(llm_prompt).content

    # Reduce each source path to its file name with os.path.basename instead of
    # slicing a fixed number of characters, so any directory prefix works and the
    # "Unknown" fallback survives. Sorting makes the reference order stable
    # (iteration order of a set of strings is not).
    source_names = sorted(
        {
            os.path.basename(str(doc.metadata.get("source") or "Unknown"))
            for doc in relevant_docs
        }
    )
    document_references = "\n".join(f"- {name}" for name in source_names)

    # Combine the LLM response with document references
    final_response = f"{response}\n\n**References:**\n{document_references}"
    print(final_response)

In [36]:
# Example query: runs retrieval and generation end to end; reply() prints the answer and its references.
reply("What do you know about the CEO of Gitlab?")

The CEO of GitLab is Sid Sijbrandij. He is also the co-founder and board chair of the company. He believes in transparency and iteration, and that negative feedback is important for improvement. He also believes that values are not binary and that there is always room for interpretation.  He emphasizes that the results matter most and that transparency should not be pursued for its own sake. 


**References:**
GitLab Values _ The GitLab Handbook.pdf
CEO _ The GitLab Handbook.pdf


In [2]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Directory passed to partition_pdf as the image output location.
path = "./Data_raw/"

# Partition a single handbook PDF into title-delimited chunks.
# NOTE(review): the recorded run of this cell failed with "Could not get the
# OCRAgent instance" (no module named 'paddle'); check the installed OCR package
# and the OCR_AGENT environment variable before re-running.
raw_pdf_elements = partition_pdf(
    filename="./Data/About the Handbook _ The GitLab Handbook.pdf",
    # Image handling: pull embedded image blocks out of the PDF and write them under `path`.
    extract_images_in_pdf=True,
    image_output_dir_path=path,
    # Table handling: ask the layout model for table structure.
    infer_table_structure=True,
    # Chunking: group text by title (any sub-section heading), aiming to start a
    # new chunk after ~3800 characters, merge small sections below 2000
    # characters, and never exceed 4000 characters per chunk.
    chunking_strategy="by_title",
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    max_characters=4000,
)

Failed to get OCRAgent instance: No module named 'paddle'


RuntimeError: Could not get the OCRAgent instance. Please check the OCR package and the OCR_AGENT environment variable.