In [1]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma

from dotenv import load_dotenv


# Load variables from a local .env file (if present) into the environment.
load_dotenv()
# os.getenv returns None when GOOGLE_API_KEY is not set; the key is passed
# to ChatGoogleGenerativeAI in the next cell.
google_api_key = os.getenv("GOOGLE_API_KEY")


TypeError: cannot pickle 'classmethod' object

In [8]:
# Chat model shared by the later cells (reply, table summaries, image
# captions and the RAG chain all call this `llm` global).
llm = ChatGoogleGenerativeAI(google_api_key=google_api_key, model="gemini-1.5-pro-latest")
# One-off request to check that the API key and model name are accepted.
llm.invoke("Hello, how are you?")


AIMessage(content='I am doing well, thank you for asking! How are you today?\n', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run-0b406eef-a915-4a84-aa68-061e41267b57-0', usage_metadata={'input_tokens': 7, 'output_tokens': 16, 'total_tokens': 23, 'input_token_details': {'cache_read': 0}})

In [None]:
embedder = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-m3"
)
persist_directory = "persisted_embeddings_2"

# Build the persisted vector store only when its directory does not exist
# yet. A fresh checkout can then run the notebook top to bottom, while later
# runs reuse the embeddings already on disk instead of recomputing them.
if not os.path.isdir(persist_directory):
    pdf_files = glob.glob("./Data/*.pdf")
    pages = []

    for pdf_file in pdf_files:
        pages.extend(PyPDFLoader(pdf_file).load_and_split())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=850,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=100,
    )
    chunks = text_splitter.split_documents(pages)

    if chunks:
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embedder,
            persist_directory=persist_directory
        )
    else:
        # Nothing to embed: say so instead of creating an empty store silently.
        print("No PDF content found under ./Data; vector store was not built.")


In [None]:
# Open the Chroma store located at persist_directory, using the same
# embedding function for queries.
vectordb = Chroma(persist_directory=persist_directory,embedding_function=embedder)
# Retriever with default settings; reply() reads this `retriever` global.
retriever = vectordb.as_retriever()
# Sanity check on the number of stored entries.
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper and
# may change between library versions — confirm or switch to a public accessor.
print(vectordb._collection.count())

471


In [None]:
def reply(user_query):
    """Answer ``user_query`` from retrieved documents and print the result.

    Relies on the notebook-level ``retriever`` to fetch context and on the
    notebook-level ``llm`` to generate the answer. The printed text is the
    model's answer followed by a bulleted list of the source file names.

    Args:
        user_query: The question to answer.

    Returns:
        None. The formatted answer is printed.
    """
    # Retrieve relevant documents
    relevant_docs = retriever.invoke(user_query)

    # Format the retrieved documents with content
    retrieved_context = "\n".join([f"{doc.page_content} \n" for doc in relevant_docs])

    # Create the prompt for the LLM
    llm_prompt = f"""
    You are a helpful assistant specialized in answering queries based on specific documents.
    Your role is to provide accurate, detailed answers that show how you used the context to answer.
    Always base your response on the information retrieved.
    
    Context:
    {retrieved_context}
    
    Question: {user_query}
    """

    # Get the response from the language model
    response = llm.invoke(llm_prompt).content

    # Reduce every source path to its file name. Both "/" and "\" are treated
    # as separators so the result does not depend on a fixed prefix length or
    # on the platform that produced the stored paths.
    source_names = {
        str(doc.metadata.get('source', 'Unknown')).replace("\\", "/").rsplit("/", 1)[-1]
        for doc in relevant_docs
    }
    # Sorted so the reference list is stable across runs (sets are unordered).
    document_references = "\n".join(f"- {name}" for name in sorted(source_names))

    # Combine the LLM response with document references
    final_response = f"{response}\n\n**References:**\n{document_references}"
    print(final_response)

In [None]:
reply("What do you know about the CEO of Gitlab?")

The CEO of GitLab is Sid Sijbrandij. He is also the co-founder and board chair of the company. He believes in transparency and iteration, and that negative feedback is important for improvement. He also believes that values are not binary and that there is always room for interpretation.  He emphasizes that the results matter most and that transparency should not be pursued for its own sake. 


**References:**
GitLab Values _ The GitLab Handbook.pdf
CEO _ The GitLab Handbook.pdf


In [13]:
from typing import Any
import os
from dotenv import load_dotenv
from unstructured.partition.pdf import partition_pdf
load_dotenv()
# Directory handed to partition_pdf as the image output location; the image
# captioning cell later lists this same directory for .jpg files.
path = "./figures/"

# Get elements
raw_pdf_elements = partition_pdf(
    filename="./Data/About the Handbook _ The GitLab Handbook.pdf",
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks:
    # hard max on chunk size
    max_characters=4000,
    # attempt to start a new chunk after 3800 chars
    new_after_n_chars=3800,
    # attempt to keep chunks > 2000 chars
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)

In [2]:
raw_pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x2184f1a74d0>,
 <unstructured.documents.elements.CompositeElement at 0x2184f1a5c40>,
 <unstructured.documents.elements.Table at 0x2184f1a6de0>,
 <unstructured.documents.elements.CompositeElement at 0x2184f1a7530>,
 <unstructured.documents.elements.Table at 0x2184f1284a0>,
 <unstructured.documents.elements.CompositeElement at 0x2184f0eede0>,
 <unstructured.documents.elements.Table at 0x2184edb4110>,
 <unstructured.documents.elements.CompositeElement at 0x2184edb5a60>,
 <unstructured.documents.elements.CompositeElement at 0x2184f1a7e60>]

In [3]:
# appending texts and tables from the pdf file
def data_category(raw_pdf_elements):
    """Separate partitioned PDF elements into text chunks and table chunks.

    An element is classified by looking for a fully qualified class name
    inside ``str(type(element))``; elements matching neither name are dropped.

    Args:
        raw_pdf_elements: Iterable of elements to classify.

    Returns:
        list: ``[texts, tables]``, each a list of ``str(element)`` values in
        the order the elements were encountered.
    """
    table_marker = "unstructured.documents.elements.Table"
    text_marker = "unstructured.documents.elements.CompositeElement"
    collected = {table_marker: [], text_marker: []}

    for element in raw_pdf_elements:
        type_repr = str(type(element))
        # The table check comes first, exactly as in the if/elif ordering.
        if table_marker in type_repr:
            collected[table_marker].append(str(element))
        elif text_marker in type_repr:
            collected[text_marker].append(str(element))

    return [collected[text_marker], collected[table_marker]]
# Classify the elements once and unpack the [texts, tables] pair, instead of
# running the classifier twice over the same elements.
texts, tables = data_category(raw_pdf_elements)

In [4]:
texts

['22/10/2024, 14:54\n\nAbout the Handbook | The GitLab Handbook\n\nThe GitLab Handbook\n\nGitLab TeamOps Handbook Job Families Reports\n\nAbout the Handbook\n\nHistory of the handbook\n\nThe handbook started when GitLab was a company of just ten people to make sharing information efficient and easy. We knew that future GitLab team-members wouldnʼt be able to see emails about process changes that were being sent before they joined and that most of the people who would eventually join GitLab likely hadnʼt even heard of us yet. The handbook was our way of ensuring that all of our company information was accessible to everyone regardless of when they became part of the team.\n\nAdvantages\n\nAt GitLab our handbook is extensive and keeping it relevant is an important part of\n\neveryoneʼs job. It is a vital part of who we are and how we communicate. We established these processes because we saw these benefits:\n\n. Reading is much faster than listening.\n\n. Reading is async, you donʼt have

In [5]:
tables

['2018-01-01 298,806 228 2018-10-01 427,929 335 2019-01-01 520,519 439 2019-04-01 656,668 586 2019-07-01 818,064 766 2019-10-01 987,397 884 2020-01-01 1,204,642 1,035 2020-04-01 1,491,017 1,222 2020-07-01 1,851,350 1,488',
 '2020-10-01 2,166,627 1,759 2021-01-01 2,410,554 1,914 2021-04-01 2,615,372 2,056 2021-07-01 2,956,781 2,271 2021-10-01 3,138,952 2,355 2022-01-01 3,280,108 2,395 2022-04-01 3,474,993 2,553 2022-07-01 3,628,280 2,641 2022-10-01 3,732,384 2,724 2023-01-01 3,732,186 2,722 ⬅ Last entry before migration 2023-07-01 3,905,979 2,743 2023-10-01 3,478,407 2,306 2023-12-22 0 0 ⬅ Handbook Migration Complete',
 '2023-07-01 150,732 133 2023-10-01 1,909,139 1,403 2024-01-01 3,631,360 3,003 ⬅ Migration Completed 2024-10-22 3,855,949 3,086 ⬅ Hugo generated Live Count']

In [9]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser


def tables_summarize(data_category):
    """Summarize table chunks with the notebook-level ``llm``.

    Args:
        data_category: Preferably the ``[texts, tables]`` pair returned by
            ``data_category()``; its second item is summarized. Any other
            value (earlier versions of this notebook passed the
            ``data_category`` function itself) falls back to the
            notebook-level ``tables`` list, which keeps old calls working.

    Returns:
        list: One summary string per table chunk; empty when there are no
        table chunks (the LLM is not called in that case).
    """
    if isinstance(data_category, (list, tuple)) and len(data_category) == 2:
        table_chunks = data_category[1]
    else:
        # Legacy call style: the argument carries no data, use the global.
        table_chunks = tables

    if not table_chunks:
        return []

    prompt_text = """You are an assistant tasked with summarizing tables. \
                    Give a concise summary of the table. Table chunk: {element} """

    prompt = ChatPromptTemplate.from_template(prompt_text)
    # Each batch input is a raw table string; expose it to the template
    # under the name "element".
    summarize_chain = {"element": lambda x: x} | prompt | llm | StrOutputParser()
    table_summaries = summarize_chain.batch(table_chunks, {"max_concurrency": 5})

    return table_summaries
table_summaries = tables_summarize([texts, tables])
# Text chunks are indexed as-is: each text serves as its own "summary".
text_summaries = texts

In [10]:
table_summaries

['The table shows consistent growth in two numerical variables between January 2018 and July 2020.  The first variable increases from approximately 300,000 to 1,850,000 and the second from 228 to 1,488 over this period.\n',
 'The table shows a general upward trend in both columns (likely representing some kind of volume and count) from October 2020 to July 2023, followed by a drop in both columns in October 2023 and finally reaching zero in December 2023, marked as "Handbook Migration Complete".  This suggests the migration caused a reset or transfer of the tracked quantities.\n',
 'Migration was completed on 2024-01-01 with 3,631,360 items and 3,003 users.  A later live count on 2024-10-22 showed 3,855,949 items and 3,086 users.\n']

In [11]:
import base64
from langchain.schema.messages import HumanMessage, SystemMessage

def encode_image(image_path):
    ''' Read the file at image_path and return its content as a base64 string '''
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def image_captioning(img_base64,prompt):
    ''' Send a text prompt plus one base64-encoded image to the notebook-level
    llm and return the content of its reply.

    The image is embedded as a data URL declared as image/jpeg.
    '''
    text_part = {"type": "text", "text":prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
    }
    message = HumanMessage(content=[text_part, image_part])
    answer = llm.invoke([message])
    return answer.content



In [17]:
import time
# Base64 payloads and their captions, kept in matching order
img_base64_list = []
image_summaries = []

# Prompt : Our prompt here is customized to the type of images we have which is chart in our case
prompt = "Describe the image in detail. Be specific about graphs, such as bar plots."

# Only .jpg files from the image directory are captioned, in sorted name order
jpg_names = [name for name in sorted(os.listdir(path)) if name.endswith('.jpg')]

for img_file in jpg_names:
    img_path = os.path.join(path, img_file)
    base64_image = encode_image(img_path)
    img_base64_list.append(base64_image)
    # One LLM request per image (a time.sleep(60) pause between requests was
    # tried here and is currently disabled)
    img_capt = image_captioning(base64_image,prompt)
    image_summaries.append(img_capt)

In [16]:
img_capt

'The image is a black rectangular box with a thin white border. Inside the box, in white text, is "(CC) BY-SA". The text is left-aligned.  There are no graphs or other elements besides the text and the border.'

In [None]:
from base64 import b64decode 

def split_image_text_types(docs):
    ''' Split retrieved items into base64-encoded images and plain texts.

    An item counts as an image only if it is strictly valid base64 AND the
    decoded bytes start with a JPEG, PNG or GIF signature. Everything else,
    including the empty string and prose that happens to be decodable, is
    treated as text.

    Args:
        docs: Iterable of retrieved items. Items exposing a ``page_content``
            attribute are unwrapped to that attribute first.

    Returns:
        dict: ``{"images": [...], "texts": [...]}`` built only from ``docs``,
        each list preserving input order.
    '''
    image_signatures = (b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8")
    images = []
    plain_texts = []
    for doc in docs:
        content = getattr(doc, "page_content", doc)
        try:
            # validate=True rejects characters outside the base64 alphabet
            # instead of silently discarding them.
            decoded = b64decode(content, validate=True)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; TypeError covers non-string input.
            plain_texts.append(content)
            continue
        if decoded.startswith(image_signatures):
            images.append(content)
        else:
            plain_texts.append(content)
    return {
        "images": images,
        "texts": plain_texts
    }

In [19]:
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
import uuid
from langchain.schema.document import Document


embedder = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-m3"
)

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="multi_modal_rag",
                     embedding_function=embedder)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start).
# NOTE: this rebinds the notebook-level `retriever` name that reply() reads.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


def _index_summaries(summaries, originals):
    """Index ``summaries`` for search and store ``originals`` under the same ids.

    Each summary is embedded into the retriever's vector store with a fresh
    UUID under ``id_key``; the original at the same position is stored in the
    docstore under that UUID.

    Args:
        summaries: Texts to embed, one per original.
        originals: Payloads to store, aligned with ``summaries``.

    Returns:
        tuple: ``(ids, summary_docs)``; both are empty lists when there is
        nothing to index.

    Raises:
        ValueError: If the two sequences differ in length, since pairing
            them by position would then be wrong.
    """
    if len(summaries) != len(originals):
        raise ValueError(
            f"Got {len(summaries)} summaries for {len(originals)} originals"
        )
    if not summaries:
        # Skip the store calls entirely rather than submit an empty batch.
        return [], []
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: item_id})
        for item_id, summary in zip(ids, summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_summaries(text_summaries, texts)

# Add tables
table_ids, summary_tables = _index_summaries(table_summaries, tables)

# Add image summaries
img_ids, summary_img = _index_summaries(image_summaries, img_base64_list)


In [34]:
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda


def prompt_func(dict):
    """Build the message list sent to the LLM by the RAG chain.

    Args:
        dict: Mapping with a "question" entry and a "context" mapping that
            may hold "texts" (list of strings) and "images" (list of base64
            strings).

    Returns:
        list: A single HumanMessage whose content is the text prompt,
        followed by the first image (as a JPEG data URL) when any exist.
    """
    context = dict["context"]
    format_texts = "\n".join(context.get("texts", []))

    # Same prompt text as before, spelled out line by line so the embedded
    # whitespace is explicit.
    prompt_text = (
        "Answer the question based on the following context, which can include text, tables, and optionally an image : make your \n"
        "            answer as detailed as possible and provide reasoning for your answer : \n"
        f"Question: {dict['question']}\n"
        "\n"
        "Text and tables:\n"
        f"{format_texts}\n"
    )
    parts = [{"type": "text", "text": prompt_text}]

    # Attach only the first image, and only if at least one was retrieved
    images = context.get("images")
    if images:
        first_image = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{images[0]}"},
        }
        parts.append(first_image)

    return [HumanMessage(content=parts)]


# RAG pipeline: the input string is fanned out into a context and a question,
# turned into a multimodal message, sent to the LLM, and parsed to a string.
chain = (
    {
        # Retrieved items are split into {"images": ..., "texts": ...}
        "context": retriever | RunnableLambda(split_image_text_types),
        # The raw input is forwarded unchanged as the question
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(prompt_func)
    | llm
    | StrOutputParser()
)

In [35]:
chain.invoke("tell me about handbook pages count")

'The GitLab handbook is stated to be "over two thousand pages long".  While specific historical word and page counts are referenced as being available, the provided text doesn\'t display any of those historical values. It does describe the methods used to calculate these metrics:\n\n* **Page Count:** `grep -l -r "\\" * | wc -l` run from the root of the repository.  This command essentially counts the number of files containing a backslash, which seems like an unusual way to count pages. It\'s possible this is a typo and a different character was intended.\n* **Word Count:** `find sites/handbook/source/handbook -type f -name "*.md" -o -name "*.md.erb" | xargs wc -w` run from the root of the repository. This command finds all `.md` and `.md.erb` files within the specified directory and counts the words in them.\n\nThe text also mentions using Snowplow and Tableau for tracking handbook usage and viewing statistics, and links to a few potentially relevant pages (which are inaccessible with