In [1]:
from typing import Any
import os
from dotenv import load_dotenv
from unstructured.partition.pdf import partition_pdf
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
import base64
from langchain.schema.messages import HumanMessage, SystemMessage
from base64 import b64decode 
from dotenv import load_dotenv
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
import uuid
from langchain.schema.document import Document
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")

llm = ChatGoogleGenerativeAI(google_api_key=google_api_key, model="gemini-1.5-flash")
embedder = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-m3"
)
persist_directory = "persisted_embeddings_2"

In [14]:
import os
import base64
import uuid
from dotenv import load_dotenv
from unstructured.partition.pdf import partition_pdf
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.messages import HumanMessage
import time

load_dotenv()

# Paths for input and output directories
input_dir = "./data"
output_dir = "./generated_content"
figures_dir = "./figures"
os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists
os.makedirs(figures_dir, exist_ok=True)  # Directory for extracted images


def encode_image(image_path):
    """Encode image as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def image_captioning(img_base64, prompt):
    """Generate a summary for an image using the llm."""
    msg = llm.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content


def process_pdf(path: str, filename):
    """Extract elements from the PDF."""
    filename_no_ext = os.path.splitext(filename)[0]
    os.makedirs(f"{figures_dir}/{filename_no_ext}", exist_ok=True)
    return partition_pdf(
        filename=path,
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        extract_image_block_output_dir=f"{figures_dir}/{filename_no_ext}",
    )


def data_category(raw_pdf_elements):  # we may use decorator here
    tables = []
    texts = []
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            tables.append(str(element))
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            texts.append(str(element))
    data_category = [texts, tables]
    return data_category


def tables_summarize(tables):
    """Generate summaries for tables."""
    prompt_text = """You are an assistant tasked with summarizing tables. \
                    Give a concise summary of the table. Table chunk: {element}"""

    prompt = ChatPromptTemplate.from_template(prompt_text)
    summarize_chain = {"element": lambda x: x} | prompt | llm | StrOutputParser()
    table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

    return table_summaries


for filename in os.listdir(input_dir):
    if filename.endswith(".pdf"):
        curr_fig_path = f"{figures_dir}/{os.path.splitext(filename)[0]}"
        # Full path to the PDF
        pdf_path = os.path.join(input_dir, filename)
        # Process PDF to extract raw elements
        raw_pdf_elements = process_pdf(pdf_path, filename)
        # Separate tables and images
        texts, tables = data_category(raw_pdf_elements)
        # Summarize tables
        table_summaries = tables_summarize(tables)
        print(f"Generated summaries for {len(table_summaries)} tables")
        time.sleep(60)
        img_base64_list = []
        # Store image summaries
        image_summaries = []
        # Prompt : Our prompt here is customized to the type of images we have which is chart in our case
        prompt = "Describe the image in detail. Be specific about graphs, such as bar plots."
        # Read images, encode to base64 strings
        for img_file in sorted(os.listdir(curr_fig_path)):
            if img_file.endswith(".jpg"):
                img_path = os.path.join(curr_fig_path, img_file)
                base64_image = encode_image(img_path)
                img_base64_list.append(base64_image)
                img_capt = image_captioning(base64_image, prompt)
                # time.sleep(60)
                image_summaries.append(img_capt)
        # Combine summaries
        content = "\n\n".join(
            [
                "\n".join(texts),
                "Summary of Tables:\n" + "\n".join(table_summaries),
                "Summary of Images:\n" + "\n".join(image_summaries),
            ]
        )
        # Save content to a file with the same name as the PDF
        output_path = os.path.join(
            output_dir, f"{os.path.splitext(filename)[0]}.txt"
        )
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Generated summary for {filename} saved to {output_path}")
        time.sleep(60)
        print("sleeping for 60 seconds")



Generated summaries for 3 tables
Generated summary for About the Handbook _ The GitLab Handbook.pdf saved to ./generated_content\About the Handbook _ The GitLab Handbook.txt
sleeping for 60 seconds
Generated summaries for 0 tables
Generated summary for CEO _ The GitLab Handbook.pdf saved to ./generated_content\CEO _ The GitLab Handbook.txt
sleeping for 60 seconds
Generated summaries for 0 tables
Generated summary for Customer Success _ The GitLab Handbook.pdf saved to ./generated_content\Customer Success _ The GitLab Handbook.txt
sleeping for 60 seconds
Generated summaries for 0 tables
Generated summary for Data team GitLab documentation.pdf saved to ./generated_content\Data team GitLab documentation.txt
sleeping for 60 seconds
Generated summaries for 11 tables
Generated summary for Data Team Organization _ The GitLab Handbook.pdf saved to ./generated_content\Data Team Organization _ The GitLab Handbook.txt
sleeping for 60 seconds
Generated summaries for 0 tables
Generated summary for 

The PDF <_io.BufferedReader name='./data\\Expense Reports.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


sleeping for 60 seconds
Generated summaries for 2 tables
Generated summary for Expense Reports.pdf saved to ./generated_content\Expense Reports.txt
sleeping for 60 seconds
Generated summaries for 3 tables
Generated summary for Finance _ The GitLab Handbook.pdf saved to ./generated_content\Finance _ The GitLab Handbook.txt
sleeping for 60 seconds
Generated summaries for 8 tables
Generated summary for GitLab Values _ The GitLab Handbook.pdf saved to ./generated_content\GitLab Values _ The GitLab Handbook.txt
sleeping for 60 seconds
Generated summaries for 5 tables
Generated summary for messy_customer_support_logs_large.pdf saved to ./generated_content\messy_customer_support_logs_large.txt
sleeping for 60 seconds
Generated summaries for 0 tables
Generated summary for Org structure subsidy TC.pdf saved to ./generated_content\Org structure subsidy TC.txt
sleeping for 60 seconds
Generated summaries for 0 tables
Generated summary for Triage Operations _ The GitLab Handbook.pdf saved to ./gene

KeyboardInterrupt: 

In [13]:
texts

['22/10/2024, 14:54\n\nAbout the Handbook | The GitLab Handbook\n\nThe GitLab Handbook\n\nGitLab TeamOps Handbook Job Families Reports\n\nAbout the Handbook\n\nHistory of the handbook\n\nThe handbook started when GitLab was a company of just ten people to make sharing information efficient and easy. We knew that future GitLab team-members wouldnʼt be able to see emails about process changes that were being sent before they joined and that most of the people who would eventually join GitLab likely hadnʼt even heard of us yet. The handbook was our way of ensuring that all of our company information was accessible to everyone regardless of when they became part of the team.\n\nAdvantages\n\nAt GitLab our handbook is extensive and keeping it relevant is an important part of\n\neveryoneʼs job. It is a vital part of who we are and how we communicate. We established these processes because we saw these benefits:\n\n. Reading is much faster than listening.\n\n. Reading is async, you donʼt have

In [None]:
from langchain_ollama.llms import OllamaLLM

llm = OllamaLLM(model="llama3")

llm.invoke("Hello")

./figures/About the Handbook _ The GitLab Handbook
./data\About the Handbook _ The GitLab Handbook.pdf
./figures/CEO _ The GitLab Handbook
./data\CEO _ The GitLab Handbook.pdf


KeyboardInterrupt: 

In [4]:
curr_fig_path="figures\About the Handbook _ The GitLab Handbook"

  curr_fig_path="figures\About the Handbook _ The GitLab Handbook"
