In [None]:
gcloud auth login --no-launch-browser
# i used this command in the terminal to connect to Google Cloud

In [None]:
%pip install -q google-cloud-storage unstructured langchain python-magic sqlalchemy langchain_google_cloud_sql_pg
%pip install -q "unstructured[pptx]"
%pip install -q "unstructured[pdf]"

In [1]:
import os
from langchain_core.documents import Document
from langchain_unstructured import UnstructuredLoader
from PIL import Image

#repertoire pour les fichiers
LOCAL_DIRECTORY = "./downloaded_files"

def read_local_files(directory: str) -> list[Document]:
    documents = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            try:
                loader = UnstructuredLoader(filepath)
                docs = loader.lazy_load()
                for doc in docs:
                    doc.metadata["source"] = filepath
                    documents.append(doc)
            except Exception as e:
                print(f"An error occurred while processing '{filename}': {e}")
        elif filename.endswith(".pptx"):
            try:
                loader = UnstructuredLoader(filepath)
                docs = loader.lazy_load()
                for doc in docs:
                    doc.metadata["source"] = filepath
                    documents.append(doc)
            except Exception as e:
                print(f"An error occurred while processing '{filename}': {e}")
        elif filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg"):
            try:
                image = Image.open(filepath)
                doc = Document(
                    page_content="", 
                    metadata={"source": filepath, "content_type": "image"}
                )
                documents.append(doc)
            except Exception as e:
                print(f"An error occurred while processing '{filename}': {e}")
    return documents

documents = read_local_files(LOCAL_DIRECTORY)

  from .autonotebook import tqdm as notebook_tqdm
INFO: pikepdf C++ to Python logger bridge initialized


In [None]:

for doc in documents:
    print("-" * 50)
    print("Page Content: ", doc.page_content)
    print("Metadata: ", doc.metadata)

--------------------------------------------------
Page Content:  
Metadata:  {'source': './downloaded_files\\66ffbe0e174359fb2d2cdc3a_65f064b0c59feeb65a00d202_ESG_schema-FR2.jpeg', 'content_type': 'image'}
--------------------------------------------------
Page Content:  
Metadata:  {'source': './downloaded_files\\definition-rse-esg-FR-Capterra-infographic-1-1.jpg', 'content_type': 'image'}
--------------------------------------------------
Page Content:  
Metadata:  {'source': './downloaded_files\\ESG-Image-1-1-1024x499.jpg', 'content_type': 'image'}
--------------------------------------------------
Page Content:  1
Metadata:  {'source': './downloaded_files\\ESG-Presentation.pptx', 'category_depth': 0, 'file_directory': './downloaded_files', 'filename': 'ESG-Presentation.pptx', 'last_modified': '2025-02-26T16:44:37', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'category': 'UncategorizedText', 'eleme

In [9]:
%pip install pymupdf

Collecting pymupdfNote: you may need to restart the kernel to use updated packages.

  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
    --------------------------------------- 0.3/16.5 MB ? eta -:--:--
   -- ------------------------------------- 1.0/16.5 MB 3.4 MB/s eta 0:00:05
   ----- ---------------------------------- 2.1/16.5 MB 3.8 MB/s eta 0:00:04
   ------ --------------------------------- 2.9/16.5 MB 4.0 MB/s eta 0:00:04
   --------- ------------------------------ 3.9/16.5 MB 4.3 MB/s eta 0:00:03
   --------- ------------------------------ 3.9/16.5 MB 4.3 MB/s eta 0:00:03
   ---------- ----------------------------- 4.2/16.5 MB 3.4 MB/s eta 0:00:04
   ---------- ----------------------------- 4.5/16.5 MB 3.0 MB/s eta 0:00:05
   ----------- ---------------------------- 4.7/16.5 MB 2.7 MB/s eta 0:00:05
   ------------ -----------

In [None]:
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_image_embedding(image: Image.Image) -> list[float]:
    """
    Generates embeddings for an image using OpenAI's CLIP model.

    Args:
        image (Image.Image): A PIL Image object.

    Returns:
        list[float]: A list of floats representing the image embedding.
    """
    try:
        inputs = processor(images=image, return_tensors="pt", padding=True)
        outputs = model.get_image_features(**inputs)
        embedding = outputs.tolist()[0] 
        return embedding
    except Exception as e:
        print(f"An error occurred while generating image embedding: {e}")
        return None



In [None]:

image = Image.open("./downloaded_files/images.png")  
embedding = get_image_embedding(image)

if embedding:
    print(f"Image embedding: {embedding}")
    print(f"Embedding length: {len(embedding)}")
else:
    print("Failed to generate embedding.")

Image embedding: [0.04763064906001091, -0.1022099107503891, -0.45288795232772827, 0.18605193495750427, -0.07452479749917984, -0.223678320646286, -0.3937239348888397, -0.06860151141881943, 0.8032571077346802, 0.2813357710838318, 0.32110559940338135, 0.09197026491165161, -0.5962035655975342, 0.18754452466964722, 0.815786600112915, 0.057586729526519775, -0.0011720359325408936, 0.3954155147075653, -0.06109168380498886, 0.5819792747497559, -0.4001431167125702, 0.16356688737869263, 0.3837440013885498, 0.03406720235943794, 0.3783265948295593, 0.4480671286582947, 0.13482415676116943, -0.3802879750728607, 0.003282502293586731, -0.37510740756988525, 0.09706033021211624, 0.31565868854522705, 0.43176791071891785, -0.29880979657173157, 0.6727957725524902, -0.012955516576766968, 0.029577195644378662, 0.35946425795555115, -0.07227019965648651, -0.5875550508499146, -0.3792976140975952, 0.16938374936580658, -0.29182976484298706, 0.159734845161438, -0.22557157278060913, -0.7507195472717285, 0.2188424915

In [13]:
from collections import defaultdict

def merge_documents_by_page(documents: list[Document]) -> list[Document]:
    merged_documents: list[Document] = []
    page_dict = {}

    # Group documents by page number
    for doc in documents:
        document_source = doc.metadata.get("source")
        page_number = doc.metadata.get("page_number")

        if page_number is not None and document_source is not None:
            key = (document_source, page_number)
            if key not in page_dict:
                page_dict[key] = [doc]
            else:
                page_dict[key].append(doc)

    # Merge documents for each page
    for (document_source, page_number), docs in page_dict.items():
        if docs:
            # Use the metadata of the first document in the group
            merged_metadata = docs[0].metadata
            # Concatenate the page content of all documents in the group
            merged_content = "\n".join(doc.page_content for doc in docs)
            # Create a new Document with merged content and metadata
            merged_documents.append(
                Document(metadata=merged_metadata, page_content=merged_content)
            )

    return merged_documents

# Merge the documents by page
merged_documents = merge_documents_by_page(documents)

In [14]:
print(merged_documents)

[Document(metadata={'source': './downloaded_files\\ESG-Presentation.pptx', 'category_depth': 0, 'file_directory': './downloaded_files', 'filename': 'ESG-Presentation.pptx', 'last_modified': '2025-02-26T16:44:37', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'category': 'UncategorizedText', 'element_id': '0b363374b1c5fd31b1534c68bbc5c3c5'}, page_content='1\nAn Introduction to ESG Investing\nMike Seagrove & Calum Butt\n'), Document(metadata={'source': './downloaded_files\\ESG-Presentation.pptx', 'category_depth': 1, 'file_directory': './downloaded_files', 'filename': 'ESG-Presentation.pptx', 'last_modified': '2025-02-26T16:44:37', 'page_number': 2, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'category': 'Title', 'element_id': '73710c008efe174cda61a695cc115d2c'}, page_content='Introduction to ESG Investing\nThe World Needs to Change\nBeing 

In [None]:
%pip install psycopg2

Collecting psycopg2Note: you may need to restart the kernel to use updated packages.

  Using cached psycopg2-2.9.10-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Using cached psycopg2-2.9.10-cp312-cp312-win_amd64.whl (1.2 MB)
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.10


In [None]:
import os
from glob import glob
image_folder = "./images"

image_files = glob(os.path.join(image_folder, "*.jpg")) + \
               glob(os.path.join(image_folder, "*.jpeg")) + \
               glob(os.path.join(image_folder, "*.png"))

print(f"Found {len(image_files)} image files: {image_files}")

Found 11 image files: ['./downloaded_files\\definition-rse-esg-FR-Capterra-infographic-1-1.jpg', './downloaded_files\\ESG-Image-1-1-1024x499.jpg', './downloaded_files\\Smoki-Wins-esg.jpg', './downloaded_files\\66ffbe0e174359fb2d2cdc3a_65f064b0c59feeb65a00d202_ESG_schema-FR2.jpeg', './downloaded_files\\images.jpeg', './downloaded_files\\graphs-the-distribution-of-raw-scores-for-the-ESG-rating-over-time-and-shows-that-over.png', './downloaded_files\\images (1).png', './downloaded_files\\images (2).png', './downloaded_files\\images.png', './downloaded_files\\les-criteres-esg-exemples.png', './downloaded_files\\Screenshot-2022-12-23-at-12.20.30.png']


In [17]:
import os
from config import PROJECT_ID, REGION, INSTANCE, DATABASE, DB_USER
DB_PASSWORD = os.environ["DB_PASSWORD"]

TABLE_NAME = "hibak_azzac_project_text"

from langchain_google_cloud_sql_pg import PostgresEngine

# Connect to the PostgreSQL database
engine = PostgresEngine.from_instance(
    project_id=PROJECT_ID,
    instance=INSTANCE,
    region=REGION,
    database=DATABASE,
    user=DB_USER,
    password=DB_PASSWORD,
)

# Create a table in the PostgreSQL database with the required columns
from sqlalchemy.exc import ProgrammingError

try:
    await engine.ainit_vectorstore_table(
        table_name=TABLE_NAME,
        vector_size=768,
    )
except ProgrammingError:
    print("Table already created")

from langchain_google_vertexai import VertexAIEmbeddings

embedding = VertexAIEmbeddings(
    model_name="textembedding-gecko@latest",
    project=PROJECT_ID
)

from langchain_google_cloud_sql_pg import PostgresVectorStore

vector_store = PostgresVectorStore.create_sync(
    engine=engine,
    table_name=TABLE_NAME,
    embedding_service=embedding,
)

# Add the merged documents to the vector store
vector_store.add_documents(merged_documents)

# Retriever setup
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5}
)


In [18]:
# Example query
query = "what is esg"
docs = retriever.get_relevant_documents(query)

for doc in docs:
    print("-" * 50)
    print("Content: ", doc.page_content)
    print("Metadata: ", doc.metadata)

  docs = retriever.get_relevant_documents(query)


--------------------------------------------------
Content:  ESG
Metadata:  {'source': './downloaded_files\\IR-ESG-PPT-4Q22.pdf', 'coordinates': {'points': [[133.82, 255.082], [133.82, 291.082], [219.86, 291.082], [219.86, 255.082]], 'system': 'PixelSpace', 'layout_width': 960, 'layout_height': 540}, 'file_directory': './downloaded_files', 'filename': 'IR-ESG-PPT-4Q22.pdf', 'languages': ['eng'], 'last_modified': '2025-02-26T16:46:24', 'page_number': 8, 'filetype': 'application/pdf', 'category': 'Title', 'element_id': 'bddd3c3ade44bb86a9fced43dbde9b65'}
--------------------------------------------------
Content:  Where are you on the ESG spectrum?
ESG is overcooked…
ESG is going to change everything…
“Climate change is not a financial risk we need to worry about”  
“ESG no longer a ‘woke’ agenda, it is now BAU”					
Stuart Kirk – Ex HSBC
Meg Lee – Hall&Willcox
“Amsterdam has been six meters underwater for ages, and that’s a really nice place. We will cope with it.”
“ESG is business as u

In [None]:
import os
from config import PROJECT_ID, REGION, INSTANCE, DATABASE, DB_USER
from langchain_google_cloud_sql_pg import PostgresEngine
from sqlalchemy.exc import ProgrammingError

DB_PASSWORD = os.environ["DB_PASSWORD"]

IMAGE_TABLE_NAME = "hibak_azzac_project_images"

# Connect to the PostgreSQL database
engine = PostgresEngine.from_instance(
    project_id=PROJECT_ID,
    instance=INSTANCE,
    region=REGION,
    database=DATABASE,
    user=DB_USER,
    password=DB_PASSWORD,
)

# Create a table in the PostgreSQL database for image embeddings
try:
    await engine.ainit_vectorstore_table(
        table_name=IMAGE_TABLE_NAME,
        vector_size=512,  # CLIP embeddings are 512-dimensional
    )
    print(f"Table '{IMAGE_TABLE_NAME}' created successfully.")
except ProgrammingError:
    print(f"Table '{IMAGE_TABLE_NAME}' already exists.")
except Exception as e:
    print(f"An error occurred while creating the table: {e}")

Table 'hibak_azzac_project_images' created successfully.


In [None]:
import os
from PIL import Image
import torch
from config import PROJECT_ID, REGION, INSTANCE, DATABASE, DB_USER
from langchain_google_cloud_sql_pg import PostgresEngine
from glob import glob


def connect_to_database():
    """Connects to the PostgreSQL database and returns the engine and table name."""
    DB_PASSWORD = os.environ["DB_PASSWORD"]

    IMAGE_TABLE_NAME = "hibak_azzac_project_images"

    # Connect to the PostgreSQL database
    engine = PostgresEngine.from_instance(
        project_id=PROJECT_ID,
        instance=INSTANCE,
        region=REGION,
        database=DATABASE,
        user=DB_USER,
        password=DB_PASSWORD,
    )

    return engine, IMAGE_TABLE_NAME



In [None]:
# Process each image file
for img_path in image_files:
    try:
        # Open the image using PIL
        image = Image.open(img_path)
        # Generate the embedding
        embedding = get_image_embedding(image)
        if embedding:
            print(f"Generated embedding for {img_path}:")
            print(embedding)  #embedding vector
            print(f"Embedding length: {len(embedding)}")  #length of the embedding
            print("-" * 50) 
    except Exception as e:
        print(f"An error occurred while processing {img_path}: {e}")

Generated embedding for ./images\img2.jpg:
[-0.4328739643096924, -0.30402839183807373, 0.04915030300617218, 0.024730637669563293, -0.023554254323244095, -0.23129865527153015, 0.2671767771244049, -0.01899532973766327, 0.4524613320827484, 0.03605195879936218, 0.03710119053721428, 0.25157058238983154, -0.2555978298187256, -0.2982223927974701, 0.01978575624525547, 0.04471907392144203, 0.20295190811157227, 0.11868172883987427, 0.22038012742996216, 0.4686555862426758, 0.4288938641548157, 0.6878823637962341, 0.17243725061416626, 0.14334629476070404, -0.4218912124633789, 0.31755954027175903, -0.19471557438373566, -0.051802750676870346, 0.357422411441803, 0.5261783003807068, 0.33525025844573975, 0.17928509414196014, -0.12491033971309662, 0.4743208885192871, 0.2943010926246643, -0.33864179253578186, 0.5299240350723267, 0.18493416905403137, 0.4487111270427704, -1.1680649518966675, -0.16907286643981934, -0.41720521450042725, -0.4300912618637085, 0.0380372628569603, -0.275659441947937, -0.136020913

In [None]:
import os
from config import PROJECT_ID, REGION, INSTANCE, DATABASE, DB_USER
from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore
from sqlalchemy.exc import ProgrammingError
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from typing import List
from langchain_core.documents import Document

DB_PASSWORD = os.environ["DB_PASSWORD"]

IMAGE_TABLE_NAME = "hibak_azzac_project_images"

# Connect to the PostgreSQL database
engine = PostgresEngine.from_instance(
    project_id=PROJECT_ID,
    instance=INSTANCE,
    region=REGION,
    database=DATABASE,
    user=DB_USER,
    password=DB_PASSWORD,
)

# Create a table in the PostgreSQL database for image embeddings
try:
    await engine.ainit_vectorstore_table(
        table_name=IMAGE_TABLE_NAME,
        vector_size=512,  
    )
    print(f"Table '{IMAGE_TABLE_NAME}' created successfully.")
except ProgrammingError:
    print(f"Table '{IMAGE_TABLE_NAME}' already exists.")
except Exception as e:
    print(f"An error occurred while creating the table: {e}")

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

class CLIPEmbeddings:
    def __init__(self, model, processor):
        self.model = model
        self.processor = processor
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """This method needs to be implemented for compatibility"""
        embeddings = []
        for text in texts:
            if os.path.exists(text):
                try:
                    image = Image.open(text)
                    embedding = self.get_image_embedding(image)
                    embeddings.append(embedding)
                except Exception as e:
                    print(f"Error embedding image {text}: {e}")
                    #zero vector if there's an error
                    embeddings.append([0.0] * 512)
            else:
                #zero vector for non image content
                embeddings.append([0.0] * 512)
        return embeddings
    
    def embed_query(self, text: str) -> List[float]:
        """This would be used for text queries to find similar images"""
        inputs = self.processor(text=text, return_tensors="pt", padding=True)
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
        return text_features.detach().numpy().tolist()[0]
    
    def get_image_embedding(self, image: Image.Image) -> List[float]:
        """Generate embeddings for an image using CLIP"""
        try:
            inputs = self.processor(images=image, return_tensors="pt", padding=True)
            with torch.no_grad():
                outputs = self.model.get_image_features(**inputs)
            embedding = outputs.detach().numpy().tolist()[0]
            return embedding
        except Exception as e:
            print(f"An error occurred while generating image embedding: {e}")
            return [0.0] * 512  

clip_embedding_service = CLIPEmbeddings(model, processor)

#create vector store with embedding service
vector_store = PostgresVectorStore.create_sync(
    engine=engine,
    table_name=IMAGE_TABLE_NAME,
    embedding_service=clip_embedding_service,
)

#process each image file and add to vector store
image_files = [
    "images/img1.jpeg", "images/img2.jpg", "images/img3.jpg", 
    "images/img4.png", "images/img5.png", "images/img6.png",
    "images/img7.jpeg", "images/img8.png", "images/img9.png", 
    "images/img10.png"
]

for img_path in image_files:
    try:
        #open image using PIL
        image = Image.open(img_path)
        
        #create a Document object with metadata
        #use image path as the page_content
        doc = Document(
            page_content=img_path, 
            metadata={
                "source": img_path,
                "filename": os.path.basename(img_path),
                "image_width": image.width,
                "image_height": image.height,
                "image_format": image.format,
            }
        )

        vector_store.add_documents([doc])
        
        print(f"Added {img_path} to vector store.")
            
    except Exception as e:
        print(f"An error occurred while processing {img_path}: {e}")

retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5}
)

Table 'hibak_azzac_project_images' already exists.
Added images/img1.jpeg to vector store.
Added images/img2.jpg to vector store.
Added images/img3.jpg to vector store.
Added images/img4.png to vector store.
Added images/img5.png to vector store.
Added images/img6.png to vector store.
Added images/img7.jpeg to vector store.
Added images/img8.png to vector store.
Added images/img9.png to vector store.
Added images/img10.png to vector store.


In [None]:
import os
from config import PROJECT_ID, REGION, INSTANCE, DATABASE, DB_USER
from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from typing import List
from langchain_core.documents import Document


def test_text_to_image_retrieval():
    """Test retrieving images based on text descriptions"""
    print("=" * 50)
    print("TESTING TEXT-TO-IMAGE RETRIEVAL")
    print("=" * 50)
    

    test_queries = [
        "esg"  
    ]
    
    for query in test_queries:
        print(f"\nSearching for images matching: '{query}'")
        #use retriever to find matching images
        results = retriever.get_relevant_documents(query)
        
        if results:
            print(f"Found {len(results)} matching images:")
            for i, doc in enumerate(results, 1):
                print(f"  {i}. {doc.metadata.get('filename', 'Unknown')} (source: {doc.metadata.get('source', 'Unknown')})")
        else:
            print("No matching images found.")

def test_image_to_image_retrieval():
    """Test retrieving similar images based on an example image"""
    print("\n" + "=" * 50)
    print("TESTING IMAGE-TO-IMAGE RETRIEVAL")
    print("=" * 50)
    
    #test image
    test_image_path = "images/img5.png" 
    
    try:
        print(f"\nSearching for images similar to: {test_image_path}")
        #generate embedding for test image
        test_image = Image.open(test_image_path)
        test_embedding = clip_embedding_service.get_image_embedding(test_image)
        
        #use the embedding to search for similar images
        from langchain_core.vectorstores import VectorStoreRetriever

        if hasattr(retriever, "vectorstore"):
            results = retriever.vectorstore.similarity_search_by_vector(
                test_embedding, 
                k=5
            )
            
            if results:
                print(f"Found {len(results)} similar images:")
                for i, doc in enumerate(results, 1):
                    print(f"  {i}. {doc.metadata.get('filename', 'Unknown')} (source: {doc.metadata.get('source', 'Unknown')})")
            else:
                print("No similar images found.")
        else:
            print("Retriever doesn't support vector similarity search.")
    except Exception as e:
        print(f"Error testing image-to-image retrieval: {e}")

def check_database_content():
    """Check how many images are stored in the database"""
    print("\n" + "=" * 50)
    print("CHECKING DATABASE CONTENT")
    print("=" * 50)
    
    try:
        import sqlalchemy
        
        connection_string = engine.get_connection_string()
        db = sqlalchemy.create_engine(connection_string)
        conn = db.connect()
        
        result = conn.execute(sqlalchemy.text(f"SELECT COUNT(*) FROM {IMAGE_TABLE_NAME}"))
        count = result.scalar()
        
        print(f"Number of images stored in the database: {count}")
        
        result = conn.execute(sqlalchemy.text(f"SELECT id, content, metadata FROM {IMAGE_TABLE_NAME} LIMIT 3"))
        print("\nSample data from the database:")
        for row in result:
            print(f"ID: {row[0]}")
            print(f"Content: {row[1]}")
            print(f"Metadata: {row[2]}")
            print("-" * 30)
        
        conn.close()
    except Exception as e:
        print(f"Error checking database content: {e}")

if __name__ == "__main__":
    check_database_content()
    test_text_to_image_retrieval()
    test_image_to_image_retrieval()


CHECKING DATABASE CONTENT
Error checking database content: 'PostgresEngine' object has no attribute 'get_connection_string'
TESTING TEXT-TO-IMAGE RETRIEVAL

Searching for images matching: 'esg'




No matching images found.

TESTING IMAGE-TO-IMAGE RETRIEVAL

Searching for images similar to: images/img5.png
Found 5 similar images:
  1. img5.png (source: images/img5.png)
  2. img4.png (source: images/img4.png)
  3. img8.png (source: images/img8.png)
  4. img10.png (source: images/img10.png)
  5. img1.jpeg (source: images/img1.jpeg)


In [17]:
%pip install ratelimit

Collecting ratelimit
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: ratelimit
  Building wheel for ratelimit (pyproject.toml): started
  Building wheel for ratelimit (pyproject.toml): finished with status 'done'
  Created wheel for ratelimit: filename=ratelimit-2.2.1-py3-none-any.whl size=5975 sha256=296dd576d62ca179a633bb47f480d8589e81710537e3ca259648252a321bf3d4
  Stored in directory: c:\users\msi\appdata\local\pip\cache\wheels\69\bd\e0\4a5dee2a1bfbc8e258f543f92940e2b494d63b5be8144ec8c4
Successfully built ratelimit
Installing collected packages: ratelimit
Successfully installed ratelimit-2.2.1
Note: you may need to restar

In [3]:
documents = read_local_files(LOCAL_DIRECTORY)
print(f"Total documents read: {len(documents)}")
for doc in documents:
    print(f"Source: {doc.metadata['source']}, Content Type: {doc.metadata.get('content_type', 'text')}")

Total documents read: 1673
Source: ./downloaded_files\66ffbe0e174359fb2d2cdc3a_65f064b0c59feeb65a00d202_ESG_schema-FR2.jpeg, Content Type: image
Source: ./downloaded_files\definition-rse-esg-FR-Capterra-infographic-1-1.jpg, Content Type: image
Source: ./downloaded_files\ESG-Image-1-1-1024x499.jpg, Content Type: image
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Type: text
Source: ./downloaded_files\ESG-Presentation.pptx, Content Ty

In [None]:
merged_documents = merge_documents_by_page(documents)
print(f"Total merged documents: {len(merged_documents)}")
for doc in merged_documents:
    print(f"Source: {doc.metadata['source']}, Page Number: {doc.metadata.get('page_number', 'N/A')}")
    print(f"Content: {doc.page_content[:100]}...") 

Total merged documents: 94
Source: ./downloaded_files\ESG-Presentation.pptx, Page Number: 1
Content: 1
An Introduction to ESG Investing
Mike Seagrove & Calum Butt
...
Source: ./downloaded_files\ESG-Presentation.pptx, Page Number: 2
Content: Introduction to ESG Investing
The World Needs to Change
Being part of the solution
Sustainability
Wh...
Source: ./downloaded_files\ESG-Presentation.pptx, Page Number: 3
Content: The World Needs to Change
The world is facing many real challenges, important and in some cases urge...
Source: ./downloaded_files\ESG-Presentation.pptx, Page Number: 4
Content: The World Needs to Change
4
Energy Supply
Housing
Sourcing greener energy
Solar panels
Smart meters
...
Source: ./downloaded_files\ESG-Presentation.pptx, Page Number: 5
Content: The World Needs to Change
Sustainability – Balancing the claims of the present, against the claims o...
Source: ./downloaded_files\ESG-Presentation.pptx, Page Number: 6
Content: What is ESG Investing?
Social
Mass migration
We