# Chunking text strategies

https://www.pinecone.io/learn/chunking-strategies/

In [3]:
import os
from tqdm import tqdm
import numpy as np

In [4]:
from langchain.docstore.document import Document
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)

    docs = []
    for page_num in range(doc.page_count):
        page = doc[page_num]
        docs.append(Document(page_content=page.get_text(), metadata={'page': page_num, 'source': pdf_path}))
    doc.close()

    return docs

def process_pdfs_in_folder(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        print(file_path)
        if os.path.isdir(file_path):
            # If it's a folder, call the function recursively
            process_pdfs_in_folder(file_path)
        elif filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            docs = extract_text_from_pdf(pdf_path)
            all_docs.append(docs)
    return all_docs

In [7]:
# Set your folder path, chunk size, and overlap
folder_path = "investigations/sample"

loaded_documents = process_pdfs_in_folder(folder_path)
# convert list of lists to a single list
loaded_documents = [item for sublist in loaded_documents for item in sublist]

print(f"Length of loaded pages: {len(loaded_documents)}")

investigations/sample\AFM_annualreport_2022.pdf
investigations/sample\mckinsey-tech-trends-outlook-2022-full-report.pdf
investigations/sample\mgi-reinventing-construction-a-route-to-higher-productivity-full-report.pdf
investigations/sample\Procter&Gamble_annualreport_2023.pdf
investigations/sample\the-state-of-organizations-2023.pdf
Length of loaded pages: 724


## Recursive Chunking from Langchain (512 tokens and 64 overlap)

Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators. If the initial attempt at splitting the text doesn’t produce chunks of the desired size or structure, the method recursively calls itself on the resulting chunks with a different separator or criterion until the desired chunk size or structure is achieved. This means that while the chunks aren’t going to be exactly the same size, they’ll still “aspire” to be of a similar size.

In [23]:
import os
import glob
from typing import List
from multiprocessing import Pool
from tqdm import tqdm
from langchain.document_loaders import CSVLoader, PyPDFLoader, Docx2txtLoader
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader

In [24]:
# Map file extensions to document loaders and their arguments
loaders_mapping = {
    ".csv": CSVLoader,
    ".docx": Docx2txtLoader,
    ".pdf": PyPDFLoader
}

def load_document(file_path: str) -> Document:
    ## Find extension of the file
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in loaders_mapping:
        # Find the appropriate loader class
        loader_class = loaders_mapping[ext]
        # Invoke the instance of document loader
        loader = loader_class(file_path)
        ## Return the loaded document
        return loader.load()
    else:
        raise ValueError(f"Unsupported file extension '{ext}'")

def load_documents(source_dir: str) -> List[Document]:
    """
    Loads all documents from the source documents directory
    """
    all_files = []
    for ext in loaders_mapping:
        #Find all the files within source documents which matches the extensions in loaders_mapping
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
        )

    ## Spinning up resource pool
    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(all_files), desc='Loading new documents', ncols=80) as pbar:
            # Load each document from filtered files list using load_single_document function
            for i, doc in enumerate(pool.imap_unordered(load_document, all_files)):
                results.extend(doc)
                pbar.update()

    return results

In [25]:
loaded_documents = load_documents("sample")
print(f"Length of loaded documents: {len(loaded_documents)}")

Loading new documents: 100%|██████████████████████| 1/1 [00:17<00:00, 17.58s/it]

Length of loaded documents: 88





In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split docs in chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64
)

rec_texts = text_splitter.split_documents(loaded_documents)
len(rec_texts)

813

In [27]:
# Print or do something with the chunks
for i, chunk in enumerate(rec_texts):
    print(f"Chunk {i + 1}:")
    print(chunk)
    print("\n" + "="*20 + "\n")

Chunk 1:
page_content='2023  \nAnnual  \nReport' metadata={'source': 'sample/Procter&Gamble_annualreport_2023.pdf', 'page': 0}


Chunk 2:
page_content='FINANCIAL HIGHLIGHTS (UNAUDITED) \nAmounts in billions, except per share amounts\n2023 2022 2021 2020 2019\nNet Sales $82.0 $80.2 $76.1 $71.0 $67.7\nOperating Income $18.1 $17.8 $18.0 $15.7 $5.5\nNet Earnings  \nAttributable to P&G$14.7 $14.7 $14.3 $13.0 $3.9\nNet Earnings Margin 18.0% 18.4% 18.9% 18.5% 5.9%\nDiluted Net Earnings  per Common Share\u200a\n1$5.90 $5.81 $5.50 $4.96 $1.43\nCore Earnings  per Share\u2009\n2$5.90 $5.81 $5.66 $5.12 $4.52\nOperating Cash Flow $16.8 $16.7 $18.4 $17.4 $15.2' metadata={'source': 'sample/Procter&Gamble_annualreport_2023.pdf', 'page': 1}


Chunk 3:
page_content='Operating Cash Flow $16.8 $16.7 $18.4 $17.4 $15.2\nDividends per  Common Share$3.68 $3.52 $3.24 $3.03 $2.902023 NET SALES BY BUSINESS SEGMENT 3\n  \n  \n  \n \n \n2\n023 NET SALES BY GEOGRAPHIC REGION\n \n \n \n \n \n(1) Diluted net earnings

## Advanced NLP chunking

spaCy is another powerful Python library for NLP tasks. It offers a sophisticated sentence segmentation feature that can efficiently divide the text into separate sentences, enabling better context preservation in the resulting chunks.

In [28]:
SpacyTextSplitter.__init__.__code__.co_varnames[1:]

('separator', 'pipeline', 'max_length', 'kwargs')

In [33]:
import os
from langchain.text_splitter import SpacyTextSplitter

max_chunk_size = 250

# split docs in chunks
text_splitter = SpacyTextSplitter(
    chunk_size=max_chunk_size,
    chunk_overlap=64
)

nlp_texts = text_splitter.split_documents(loaded_documents)
len(nlp_texts)



1505

In [34]:
# Print or do something with the chunks
for i, chunk in enumerate(nlp_texts):
    print(f"Chunk {i + 1}:")
    print(chunk)
    print("\n" + "="*20 + "\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Chunk 506:
page_content='These are 1) leveraging environmental sustainability as an additiona l \ndriver of su perior performing products and packaging innovations, 2) increasing digital acumen to drive consumer and  \ncustomer preference, reduce cost and enable rapid and efficient decision making, 3) developing next -level supply chain \ncapabilities to enable flexibil ity, agility, resilience and a new level of productivity and 4) delivering employee value equation  \nfor all gender identities, races, ethnicities, sexual orientations, ages and abilities for all roles to ensure we continue to  attract, \nretain and develop th e best talent.' metadata={'source': 'sample/Procter&Gamble_annualreport_2023.pdf', 'page': 27}


Chunk 507:
page_content='We believe this strategy is right for the long -term health of the Company and our objective of delivering total shareholder return \nin the top one -third of our peer group.' me

###Into csv for comparison

In [None]:
import pandas as pd

rec_df = pd.DataFrame(columns= [
    'recursive_content',
    'recrusive_source'])

nlp_df = pd.DataFrame(columns= [
    'nlp_content',
    'nlp_source'])

rec_df['content'] = [document.page_content for document in rec_texts]
nlp_df['content'] = [document.page_content for document in nlp_texts]
rec_df['source'] = [document.metadata for document in rec_texts]
nlp_df['source'] = [document.metadata for document in nlp_texts]

In [None]:
rec_df.to_csv('recursive_text_chunks.csv', index=False)
nlp_df.to_csv('advNLP_chunks.csv', index=False)

## Advanced NLP chunking with further splitting with recusrive to have max length

In [54]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create a new list to store the updated documents
new_nlp_texts = []

# Define the new text splitter
new_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_chunk_size,
    chunk_overlap=64
)

# Iterate through each document in nlp_texts
for chunk in nlp_texts:
    # Split each chunk in the document using the new text splitter
    new_chunks = []
    # Check if the chunk's length (in tokens) is greater than max_chunk_size
    if len(chunk.tokens) > max_chunk_size:
        # If yes, further split the chunk using the new text splitter
        new_sub_chunks = new_text_splitter.split_text(chunk.text)
        new_chunks.extend(new_sub_chunks)
    else:
        # If no, keep the original chunk
        new_chunks.append(chunk)

    # Create a new document with the updated chunks
    new_document = Document(new_chunks)

    # Append the new document to the list
    new_nlp_texts.append(new_document)


TypeError: Serializable.__init__() takes 1 positional argument but 2 were given

## Hierarchical topic segmentation text chunking

### First analyse optimal nbr of clusers

The term "Cost" in the context of the K-means clustering algorithm usually refers to the inertia or within-cluster sum of squares. Inertia measures how far the points within a cluster are from the centroid of that cluster. The goal of K-means is to minimize the inertia, indicating that the clusters are tight and well-separated.

In [None]:
import os
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import nltk
import matplotlib.pyplot as plt

# Download BERT model from nltk
nltk.download('punkt')

def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def embed_text_with_bert(text, model):
    sentences = nltk.sent_tokenize(text)
    embeddings = model.encode(sentences)
    return sentences, embeddings

def find_optimal_clusters(embeddings, max_clusters=100):
    costs = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(embeddings)
        costs.append(kmeans.inertia_)
    return costs

def plot_elbow(costs):
    plt.plot(range(1, len(costs) + 1), costs, marker='o')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Cost (Inertia)')
    plt.show()

# Specify the folder containing PDF files
pdf_folder = 'dataset'

# Load BERT model for embedding
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Process each PDF file in the folder
for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, filename)

        # Read the PDF content
        pdf_text = read_pdf(pdf_path)

        # Embed the text using BERT
        sentences, embeddings = embed_text_with_bert(pdf_text, bert_model)

        # Find the optimal number of clusters using the elbow method
        costs = find_optimal_clusters(embeddings)
        plot_elbow(costs)


### Now, test with max nbr of tokens per cluster