# NB02- Explore SentenceSplitter

From NB01, we determined that SentenceSplitter from ollama-index's documentation is an effective way of chunking for our purposes. This notebook will attempt to implement it using the docs we have collected via crawling and then test them on the ChatUI.

## Setup: Initialise SentenceSplitter() arguments and set up embedding LLM 

In [2]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Assuming constants and imports are defined elsewhere
DEFAULT_CHUNK_SIZE = 500  # This means each chunk has at most 500 tokens
SENTENCE_CHUNK_OVERLAP = 50  # Example overlap
CHUNKING_REGEX = r"[^,\.;]+[,\.;]?"  # Simple sentence splitter regex
DEFAULT_PARAGRAPH_SEP = "\n\n"  # Paragraph separator


embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") 

embeddings = embed_model.get_text_embedding("Hello World!") 
print(len(embeddings))
print(embeddings)





1024
[-0.015347783453762531, 0.032198358327150345, 0.004012150689959526, -0.007076343521475792, -0.03421807661652565, 0.010364635847508907, 0.005171587225049734, 0.035093992948532104, 0.0225681122392416, 0.015681663528084755, 0.020849039778113365, -0.0038733554538339376, 0.019704800099134445, -0.00847347266972065, -0.0241085235029459, 0.024853482842445374, -0.01752498745918274, -0.0347379632294178, -0.00883396714925766, 0.004576738923788071, -0.029032930731773376, 0.005700716748833656, -0.091826431453228, -0.05334915220737457, -0.01873025856912136, 0.05672815442085266, 0.041659094393253326, -0.014728585258126259, 0.05186482146382332, 0.07056008279323578, -0.032065510749816895, -0.026547519490122795, 0.022859014570713043, -0.04784966632723808, -0.003298336174339056, -0.011454702354967594, 0.05072872340679169, -0.048405423760414124, -0.014219076372683048, -0.053499486297369, 0.022067135199904442, 0.00993543490767479, 0.025218771770596504, -0.034378718584775925, -0.051167313009500504, 0.0

## Load and process LSE docs

In [3]:
import PyPDF2
import re


def read_pdf(file_path):
    # Initialize a variable to hold all the text
    all_text = ""
    
    # Open the PDF file
    with open(file_path, "rb") as file:
        # Initialize a PDF reader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Iterate through each page in the PDF
        for page in pdf_reader.pages:
            # Extract text from the page
            text = page.extract_text()
            if text:
                all_text += text  # Append the extracted text to all_text

    return all_text

confidentiality_text = read_pdf("../docs/ConfidentialityPolicy.pdf")
thesis_text = read_pdf("../docs/Formatting-and-binding-your-thesis-2021-22.pdf")
twenty_thirty_text = read_pdf("../docs/LSE-2030-booklet.pdf")
msc_markframes_text = read_pdf("../docs/MSc-Mark-Frame.pdf")

test_texts = [confidentiality_text, thesis_text, twenty_thirty_text, msc_markframes_text]

def clean_text(text):
    # Replace all newline characters with an empty string
    cleaned_text = re.sub(r'\n', ' ', text)
    # Replace two or more spaces with a single space
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    # Replace a space followed by a period with just a period
    cleaned_text = re.sub(r' \.', '.', cleaned_text)
    # Replace a space followed by a comma with just a comma
    cleaned_text = re.sub(r' ,', ',', cleaned_text)
    return cleaned_text

cleaned_texts = [clean_text(i) for i in test_texts]

In [3]:
import json
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the embedding model
embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")

# Define a helper function to generate chunk entries
def generate_chunk_entry(doc_type, chunk_name, chunk_description, embedding_model):
    try:
        embedding = embedding_model.get_text_embedding(chunk_description)
        return {
            "Type": doc_type,
            "Name": chunk_name,
            "Description": chunk_description,
            "Embedding": embedding  # Ensure the embedding is serializable
            
        }
    except Exception as e:
        print(f"Error computing embedding for chunk {chunk_name}: {e}")
        return None

# Define the function to generate a JSON entry for each document
def generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter):
    try:
        # Split the description into chunks
        sentence_chunks = splitter.split_text(description)
        chunks = []
        for i, chunk in enumerate(sentence_chunks, 1):
            chunk_entry = generate_chunk_entry(doc_type, f"{doc_name} - Part {i}", chunk, embed_model)
            if chunk_entry:
                chunks.append(chunk_entry)

        return {
            "Id": doc_id,
            "Name": doc_name,
            "Description": description,
            "Link": link,
            "Chunks": chunks
        }
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None

# Assuming 'cleaned_texts' and 'SentenceSplitter' are defined elsewhere in your code
documents = [
    (1, "PDF", "Immigration Advice Confidentiality Policy", cleaned_texts[0], "https://info.lse.ac.uk/current-students/immigration-advice/assets/documents/Info-Sheets/ConfidentialityPolicy.pdf"),
    (2, "PDF", "Formatting and binding your thesis", cleaned_texts[1], "https://info.lse.ac.uk/current-students/phd-academy/assets/documents/Formatting-and-binding-your-thesis-2021-22.pdf"),
    (3, "PDF", "LSE 2030 Strategy", cleaned_texts[2], "https://www.lse.ac.uk/2030/assets/pdf/LSE-2030-booklet.pdf"),
    (4, "PDF", "MSc Mark-Frame", cleaned_texts[3], "https://www.lse.ac.uk/sociology/assets/documents/study/Assessment-and-Feedback/MSc-Mark-Frame.pdf"),
]

json_data = []

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)

# Process each document to generate a JSON entry
for doc_id, doc_type, doc_name, description, link in documents:
    json_entry = generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter)
    if json_entry:
        json_data.append(json_entry)
    else:
        print(f"Failed to create JSON entry for {doc_name}")

# Save the JSON data to a file
json_file_path = "seed_lse_data.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")




JSON file created successfully at seed_lse_data.json


In [3]:
import PyPDF2
import re
import json
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

def read_pdf(file_path):
    all_text = []
    with open(file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            text = page.extract_text()
            if text:
                all_text.append((page_number, text))
    return all_text

def clean_text(text):
    cleaned_text = re.sub(r'\n', ' ', text)
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    cleaned_text = re.sub(r' \.', '.', cleaned_text)
    cleaned_text = re.sub(r' ,', ',', cleaned_text)
    return cleaned_text

def generate_chunk_entry(doc_type, doc_name, chunk_description, page_number, chunk_number, embedding_model):
    try:
        embedding = embedding_model.get_text_embedding(chunk_description)
        chunk_name = f"{doc_name} - Page {page_number}, Chunk {chunk_number}"
        return {
            "Type": doc_type,
            "Name": chunk_name,
            "Description": chunk_description,
            "PageNumber": page_number,
            "Embedding": embedding
        }
    except Exception as e:
        print(f"Error computing embedding for chunk {chunk_name}: {e}")
        return None

def generate_json_entry(doc_id, doc_type, doc_name, pages_text, link, splitter):
    try:
        chunks = []
        for page_number, text in pages_text:
            sentence_chunks = splitter.split_text(text)
            for chunk_number, chunk in enumerate(sentence_chunks, start=1):
                chunk_entry = generate_chunk_entry(doc_type, doc_name, chunk, page_number, chunk_number, embed_model)
                if chunk_entry:
                    chunks.append(chunk_entry)

        return {
            "Id": doc_id,
            "Name": doc_name,
            "Description": link,
            "Link": link,
            "Chunks": chunks
        }
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None

embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)

pdf_files = ["../docs/ConfidentialityPolicy.pdf", "../docs/Formatting-and-binding-your-thesis-2021-22.pdf", "../docs/LSE-2030-booklet.pdf", "../docs/MSc-Mark-Frame.pdf", "../docs/bsc-handbook-21.22.pdf", "../docs/UG-Student-Handbook-Department-of-International-History-2023-24 (1).pdf", "../docs/Exam-Procedures-for-Candidates.pdf", "../docs/Spring-Exam-Timetable-2024-Final.pdf"]
documents_info = [
    (1, "PDF", "Immigration Advice Confidentiality Policy", "https://info.lse.ac.uk/current-students/immigration-advice/assets/documents/Info-Sheets/ConfidentialityPolicy.pdf"),
    (2, "PDF", "Formatting and binding your thesis", "https://info.lse.ac.uk/current-students/phd-academy/assets/documents/Formatting-and-binding-your-thesis-2021-22.pdf"),
    (3, "PDF", "LSE 2030 Strategy", "https://www.lse.ac.uk/2030/assets/pdf/LSE-2030-booklet.pdf"),
    (4, "PDF", "MSc Mark-Frame", "https://www.lse.ac.uk/sociology/assets/documents/study/Assessment-and-Feedback/MSc-Mark-Frame.pdf"),
    (5, "PDF", "BSc Economics Handbook 2021/22", "https://www.lse.ac.uk/economics/Assets/Documents/undergraduate-study/bsc-handbook-21.22.pdf"),
    (6, "PDF", "UG History Department Handbook 2023/24", "https://www.lse.ac.uk/International-History/Assets/Documents/student-handbooks/2023-24/UG-Student-Handbook-Department-of-International-History-2023-24.pdf"),
    (7, "PDF", "Exam Procedure for Candidates", "https://info.lse.ac.uk/current-students/services/assets/documents/Exam-Procedures-for-Candidates.pdf"),
    (8, "PDF", "Spring Exam Timetable 2024", "https://info.lse.ac.uk/current-students/services/assets/documents/Spring-Exam-Timetable-2024-Final.pdf")
]

json_data = []
for (doc_id, doc_type, doc_name, link), file_path in zip(documents_info, pdf_files):
    pages_text = [(page_number, clean_text(text)) for page_number, text in read_pdf(file_path)]
    json_entry = generate_json_entry(doc_id, doc_type, doc_name, pages_text, link, splitter)
    if json_entry:
        json_data.append(json_entry)

json_file_path = "seed_lse_data_2.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")


JSON file created successfully at seed_lse_data_2.json


I found that with the above method, it splits chunks based on sentences as well... this means that when a sentence ends, it stops the chunk, which isn't exactly desirable. Let's try to find a way to work around it.

I have found out that the issue is to do with the fact that we are enumerating based on pages, and in doing so we split based on pages. Let's just get rid of this.

In [5]:
import json
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the embedding model
embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")

def clean_text(text):
    cleaned_text = re.sub(r'\n', ' ', text)
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    cleaned_text = re.sub(r' \.', '.', cleaned_text)
    cleaned_text = re.sub(r' ,', ',', cleaned_text)
    return cleaned_text

# Define a helper function to generate chunk entries
def generate_chunk_entry(doc_type, chunk_name, chunk_description, embedding_model):
    try:
        embedding = embedding_model.get_text_embedding(chunk_description)
        return {
            "Type": doc_type,
            "Name": chunk_name,
            "Description": chunk_description,
            "Embedding": embedding  # Ensure the embedding is serializable
            
        }
    except Exception as e:
        print(f"Error computing embedding for chunk {chunk_name}: {e}")
        return None

# Define the function to generate a JSON entry for each document
def generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter):
    try:
        # Split the description into chunks
        sentence_chunks = splitter.split_text(description)
        chunks = []
        for i, chunk in enumerate(sentence_chunks, 1):
            chunk_entry = generate_chunk_entry(doc_type, f"{doc_name} - Part {i}", chunk, embed_model)
            if chunk_entry:
                chunks.append(chunk_entry)

        return {
            "Id": doc_id,
            "Name": doc_name,
            "Description": description,
            "Link": link,
            "Chunks": chunks
        }
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None

pdf_files = ["../docs/ConfidentialityPolicy.pdf", "../docs/Formatting-and-binding-your-thesis-2021-22.pdf", "../docs/LSE-2030-booklet.pdf", "../docs/MSc-Mark-Frame.pdf", "../docs/bsc-handbook-21.22.pdf", "../docs/UG-Student-Handbook-Department-of-International-History-2023-24 (1).pdf", "../docs/Exam-Procedures-for-Candidates.pdf", "../docs/Spring-Exam-Timetable-2024-Final.pdf"]
documents= [
    (1, "PDF", "Immigration Advice Confidentiality Policy", "https://info.lse.ac.uk/current-students/immigration-advice/assets/documents/Info-Sheets/ConfidentialityPolicy.pdf"),
    (2, "PDF", "Formatting and binding your thesis", "https://info.lse.ac.uk/current-students/phd-academy/assets/documents/Formatting-and-binding-your-thesis-2021-22.pdf"),
    (3, "PDF", "LSE 2030 Strategy", "https://www.lse.ac.uk/2030/assets/pdf/LSE-2030-booklet.pdf"),
    (4, "PDF", "MSc Mark-Frame", "https://www.lse.ac.uk/sociology/assets/documents/study/Assessment-and-Feedback/MSc-Mark-Frame.pdf"),
    (5, "PDF", "BSc Economics Handbook 2021/22", "https://www.lse.ac.uk/economics/Assets/Documents/undergraduate-study/bsc-handbook-21.22.pdf"),
    (6, "PDF", "UG History Department Handbook 2023/24", "https://www.lse.ac.uk/International-History/Assets/Documents/student-handbooks/2023-24/UG-Student-Handbook-Department-of-International-History-2023-24.pdf"),
    (7, "PDF", "Exam Procedure for Candidates", "https://info.lse.ac.uk/current-students/services/assets/documents/Exam-Procedures-for-Candidates.pdf"),
    (8, "PDF", "Spring Exam Timetable 2024", "https://info.lse.ac.uk/current-students/services/assets/documents/Spring-Exam-Timetable-2024-Final.pdf")

]

json_data = []

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)

# Process each document to generate a JSON entry
for doc_id, doc_type, doc_name, description, link in documents:
    json_entry = generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter)
    if json_entry:
        json_data.append(json_entry)
    else:
        print(f"Failed to create JSON entry for {doc_name}")

# Save the JSON data to a file
json_file_path = "seed_lse_data_3.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")


ValueError: not enough values to unpack (expected 5, got 4)