# NB02- Explore SentenceSplitter

From NB01, we determined that SentenceSplitter from ollama-index's documentation is an effective way of chunking for our purposes. This notebook will attempt to implement it using the docs we have collected via crawling and then test them on the ChatUI.

## Setup: Initialise SentenceSplitter() arguments and set up embedding LLM 

In [2]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Assuming constants and imports are defined elsewhere
DEFAULT_CHUNK_SIZE = 500  # This means each chunk has at most 500 tokens
SENTENCE_CHUNK_OVERLAP = 50  # Example overlap
CHUNKING_REGEX = r"[^,\.;]+[,\.;]?"  # Simple sentence splitter regex
DEFAULT_PARAGRAPH_SEP = "\n\n"  # Paragraph separator


embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") 

embeddings = embed_model.get_text_embedding("Hello World!") 
print(len(embeddings))
print(embeddings)





1024
[-0.015347783453762531, 0.032198358327150345, 0.004012150689959526, -0.007076343521475792, -0.03421807661652565, 0.010364635847508907, 0.005171587225049734, 0.035093992948532104, 0.0225681122392416, 0.015681663528084755, 0.020849039778113365, -0.0038733554538339376, 0.019704800099134445, -0.00847347266972065, -0.0241085235029459, 0.024853482842445374, -0.01752498745918274, -0.0347379632294178, -0.00883396714925766, 0.004576738923788071, -0.029032930731773376, 0.005700716748833656, -0.091826431453228, -0.05334915220737457, -0.01873025856912136, 0.05672815442085266, 0.041659094393253326, -0.014728585258126259, 0.05186482146382332, 0.07056008279323578, -0.032065510749816895, -0.026547519490122795, 0.022859014570713043, -0.04784966632723808, -0.003298336174339056, -0.011454702354967594, 0.05072872340679169, -0.048405423760414124, -0.014219076372683048, -0.053499486297369, 0.022067135199904442, 0.00993543490767479, 0.025218771770596504, -0.034378718584775925, -0.051167313009500504, 0.0

## Load and process LSE docs

In [11]:
import PyPDF2
import re


def read_pdf(file_path):
    # Initialize a variable to hold all the text
    all_text = ""
    
    # Open the PDF file
    with open(file_path, "rb") as file:
        # Initialize a PDF reader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Iterate through each page in the PDF
        for page in pdf_reader.pages:
            # Extract text from the page
            text = page.extract_text()
            if text:
                all_text += text  # Append the extracted text to all_text

    return all_text


file_paths = ["../docs/ConfidentialityPolicy.pdf", "../docs/Formatting-and-binding-your-thesis-2021-22.pdf", "../docs/LSE-2030-booklet.pdf", "../docs/MSc-Mark-Frame.pdf", "../docs/bsc-handbook-21.22.pdf", "../docs/UG-Student-Handbook-Department-of-International-History-2023-24 (1).pdf", "../docs/Exam-Procedures-for-Candidates.pdf", "../docs/Spring-Exam-Timetable-2024-Final.pdf"]

test_texts = [read_pdf(i) for i in file_paths]

def clean_text(text):
    # Replace all newline characters with an empty string
    cleaned_text = re.sub(r'\n', ' ', text)
    # Replace two or more spaces with a single space
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    # Replace a space followed by a period with just a period
    cleaned_text = re.sub(r' \.', '.', cleaned_text)
    # Replace a space followed by a comma with just a comma
    cleaned_text = re.sub(r' ,', ',', cleaned_text)
    return cleaned_text

cleaned_texts = [clean_text(i) for i in test_texts]

## Use SentenceSplitter()

In [4]:
import json
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the embedding model
embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")

# Define a helper function to generate chunk entries
def generate_chunk_entry(doc_type, chunk_name, chunk_description, embedding_model):
    try:
        embedding = embedding_model.get_text_embedding(chunk_description)
        return {
            "Type": doc_type,
            "Name": chunk_name,
            "Description": chunk_description,
            "Embedding": embedding  # Ensure the embedding is serializable
            
        }
    except Exception as e:
        print(f"Error computing embedding for chunk {chunk_name}: {e}")
        return None

# Define the function to generate a JSON entry for each document
def generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter):
    try:
        # Split the description into chunks
        sentence_chunks = splitter.split_text(description)
        chunks = []
        for i, chunk in enumerate(sentence_chunks, 1):
            chunk_entry = generate_chunk_entry(doc_type, f"{doc_name} - Part {i}", chunk, embed_model)
            if chunk_entry:
                chunks.append(chunk_entry)

        return {
            "Id": doc_id,
            "Name": doc_name,
            "Description": description,
            "Link": link,
            "Chunks": chunks
        }
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None

# Assuming 'cleaned_texts' and 'SentenceSplitter' are defined elsewhere in your code
documents = [
    (1, "PDF", "Immigration Advice Confidentiality Policy", cleaned_texts[0], "https://info.lse.ac.uk/current-students/immigration-advice/assets/documents/Info-Sheets/ConfidentialityPolicy.pdf"),
    (2, "PDF", "Formatting and binding your thesis", cleaned_texts[1], "https://info.lse.ac.uk/current-students/phd-academy/assets/documents/Formatting-and-binding-your-thesis-2021-22.pdf"),
    (3, "PDF", "LSE 2030 Strategy", cleaned_texts[2], "https://www.lse.ac.uk/2030/assets/pdf/LSE-2030-booklet.pdf"),
    (4, "PDF", "MSc Mark-Frame", cleaned_texts[3], "https://www.lse.ac.uk/sociology/assets/documents/study/Assessment-and-Feedback/MSc-Mark-Frame.pdf"),
    (5, "PDF", "BSc Economics Handbook 2021/22", cleaned_texts[4], "https://www.lse.ac.uk/economics/Assets/Documents/undergraduate-study/bsc-handbook-21.22.pdf"),
    (6, "PDF", "UG History Department Handbook 2023/24", cleaned_texts[5], "https://www.lse.ac.uk/International-History/Assets/Documents/student-handbooks/2023-24/UG-Student-Handbook-Department-of-International-History-2023-24.pdf"),
    (7, "PDF", "Exam Procedure for Candidates", cleaned_texts[6], "https://info.lse.ac.uk/current-students/services/assets/documents/Exam-Procedures-for-Candidates.pdf"),
    (8, "PDF", "Spring Exam Timetable 2024", cleaned_texts[7], "https://info.lse.ac.uk/current-students/services/assets/documents/Spring-Exam-Timetable-2024-Final.pdf")
    
]

json_data = []

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=256)

# Process each document to generate a JSON entry
for doc_id, doc_type, doc_name, description, link in documents:
    json_entry = generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter)
    if json_entry:
        json_data.append(json_entry)
    else:
        print(f"Failed to create JSON entry for {doc_name}")

# Save the JSON data to a file
json_file_path = "seed_lse_data_256.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")


JSON file created successfully at seed_lse_data_256.json


I found that with the above method, it splits chunks based on sentences as well... this means that when a sentence ends, it stops the chunk, which isn't exactly desirable. Let's try to find a way to work around it.

I have found out that the issue is to do with the fact that we are enumerating based on pages, and in doing so we split based on pages. Let's just get rid of this.

## Use recursive breakdown

In [5]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
print(list(doc.sents))

import tqdm
import spacy

# Load the spaCy language model

# Initialize a dictionary to store results
results = {}

# Process each text in the list with its index
for index, text in enumerate(tqdm.tqdm(cleaned_texts)):
    # Analyze the text with spaCy to get sentences
    doc = nlp(text)
    sentences = list(doc.sents)
    
    # Convert all Sentence objects to strings
    sentences = [str(sentence) for sentence in sentences]
    
    # Use the index as the key for each document's results
    results[f"document_{index}"] = {
        "sentences": sentences,
        "sentence_count": len(sentences)
    }

results

[This is a sentence., This another sentence.]


100%|██████████| 8/8 [00:00<00:00, 25.40it/s]

{'document_0': {'sentence_count': 40,
                'sentences': ['3.',
                              'where we are required to do so by law - this '
                              'includes any situation which may impact on the '
                              'LSE Student license or Skilled Worker license ; '
                              '4.',
                              'when we believe you or someone else may be in '
                              'danger; 5.',
                              'when the Home Office contact us regarding your '
                              'immigration sta tus/verification for the '
                              'Graduate Route.',
                              'We will always seek your consent or inform you '
                              'if we were required to speak to the Home '
                              'Office.',
                              'There may be circumstances where we may need to '
                              'raise your case as




{'document_0': {'sentences': ['3.',
   'where we are required to do so by law - this includes any situation which may impact on the LSE Student license or Skilled Worker license ; 4.',
   'when we believe you or someone else may be in danger; 5.',
   'when the Home Office contact us regarding your immigration sta tus/verification for the Graduate Route.',
   'We will always seek your consent or inform you if we were required to speak to the Home Office.',
   'There may be circumstances where we may need to raise your case as a Student sponsor and for safety purposes.',
   'If you are contacting us via our Live Chat facility, we will assume consent has been given.',
   'Liaison and correspondence We may need to communicate with someone outside our team – for example staff in your academic department.',
   'Before we do this, w e will agree what information we will share and will offer you the chance to approve any written information or communication before we send it, if this is necess

In [7]:
import tqdm as tqdm

# Create a list of dictionaries from cleaned texts to ensure compatability with the method prescribed in the Google colab notebook 
dict_cleaned_texts = [{'text': text} for text in cleaned_texts]

for item in tqdm.tqdm(dict_cleaned_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

# Define split size to turn groups of sentences into chunks

num_sentence_chunk_size = 3

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm.tqdm(dict_cleaned_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 8/8 [00:00<00:00, 34.67it/s]
100%|██████████| 8/8 [00:00<00:00, 47060.91it/s]


In [9]:
for item in dict_cleaned_texts:
    # Convert each list of sentences in sentence_chunks to a single concatenated string
    item["sentence_chunks"] = [' '.join(chunk) for chunk in item["sentence_chunks"]]

print(dict_cleaned_texts[0]["sentence_chunks"])

['3. where we are required to do so by law - this includes any situation which may impact on the LSE Student license or Skilled Worker license ; 4. when we believe you or someone else may be in danger; 5.', 'when the Home Office contact us regarding your immigration sta tus/verification for the Graduate Route. We will always seek your consent or inform you if we were required to speak to the Home Office. There may be circumstances where we may need to raise your case as a Student sponsor and for safety purposes.', 'If you are contacting us via our Live Chat facility, we will assume consent has been given. Liaison and correspondence We may need to communicate with someone outside our team – for example staff in your academic department. Before we do this, w e will agree what information we will share and will offer you the chance to approve any written information or communication before we send it, if this is necessary.', 'Consultation Our team will discuss your case together. If we ne

In [14]:
import json
from spacy.lang.en import English
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import tqdm

# Load the spaCy English model
nlp = English()
nlp.add_pipe("sentencizer")

# Sample data, assuming 'cleaned_texts' is already defined
documents = [
    (1, "PDF", "Immigration Advice Confidentiality Policy", cleaned_texts[0], "https://info.lse.ac.uk/current-students/immigration-advice/assets/documents/Info-Sheets/ConfidentialityPolicy.pdf"),
    (2, "PDF", "Formatting and binding your thesis", cleaned_texts[1], "https://info.lse.ac.uk/current-students/phd-academy/assets/documents/Formatting-and-binding-your-thesis-2021-22.pdf"),
    (3, "PDF", "LSE 2030 Strategy", cleaned_texts[2], "https://www.lse.ac.uk/2030/assets/pdf/LSE-2030-booklet.pdf"),
    (4, "PDF", "MSc Mark-Frame", cleaned_texts[3], "https://www.lse.ac.uk/sociology/assets/documents/study/Assessment-and-Feedback/MSc-Mark-Frame.pdf"),
    (5, "PDF", "BSc Economics Handbook 2021/22", cleaned_texts[4], "https://www.lse.ac.uk/economics/Assets/Documents/undergraduate-study/bsc-handbook-21.22.pdf"),
    (6, "PDF", "UG History Department Handbook 2023/24", cleaned_texts[5], "https://www.lse.ac.uk/International-History/Assets/Documents/student-handbooks/2023-24/UG-Student-Handbook-Department-of-International-History-2023-24.pdf"),
    (7, "PDF", "Exam Procedure for Candidates", cleaned_texts[6], "https://info.lse.ac.uk/current-students/services/assets/documents/Exam-Procedures-for-Candidates.pdf"),
    (8, "PDF", "Spring Exam Timetable 2024", cleaned_texts[7], "https://info.lse.ac.uk/current-students/services/assets/documents/Spring-Exam-Timetable-2024-Final.pdf")
    
]

# Initialize the embedding model
embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")

# Define a function to generate chunk entries with embeddings
def generate_chunk_entry(chunk, doc_id, doc_name, idx, embedding_model):
    try:
        embedding = embedding_model.get_text_embedding(chunk)
        return {
            "Type": "PDF",
            "Name": f"{doc_name} - Part {idx}",
            "Description": chunk,
            "Embedding": embedding
        }
    except Exception as e:
        print(f"Error computing embedding for chunk: {e}")
        return None

# Function to split text into chunks of three sentences
def split_into_chunks(text, nlp_model, chunk_size=3):
    doc = nlp_model(text)
    sentences = [str(sentence) for sentence in doc.sents]
    chunks = [' '.join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
    return chunks

# Process each document
json_data = []
for document in documents:
    doc_id, doc_type, doc_name, description, link = document  # Unpack the tuple correctly
    doc_chunks = []
    sentence_chunks = split_into_chunks(description, nlp)
    for idx, chunk in enumerate(sentence_chunks, start=1):
        chunk_entry = generate_chunk_entry(chunk, doc_id, doc_name, idx, embed_model)
        if chunk_entry:
            doc_chunks.append(chunk_entry)
    
    document_entry = {
        "Id": doc_id,
        "Name": doc_name,
        "Description": description,
        "Link": link,
        "Chunks": doc_chunks
    }
    json_data.append(document_entry)

# Optionally, save the JSON data to a file
json_file_path = "formatted_json_data.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")




JSON file created successfully at formatted_json_data.json
