# NB01 - Chunking PDFs

## Setup - Read PDFs from docs folder and perform necessary cleaning using regex

In [2]:
import PyPDF2

def read_pdf(file_path):
    # Initialize a variable to hold all the text
    all_text = ""
    
    # Open the PDF file
    with open(file_path, "rb") as file:
        # Initialize a PDF reader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Iterate through each page in the PDF
        for page in pdf_reader.pages:
            # Extract text from the page
            text = page.extract_text()
            if text:
                all_text += text  # Append the extracted text to all_text

    return all_text

appeals_text = read_pdf("../docs/Appeals-Regulations.pdf")
classification_text = read_pdf("../docs/BA-BSc-Three-Year-scheme-for-students-from-2018.19.pdf")
complaints_text = read_pdf("../docs/comPro.pdf")
procedure_text = read_pdf("../docs/Exam-Procedures-for-Candidates.pdf")
finsupport_text = read_pdf("../docs/In-Course-Financial-Support.pdf")
interruption_text = read_pdf("../docs/InterruptionPolicy.pdf")
timetable_text = read_pdf("../docs/Spring-Exam-Timetable-2024-Final.pdf")
deferral_text = read_pdf("../docs/Student-Guidance-Deferral.pdf")

test_texts = [appeals_text, classification_text, complaints_text, procedure_text, finsupport_text, interruption_text, timetable_text, deferral_text] #save in a list for efficiency for now


In [5]:
import re

def remove_newlines(text):
    # Replace all newline characters with an empty string
    cleaned_text = re.sub(r'\n', '', text)
    return cleaned_text

cleaned_texts = [remove_newlines(i) for i in test_texts]

cleaned_appeal = cleaned_texts[0]

## Begin investigations into finding most appropriate embeddings

In [8]:
from embeddings import compute_text_embedding  
from openai_clients import create_openai_embed_client
import asyncio

async def main():
    # Create the OpenAI embedding client
    client, model, dimensions = await create_openai_embed_client()
    dimensions = int(dimensions) if dimensions else 1536  # Default dimensions if not set

    # Iterate over texts and compute their embeddings
    embeddings = []
    for text in cleaned_texts:
        embedding = await compute_text_embedding(
            q=text,
            openai_client=client,
            embed_model=model,
            embedding_dimensions=dimensions
        )
        embeddings.append(embedding)
    
    # Example of how to use embeddings (here we just print them)
    for i, embedding in enumerate(embeddings):
        print(f"Embedding for Text {i+1}: {embedding}")

# Run the asynchronous main function
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        # Reuse the existing running loop in Jupyter Notebook
        tasks = asyncio.ensure_future(main())  # Schedule main to run
        # You may run tasks.result() in another cell to get the result if needed
    else:
        # If somehow the loop is not running, use asyncio.run (unlikely in Jupyter)
        asyncio.run(main())



Task exception was never retrieved
future: <Task finished name='Task-5' coro=<main() done, defined at /var/folders/7p/mc0d01t94ln6c7d6rr45xfnw0000gn/T/ipykernel_16713/3143247749.py:5> exception=BadRequestError('Error code: 400 - {\'error\': {\'message\': "This model\'s maximum context length is 8192 tokens, however you requested 11406 tokens (11406 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", \'type\': \'invalid_request_error\', \'param\': None, \'code\': None}}')>
Traceback (most recent call last):
  File "/var/folders/7p/mc0d01t94ln6c7d6rr45xfnw0000gn/T/ipykernel_16713/3143247749.py", line 13, in main
    embedding = await compute_text_embedding(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/akshsabherwal/Desktop/chat-lse/fastapi_app/embeddings.py", line 20, in compute_text_embedding
    embedding = await openai_client.embeddings.create(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/akshsabherwal/Desk

### So clearly, the current texts are too long... we need to try to find a way to chunk them, but in a way that they are usable.

I do not think sentence-based chunking is the best way to calculate embeddings for this context ... there will be issues with scalability and more importantly lost context. 

For this reason, I think it will be worth it to try a __sliding window method__, where we establish a "window" of a certain length upon which the embeddings will be calculated and a step size for the window. This will likely be a good idea because a majority of LSE documents do not have a uniform structure, but this method allows us to maintain a contextual link between each chunk.

Let's start with a step size of 4096 (half of 8192)

In [12]:
import nest_asyncio

nest_asyncio.apply()

def sliding_window(text, max_length=8192, step_size=4096):
    tokens = text.split()  # Simple whitespace tokenizer
    chunks = []

    for i in range(0, len(tokens), step_size):
        chunk = tokens[i:i + max_length]
        chunk_text = " ".join(chunk)
        chunks.append(chunk_text)
        if i + max_length >= len(tokens):
            break
    
    return chunks

import asyncio

async def main():
    client, model, dimensions = await create_openai_embed_client()
    dimensions = int(dimensions) if dimensions else 1536  # Default dimensions if not set
    all_embeddings = []
    for text in cleaned_texts:
        text_chunks = sliding_window(text)
        embeddings = []
        for chunk in text_chunks:
            embedding = await compute_text_embedding(
                q=chunk,
                openai_client=client,
                embed_model=model,
                embedding_dimensions=dimensions
            )
            embeddings.append(embedding)
        all_embeddings.append(embeddings)

    for i, embeddings in enumerate(all_embeddings):
        print(f"Embedding for Text {i+1}:")
        for j, embedding in enumerate(embeddings):
            print(f"  Chunk {j+1}: {embedding}")

await main()

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 9865 tokens (9865 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

Seems that there are still issues with token length even though we specified a max length of 8192 tokens earlier... this could be because of differences in how whitespace and special characters are counted as tokens by the model.

For this reason, I'll try decreasing the token and step size of the sliding window function.

In [21]:
import nest_asyncio

nest_asyncio.apply()

def sliding_window(text, max_length=6500, step_size=3000):
    tokens = text.split()  # Simple whitespace tokenizer
    chunks = []

    for i in range(0, len(tokens), step_size):
        chunk = tokens[i:i + max_length]
        chunk_text = " ".join(chunk)
        chunks.append(chunk_text)
        if i + max_length >= len(tokens):
            break
    
    return chunks

import asyncio

embeddings_store = {}

async def main():
    client, model, dimensions = await create_openai_embed_client()
    dimensions = int(dimensions) if dimensions else 1536

    text_id = 1  # A simple counter or identifier for each text
    text_chunks = sliding_window(cleaned_texts[0])
    embeddings = []
    for chunk in text_chunks:
        embedding = await compute_text_embedding(
            q=chunk,
            openai_client=client,
            embed_model=model,
            embedding_dimensions=dimensions
        )
        embeddings.append(embedding)
    embeddings_store[text_id] = embeddings
    text_id += 1

    for text_id, embeddings in embeddings_store.items():
        print(f"Embedding for Text {text_id}:")
        for i, embedding in enumerate(embeddings):
            print(f"  Chunk {i+1}: {embedding}")

await main()

embeddings_store

Embedding for Text 1:
  Chunk 1: [0.02551848627626896, 0.015385559760034084, 0.008457403630018234, -0.032792385667562485, -0.01761959120631218, 0.017433421686291695, 0.0030501838773489, -0.003171526361256838, -0.009315112605690956, -0.03712746873497963, -0.02503976598381996, -0.012958711013197899, 0.0031632152386009693, 0.00879649817943573, -0.0014178784331306815, -0.0036801674868911505, 0.0374998077750206, 0.00802522525191307, 0.0314360111951828, 0.01686161570250988, -0.021236594766378403, -0.008111661300063133, -0.02811155840754509, -0.0008759928750805557, -0.011489302851259708, 0.0037998477928340435, 0.0030501838773489, -0.01752650737762451, 0.009674153290688992, -0.021967973560094833, 0.004537875764071941, -0.01256642583757639, -0.027233904227614403, -0.0015674787573516369, -0.011894886381924152, 0.015225986018776894, -0.0037134119775146246, 0.008849688805639744, 0.03343068063259125, 0.0005597544368356466, 0.03757959604263306, 0.007101027760654688, -0.0017469990998506546, -0.001181

{1: [[0.02551848627626896,
   0.015385559760034084,
   0.008457403630018234,
   -0.032792385667562485,
   -0.01761959120631218,
   0.017433421686291695,
   0.0030501838773489,
   -0.003171526361256838,
   -0.009315112605690956,
   -0.03712746873497963,
   -0.02503976598381996,
   -0.012958711013197899,
   0.0031632152386009693,
   0.00879649817943573,
   -0.0014178784331306815,
   -0.0036801674868911505,
   0.0374998077750206,
   0.00802522525191307,
   0.0314360111951828,
   0.01686161570250988,
   -0.021236594766378403,
   -0.008111661300063133,
   -0.02811155840754509,
   -0.0008759928750805557,
   -0.011489302851259708,
   0.0037998477928340435,
   0.0030501838773489,
   -0.01752650737762451,
   0.009674153290688992,
   -0.021967973560094833,
   0.004537875764071941,
   -0.01256642583757639,
   -0.027233904227614403,
   -0.0015674787573516369,
   -0.011894886381924152,
   0.015225986018776894,
   -0.0037134119775146246,
   0.008849688805639744,
   0.03343068063259125,
   0.00055975

Looks like it worked for the "Appeals Regulation" text... let's try it for the other ones as well 

In [23]:
import nest_asyncio

nest_asyncio.apply()

def sliding_window(text, max_length=6500, step_size=3000):
    tokens = text.split()  # Simple whitespace tokenizer
    chunks = []

    for i in range(0, len(tokens), step_size):
        chunk = tokens[i:i + max_length]
        chunk_text = " ".join(chunk)
        chunks.append(chunk_text)
        if i + max_length >= len(tokens):
            break
    
    return chunks

embeddings_store = {}

async def main():
    client, model, dimensions = await create_openai_embed_client()
    dimensions = int(dimensions) if dimensions else 1536

    text_id = 1  # A simple counter or identifier for each text
    for text in cleaned_texts:
        text_chunks = sliding_window(text)
        embeddings = []
        for chunk in text_chunks:
            embedding = await compute_text_embedding(
                q=chunk,
                openai_client=client,
                embed_model=model,
                embedding_dimensions=dimensions
            )
            embeddings.append(embedding)
        embeddings_store[text_id] = embeddings
        text_id += 1

    for text_id, embeddings in embeddings_store.items():
        print(f"Embedding for Text {text_id}:")
        for i, embedding in enumerate(embeddings):
            print(f"  Chunk {i+1}: {embedding}")

await main()

embeddings_store

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 15001 tokens (15001 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

The error suggests that one of the texts still has 15001 tokens, which is strange since we specified that the limit of the window is 6500...

I have found out that actually the method I have instated for splitting based on chunking makes the crucial and incorrect assumption that one character equals one token, which is not the case for many OpenAI models... Let's use a byte-pair encoding (BPE) tokenizer instead to more accurately chunk based on the number of tokens.

In [25]:
from transformers import GPT2Tokenizer
import nest_asyncio

nest_asyncio.apply()

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def sliding_window(text, max_length=6500, step_size=3000):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    
    chunks = []
    for i in range(0, len(tokens), step_size):
        chunk = tokens[i:i + max_length]
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk_text)
        if i + max_length >= len(tokens):
            break
    
    return chunks

embeddings_store = {}

async def main():
    client, model, dimensions = await create_openai_embed_client()
    dimensions = int(dimensions) if dimensions else 1536

    text_id = 1  # A simple counter or identifier for each text
    for text in cleaned_texts:
        text_chunks = sliding_window(text)
        embeddings = []
        for chunk in text_chunks:
            embedding = await compute_text_embedding(
                q=chunk,
                openai_client=client,
                embed_model=model,
                embedding_dimensions=dimensions
            )
            embeddings.append(embedding)
        embeddings_store[text_id] = embeddings
        text_id += 1

    for text_id, embeddings in embeddings_store.items():
        print(f"Embedding for Text {text_id}:")
        for i, embedding in enumerate(embeddings):
            print(f"  Chunk {i+1}: {embedding}")

await main()

embeddings_store



Embedding for Text 1:
  Chunk 1: [0.022973159328103065, 0.019121933728456497, 0.0051092468202114105, -0.030192527920007706, -0.012700989842414856, 0.02089322917163372, 0.007400523871183395, -0.0077896723523736, -0.009386521764099598, -0.04028354212641716, -0.02161785028874874, -0.013076718896627426, 0.0008114241645671427, 0.006494748406112194, 0.002633459400385618, -0.005246790591627359, 0.03284946829080582, 0.010178236290812492, 0.03564060106873512, 0.015565925277769566, -0.018128935247659683, -0.008084887638688087, -0.026690194383263588, -0.005853995680809021, -0.014371642842888832, 0.007481037639081478, 0.005380979273468256, -0.015029169619083405, 0.009359683841466904, -0.02580454759299755, 0.003827741602435708, -0.010842472314834595, -0.024851804599165916, -0.00261500827036798, -0.014103265479207039, 0.012486287392675877, -0.006350494921207428, 0.00795069895684719, 0.03738505765795708, -0.0027542293537408113, 0.040873974561691284, 0.005847285967320204, -0.0029102242551743984, -0.00

{1: [[0.022973159328103065,
   0.019121933728456497,
   0.0051092468202114105,
   -0.030192527920007706,
   -0.012700989842414856,
   0.02089322917163372,
   0.007400523871183395,
   -0.0077896723523736,
   -0.009386521764099598,
   -0.04028354212641716,
   -0.02161785028874874,
   -0.013076718896627426,
   0.0008114241645671427,
   0.006494748406112194,
   0.002633459400385618,
   -0.005246790591627359,
   0.03284946829080582,
   0.010178236290812492,
   0.03564060106873512,
   0.015565925277769566,
   -0.018128935247659683,
   -0.008084887638688087,
   -0.026690194383263588,
   -0.005853995680809021,
   -0.014371642842888832,
   0.007481037639081478,
   0.005380979273468256,
   -0.015029169619083405,
   0.009359683841466904,
   -0.02580454759299755,
   0.003827741602435708,
   -0.010842472314834595,
   -0.024851804599165916,
   -0.00261500827036798,
   -0.014103265479207039,
   0.012486287392675877,
   -0.006350494921207428,
   0.00795069895684719,
   0.03738505765795708,
   -0.00275

Seems like it worked! This is method 1: splitting using sliding windows.