<a href="https://colab.research.google.com/github/LSenai/NLP-tutor-LLM/blob/main/Tutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install PyMuPDF
!pip install nltk
!pip install  faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
import re
import pandas as pd
import fitz # PyMuPDF
import pprint
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import torch
import numpy as np
import faiss

from sentence_transformers import CrossEncoder

import nltk
nltk.download('punkt_tab')

from transformers import AutoTokenizer, AutoModel

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Source PDF that the rest of the notebook chunks, embeds and queries.
pdf_path = "The_Tell_Tale_Heart.pdf"

In [None]:
def open_and_read_pdf(pdf_path):
    """
    Read a PDF with PyMuPDF and return the extracted text of every page.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        list[dict]: One dict per page, in document order, with the keys
            "page_number" (0-based index from ``enumerate``) and
            "text" (the result of ``page.get_text()``).
    """
    # The context manager closes the document even if text extraction fails;
    # previously the file handle was left open.
    with fitz.open(pdf_path) as doc:
        return [
            {"page_number": page_num, "text": page.get_text()}
            for page_num, page in tqdm(enumerate(doc), total=len(doc))
        ]

def clean_text(text):
    """
    Normalise a raw text fragment extracted from a PDF.

    Steps:
    1. Remove standalone numbers, i.e. digit runs delimited on both sides by
       whitespace or by the start/end of the string (such as a page number on
       its own line). Digits attached to other characters ("1843,", "101b")
       are kept, so the surrounding words are not corrupted.
    2. Collapse every run of whitespace (spaces, newlines, tabs) into a single
       space and strip leading/trailing whitespace.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text; "" when nothing but whitespace/numbers was given.
    """
    # (?<!\S) / (?!\S): "no non-space character directly before / after", which
    # also holds at the string boundaries.
    text = re.sub(r"(?<!\S)\d+(?!\S)", " ", text)

    # Also turns " \n" line breaks into single spaces.
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [None]:

def split_into_paragraphs(pages_and_texts, min_words=20):
    """
    Split the document into paragraphs and record which pages each one spans.

    All page texts are concatenated, the result is split on blank-line style
    delimiters, and every paragraph is cleaned with ``clean_text``.

    Args:
        pages_and_texts (list[dict]): Items with "page_number" and "text" keys,
            as produced by ``open_and_read_pdf``.
        min_words (int): Paragraphs whose raw (uncleaned) text has fewer than
            this many space-separated tokens are dropped. Defaults to 20.

    Returns:
        pandas.DataFrame: One row per kept paragraph with the columns
            "page_number" (sorted list of page numbers the paragraph overlaps),
            "char_count", "word_count", "sentence_count" and "text"
            (all computed on the cleaned text).
    """
    paragraph_delimiter = r"(?:\s*\n\s*\n\s*|\s{2,}\n)"

    # Concatenate the pages and remember each page's [start, end) character range.
    combined_text = ""
    page_boundaries = []
    for page_data in pages_and_texts:
        start_idx = len(combined_text)
        combined_text += page_data["text"]
        page_boundaries.append((start_idx, len(combined_text), page_data["page_number"]))

    # Walk the delimiter matches to get the exact [start, end) span of every
    # paragraph. Searching for the paragraph text afterwards (str.find) would
    # return the first occurrence, so a repeated paragraph would be attributed
    # to the wrong page, and it would rescan the whole text each time.
    paragraph_spans = []
    cursor = 0
    for delimiter in re.finditer(paragraph_delimiter, combined_text):
        paragraph_spans.append((cursor, delimiter.start()))
        cursor = delimiter.end()
    paragraph_spans.append((cursor, len(combined_text)))

    paragraph_data = []
    for paragraph_start_idx, paragraph_end_idx in paragraph_spans:
        paragraph = combined_text[paragraph_start_idx:paragraph_end_idx]
        cleaned_paragraph = clean_text(paragraph)

        if not cleaned_paragraph:  # Nothing left after cleaning
            continue

        # Drop short fragments (titles, headers); counted on the raw text.
        if len(paragraph.split(" ")) < min_words:
            continue

        # A page is spanned when its character range overlaps the paragraph's.
        pages_spanned = {
            page_number
            for start, end, page_number in page_boundaries
            if paragraph_start_idx < end and paragraph_end_idx > start
        }

        paragraph_data.append({
            "page_number": sorted(pages_spanned),
            "char_count": len(cleaned_paragraph),
            "word_count": len(cleaned_paragraph.split(" ")),
            "sentence_count": len(sent_tokenize(cleaned_paragraph)),
            "text": cleaned_paragraph
        })

    return pd.DataFrame(paragraph_data)

def split_into_chunks(pages_and_texts, chunk_size):
    """
    Split the document into consecutive fixed-size character chunks.

    Args:
        pages_and_texts (list[dict]): Items with "page_number" and "text" keys,
            as produced by ``open_and_read_pdf``.
        chunk_size (int): Number of raw characters per chunk (the last chunk
            may be shorter). Must be at least 1.

    Returns:
        pandas.DataFrame: One row per non-empty chunk with the columns
            "page_number" (sorted list of page numbers the chunk overlaps),
            "char_count", "word_count", "sentence_count" and "text"
            (all computed on the text cleaned by ``clean_text``).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently produce an empty result.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Concatenate the pages and remember each page's [start, end) character range.
    combined_text = ""
    page_boundaries = []
    for page_data in pages_and_texts:
        start_idx = len(combined_text)
        combined_text += page_data["text"]
        page_boundaries.append((start_idx, len(combined_text), page_data["page_number"]))

    chunk_data = []
    for chunk_start_idx in range(0, len(combined_text), chunk_size):
        chunk = combined_text[chunk_start_idx:chunk_start_idx + chunk_size]
        # The offsets are known from the slicing itself; looking the chunk up
        # with str.find would hit the first occurrence of repeated text and
        # report the wrong pages.
        chunk_end_idx = chunk_start_idx + len(chunk)

        cleaned_chunk = clean_text(chunk)

        if not cleaned_chunk:  # Skip chunks that are empty after cleaning
            continue

        # A page is spanned when its character range overlaps the chunk's.
        pages_spanned = {
            page_number
            for start, end, page_number in page_boundaries
            if chunk_start_idx < end and chunk_end_idx > start
        }

        chunk_data.append({
            "page_number": sorted(pages_spanned),
            "char_count": len(cleaned_chunk),
            "word_count": len(cleaned_chunk.split(" ")),
            "sentence_count": len(sent_tokenize(cleaned_chunk)),
            "text": cleaned_chunk
        })

    return pd.DataFrame(chunk_data)

def split_into_sentences(pages_and_texts, num_sentences = 10):
    """
    Split the document into chunks of ``num_sentences`` consecutive sentences.

    Sentence boundaries are found with a simple regex: a zero-width position
    that follows ".", "!" or "?" and precedes whitespace.

    Args:
        pages_and_texts (list[dict]): Items with "page_number" and "text" keys,
            as produced by ``open_and_read_pdf``.
        num_sentences (int): Sentences per chunk (the last chunk may hold
            fewer). Must be at least 1. Defaults to 10.

    Returns:
        pandas.DataFrame: One row per non-empty chunk with the columns
            "page_number" (sorted list of page numbers the chunk overlaps),
            "char_count", "word_count", "sentence_count" and "text"
            (all computed on the text cleaned by ``clean_text``).
            "sentence_count" comes from NLTK's ``sent_tokenize`` and can differ
            from ``num_sentences`` because the grouping uses the regex above.

    Raises:
        ValueError: If ``num_sentences`` is smaller than 1.
    """
    if num_sentences < 1:
        # A negative step would silently produce an empty result.
        raise ValueError(f"num_sentences must be >= 1, got {num_sentences}")

    # Concatenate the pages and remember each page's [start, end) character range.
    combined_text = ""
    page_boundaries = []
    for page_data in pages_and_texts:
        start_idx = len(combined_text)
        combined_text += page_data["text"]
        page_boundaries.append((start_idx, len(combined_text), page_data["page_number"]))

    sentence_boundary_pattern = r'(?<=[.!?])(?=\s|\n)'

    # The pattern is zero-width, so no characters are consumed by the split and
    # the pieces concatenate back to exactly `combined_text`.
    # (The debugging print of the full sentence list was removed: it dumped the
    # whole document into the cell output.)
    sentences = re.split(sentence_boundary_pattern, combined_text)

    chunk_data = []
    chunk_start_idx = 0  # running character offset of the current chunk
    for first_sentence in range(0, len(sentences), num_sentences):
        chunk = "".join(sentences[first_sentence:first_sentence + num_sentences])

        # Chunks are contiguous, so the offsets follow from the running total.
        # str.find would return the first occurrence of repeated text and map
        # the chunk to the wrong pages.
        chunk_end_idx = chunk_start_idx + len(chunk)
        span_start, span_end = chunk_start_idx, chunk_end_idx
        chunk_start_idx = chunk_end_idx

        cleaned_chunk = clean_text(chunk)

        if not cleaned_chunk:  # Skip chunks that are empty after cleaning
            continue

        # A page is spanned when its character range overlaps the chunk's.
        pages_spanned = {
            page_number
            for start, end, page_number in page_boundaries
            if span_start < end and span_end > start
        }

        chunk_data.append({
            "page_number": sorted(pages_spanned),
            "char_count": len(cleaned_chunk),
            "word_count": len(cleaned_chunk.split(" ")),
            "sentence_count": len(sent_tokenize(cleaned_chunk)),
            "text": cleaned_chunk
        })

    return pd.DataFrame(chunk_data)

def split_into_pages(pages_and_texts):
    """
    Build one row per PDF page from the cleaned page text.

    Args:
        pages_and_texts (list[dict]): Items with "page_number" and "text" keys.

    Returns:
        pandas.DataFrame: Columns "page_number" (taken unchanged from the
            input item), "char_count", "word_count", "sentence_count" and
            "text", all computed on the text returned by ``clean_text``.
    """

    def _page_row(page_entry):
        # Statistics are computed on the cleaned text, not on the raw page.
        page_text = clean_text(page_entry["text"])
        return {
            "page_number": page_entry["page_number"],
            "char_count": len(page_text),
            "word_count": len(page_text.split(" ")),
            "sentence_count": len(sent_tokenize(page_text)),
            "text": page_text,
        }

    return pd.DataFrame([_page_row(page_entry) for page_entry in pages_and_texts])

In [None]:
def create_df_from_pdf(pdf_path, method="paragraph", fixed_size = 512, num_sentences = 10):
    """
    Read a PDF and chunk its text with the chosen strategy.

    Args:
        pdf_path: Path of the PDF, forwarded to ``open_and_read_pdf``.
        method (str): One of "paragraph", "fixed", "sentence" or "page".
        fixed_size (int): Characters per chunk; only used by method="fixed".
        num_sentences (int): Sentences per chunk; only used by method="sentence".

    Returns:
        pandas.DataFrame: The frame produced by the matching ``split_into_*``
            function.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
            Previously an unknown method surfaced as an UnboundLocalError
            on ``return df``.
    """
    supported_methods = ("paragraph", "fixed", "sentence", "page")
    # Validate before touching the file so a typo fails fast with a clear message.
    if method not in supported_methods:
        raise ValueError(
            f"Unknown chunking method {method!r}; expected one of {supported_methods}"
        )

    pages_and_texts = open_and_read_pdf(pdf_path)

    if method == "paragraph":
        return split_into_paragraphs(pages_and_texts)

    if method == "fixed":
        return split_into_chunks(pages_and_texts, fixed_size)

    if method == "sentence":
        return split_into_sentences(pages_and_texts, num_sentences)

    # Only "page" is left after the validation above.
    return split_into_pages(pages_and_texts)

In [None]:
# Chunk the PDF into groups of sentences (num_sentences keeps its default of 10)
# and display the resulting frame.
df=create_df_from_pdf(pdf_path, method="sentence")
df

100%|██████████| 6/6 [00:00<00:00, 165.13it/s]

['3 \n \n \n \nTRUE!—nervous—very, very dreadfully nervous I had been \nand am; but why will you say that I am mad?', ' The disease \nhad sharpened my senses—not destroyed—not dulled them.', ' \nAbove all was the sense of hearing acute.', ' I heard all things in \nthe heaven and in the earth.', ' I heard many things in hell.', ' \nHow, then, am I mad?', ' Hearken!', ' and observe how \nhealthily—how calmly I can tell you the whole story.', '  \nIt is impossible to say how first the idea entered my \nbrain; but once conceived, it haunted me day and night.', ' \nObject there was none.', ' Passion there was none.', ' I loved the \nold man.', ' He had never wronged me.', ' He had never given me \ninsult.', ' For his gold I had no desire.', ' I think it was his eye!', ' \nyes, it was this!', ' One of his eyes resembled that of a \nvulture—a pale blue eye, with a film over it.', ' Whenever it fell \nupon me, my blood ran cold; and so by degrees—very \ngradually—I made up my mind to take the 




Unnamed: 0,page_number,char_count,word_count,sentence_count,text
0,[0],519,95,10,"TRUE!—nervous—very, very dreadfully nervous I ..."
1,[0],446,94,10,Passion there was none. I loved the old man. H...
2,"[0, 1]",812,158,10,You fancy me mad. Madmen know nothing. But you...
3,[1],1158,220,10,Ha!—would a madman have been so wise as this? ...
4,"[1, 2]",1220,239,10,I fairly chuckled at the idea; and perhaps he ...
5,[2],1197,221,10,I say I knew it well. I knew what the old man ...
6,"[2, 3]",863,165,10,"It was open—wide, wide open—and I grew furious..."
7,[3],618,114,10,Meantime the hellish tattoo of the heart incre...
8,[3],473,94,10,"The old man’s hour had come! With a loud yell,..."
9,"[3, 4]",563,109,10,"Yes, he was stone, stone dead. I placed my han..."


In [None]:
def get_text_embedding(model, tokenizer, text):
    """
    Embed one text by mean-pooling the model's last hidden state.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable that turns ``text`` into PyTorch tensors.
        text (str): Text to embed. Input longer than 512 tokens is truncated.
            Pass a single string: only the first item of the batch is returned,
            and the pooling does not mask padding tokens.

    Returns:
        numpy.ndarray: 1-D vector, the mean over the token axis (dim=1) of
            ``last_hidden_state`` for the first item in the batch.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only: skip building the autograd graph, which saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # .cpu() makes the NumPy conversion work when the output lives on a GPU.
    return outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()[0]

In [None]:
# Load Hugging Face models
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

Embed text from each row:

In [None]:
# Embed the text of every row, in row order, and store the vectors next to the text.
embeddings = [
    get_text_embedding(embedding_model, embedding_tokenizer, chunk_text)
    for chunk_text in df["text"]
]

df["embedding"] = embeddings
df

Unnamed: 0,page_number,char_count,word_count,sentence_count,text,embedding
0,[0],519,95,10,"TRUE!—nervous—very, very dreadfully nervous I ...","[0.03869353, 0.0869804, -0.024715364, 0.073235..."
1,[0],446,94,10,Passion there was none. I loved the old man. H...,"[0.09306466, -0.01770137, 0.008837951, 0.03069..."
2,"[0, 1]",812,158,10,You fancy me mad. Madmen know nothing. But you...,"[0.12833172, -0.0884975, -0.035736557, 0.00976..."
3,[1],1158,220,10,Ha!—would a madman have been so wise as this? ...,"[0.05527917, -0.012881552, -0.048040226, 0.060..."
4,"[1, 2]",1220,239,10,I fairly chuckled at the idea; and perhaps he ...,"[0.11993993, -0.015593344, -0.028489904, -0.00..."
5,[2],1197,221,10,I say I knew it well. I knew what the old man ...,"[0.053138774, -0.04471797, -0.020848596, 0.015..."
6,"[2, 3]",863,165,10,"It was open—wide, wide open—and I grew furious...","[0.056142814, -0.12968644, 0.0023764437, 0.044..."
7,[3],618,114,10,Meantime the hellish tattoo of the heart incre...,"[0.01991876, -0.08523013, -0.042827703, 0.0011..."
8,[3],473,94,10,"The old man’s hour had come! With a loud yell,...","[0.10593571, 0.020763895, -0.0058294507, 0.029..."
9,"[3, 4]",563,109,10,"Yes, he was stone, stone dead. I placed my han...","[0.13645296, 0.086432785, -0.07851742, 0.13517..."


Get embeddings and put in tensor:

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Stack the per-row vectors into one 2-D float32 tensor (rows = chunks) on `device`.
embeddings = torch.tensor(
    np.array(df["embedding"].tolist()),
    dtype=torch.float32,
    device=device,
)
embeddings.shape

torch.Size([16, 768])

Some observations:
- First attempted with all-MiniLM-L6-v2 (384 max tokens) for paragraphs, then with all-mpnet-base-v2 (512 max tokens); the latter returned the better (correct) result. This is for paragraph-based chunking.

- Different books will need different cleaning strategies; we should come up with cleaning strategies that generalize to most books. We also need to drop paragraphs with fewer than n words/characters, because a very short fragment (like a title) might match the query exactly.

- When putting in a driver's manual, the book itself contains a lot of questions, and similarity scores are calculated for these questions too. We need to avoid this. How? Keep generating until the sentence does not end with a question mark?
The question "Who has the right of way in an intersection" yields different results because the whole page consists of examples covering different cases. How do we handle this? Give the entire page to the embedder? Maybe do page-level embedding with an embedder that has a higher max token limit?
We could find the most probable location and then give the entire page; the issue would be the large context.

- I like this one the most: find the highest n scores, then take 1, 2 or 3 paragraphs before and 1, 2 or 3 after each hit. Then compute the scores again, pick from the new scores, expand the context, and feed the larger context into the model!



Similarity using the FAISS index gives different results than cosine similarity for the question "Who did I love", and likewise for the question "Who came to the house". From what I've seen, FAISS uses dot product or Euclidean distance (the `IndexFlatL2` index used here is Euclidean), which is why the results differ. If we want to stick to cosine, we could make FAISS compute it by normalizing the vectors to unit length and using FAISS's inner-product mode. We could show how both work.

In [None]:
def retrieve_top_k_similar(query, embeddings, embedding_model, embedding_tokenizer, top_k=5, method="faiss"):
    """
    Find the chunks whose embeddings are closest to the query.

    Args:
        query (str): Question to embed with ``get_text_embedding``.
        embeddings: 2-D collection of chunk embeddings, one row per chunk
            (torch tensor on any device, or array-like).
        embedding_model: Model forwarded to ``get_text_embedding``.
        embedding_tokenizer: Tokenizer forwarded to ``get_text_embedding``.
        top_k (int): Number of chunks to return. Defaults to 5.
        method (str): "cosine" or "faiss".

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: (scores, indices).
            * "cosine": two 1-D arrays of length min(top_k, number of chunks);
              scores are cosine similarities, highest (best) first.
            * "faiss": the (D, I) pair from ``IndexFlatL2.search``, each of
              shape (1, top_k); D holds L2-based distances, so smaller is
              better (the FAISS documentation describes them as squared L2).

    Raises:
        ValueError: If ``method`` is not "cosine" or "faiss" (previously the
            function silently returned None).
    """
    if method not in ("cosine", "faiss"):
        raise ValueError(f"Unknown retrieval method {method!r}; expected 'cosine' or 'faiss'")

    query_embedding = get_text_embedding(embedding_model, embedding_tokenizer, query)

    if method == "cosine":
        corpus = torch.as_tensor(embeddings)
        # Put the query on the same device/dtype as the corpus; a CPU query
        # against CUDA embeddings raises a device-mismatch error.
        query_tensor = torch.as_tensor(query_embedding, dtype=corpus.dtype, device=corpus.device)

        similarity = torch.nn.functional.cosine_similarity(corpus, query_tensor, dim=-1)

        # torch.topk raises when k exceeds the number of rows.
        k = min(top_k, corpus.shape[0])
        similarity_top_k = torch.topk(similarity, k=k)

        # Move to CPU first: NumPy cannot read CUDA tensors.
        return (
            similarity_top_k.values.detach().cpu().numpy(),
            similarity_top_k.indices.detach().cpu().numpy(),
        )

    # method == "faiss": exact nearest-neighbour search by L2 distance.
    # FAISS works on contiguous float32 NumPy arrays, so convert explicitly
    # instead of handing it a (possibly CUDA) torch tensor.
    if isinstance(embeddings, torch.Tensor):
        corpus = embeddings.detach().cpu().numpy()
    else:
        corpus = np.asarray(embeddings)
    corpus = np.ascontiguousarray(corpus, dtype=np.float32)

    # The index is rebuilt on every call; fine for a handful of chunks.
    index = faiss.IndexFlatL2(corpus.shape[1])
    index.add(corpus)

    query_matrix = np.ascontiguousarray(query_embedding.reshape(1, -1), dtype=np.float32)

    D, I = index.search(query_matrix, k=top_k)

    return D, I

Query:

In [None]:
# Question used for the retrieval and re-ranking cells below.
query = "Who came to the house"

Retrieval of top k chunks:

In [None]:
# Rank all chunks against the query by cosine similarity and keep the five best.
values, indices = retrieve_top_k_similar(
    query,
    embeddings,
    embedding_model,
    embedding_tokenizer,
    top_k=5,
    method="cosine",
)

In [None]:
# Row positions of the retrieved chunks, in the order returned by the retrieval call.
indices

array([10,  8,  3,  5, 11])

In [None]:
# Show the best-matching chunk. The row comes from the retrieval result instead of a
# hard-coded label, so the cell stays correct when the query or the chunking changes.
# np.ravel also copes with the 2-D index array returned by the FAISS method.
best_idx = int(np.ravel(indices)[0])
pprint.pp(df["text"].iloc[best_idx])

('I then replaced the boards so cleverly, so cunningly, that no human eye—not '
 'even his—could have detected any thing wrong. There was nothing to wash '
 'out—no stain of any kind—no blood-spot whatever. I had been too wary for '
 'that. A tub had caught all—ha! ha! When I had made an end of these labors, '
 'it was four o’clock—still dark as midnight. As the bell sounded the hour, '
 'there came a knocking at the street door. I went down to open it with a '
 'light heart,—for what had I now to fear? There entered three men, who '
 'introduced themselves, with perfect suavity, as officers of the police. A '
 'shriek had been heard by a neighbor during the night; suspicion of foul play '
 'had been aroused; information had been lodged at the police office, and they '
 '(the officers) had been deputed to search the premises.')


RERANK IF WE DECIDE TO USE IT (COULD BE VERY USEFUL):

Example:

In [None]:
# Collect the retrieved chunk texts in ranked order. The retrieval indices are row
# positions in the embedding matrix, so select by position (.iloc) rather than by label;
# np.ravel also copes with the 2-D index array returned by the FAISS method.
docs = df["text"].iloc[np.ravel(indices)].to_list()
docs

['I then replaced the boards so cleverly, so cunningly, that no human eye—not even his—could have detected any thing wrong. There was nothing to wash out—no stain of any kind—no blood-spot whatever. I had been too wary for that. A tub had caught all—ha! ha! When I had made an end of these labors, it was four o’clock—still dark as midnight. As the bell sounded the hour, there came a knocking at the street door. I went down to open it with a light heart,—for what had I now to fear? There entered three men, who introduced themselves, with perfect suavity, as officers of the police. A shriek had been heard by a neighbor during the night; suspicion of foul play had been aroused; information had been lodged at the police office, and they (the officers) had been deputed to search the premises.',
 'The old man’s hour had come! With a loud yell, I threw open the lantern and leaped into the room. He shrieked once—once only. In an instant I dragged him to the floor, and pulled the heavy bed over 

In [None]:
# Cross-encoder used below to re-score the retrieved chunks against the query.
rerank_model = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1")

model.safetensors:   6%|6         | 52.4M/870M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

In [None]:
# Re-rank the retrieved chunks for the query and keep the five best, with their text.
results = rerank_model.rank(
    query,
    docs,
    return_documents=True,
    top_k=5,
)
results

[{'corpus_id': 0,
  'score': 0.519382,
  'text': 'I then replaced the boards so cleverly, so cunningly, that no human eye—not even his—could have detected any thing wrong. There was nothing to wash out—no stain of any kind—no blood-spot whatever. I had been too wary for that. A tub had caught all—ha! ha! When I had made an end of these labors, it was four o’clock—still dark as midnight. As the bell sounded the hour, there came a knocking at the street door. I went down to open it with a light heart,—for what had I now to fear? There entered three men, who introduced themselves, with perfect suavity, as officers of the police. A shriek had been heard by a neighbor during the night; suspicion of foul play had been aroused; information had been lodged at the police office, and they (the officers) had been deputed to search the premises.'},
 {'corpus_id': 4,
  'score': 0.15250635,
  'text': 'I smiled,—for what had I to fear? I bade the gentlemen welcome. The shriek, I said, was my own in a