<a href="https://colab.research.google.com/github/LuckyBoy587/SchoolaNova/blob/main/BERT_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install pdfplumber
!pip install faiss-cpu
!pip install nltk

from sentence_transformers import SentenceTransformer, util

# 1. Load embedding model (pretrained on similarity tasks)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
import pdfplumber, re
from nltk.tokenize import sent_tokenize

def clean_text(text: str) -> str:
    """Aggressively clean NCERT/School-book style PDF text with OCR noise."""
    # Normalize spaces/newlines
    text = re.sub(r'\s+', ' ', text)

    # Remove OCR garbled "CChhaapptteerr" like strings
    text = re.sub(r'C+H*A+P+T+E+R+.*?\d+', '', text, flags=re.IGNORECASE)

    # Remove lines with .indd and timestamps (OCR file tags)
    text = re.sub(r'\.?i+n+d+d+\s*\d+.*?(AM|PM)?', '', text, flags=re.IGNORECASE)

    # Remove publisher footer lines
    text = re.sub(r'Curiosity.*?(Grade|Gr\.a\.d\.e)', '', text, flags=re.IGNORECASE)

    # Remove "Chapter ..." repeated headers (even broken ones)
    text = re.sub(r'Chapter\s+The.*?Solutions', '', text, flags=re.IGNORECASE)

    # Define the core tokens once
    fig_core = r'(?:Fig\.?|Figure)\s*:?\s*\d+(?:[.\-\u2013]\d+)*(?:[A-Za-z])?(?:\([A-Za-z]\))?'
    tab_core = r'(?:Tab\.?|Table)\s*:?\s*\d+(?:[.\-\u2013]\d+)*(?:[A-Za-z])?(?:\([A-Za-z]\))?'

    # 1) Remove unbracketed inline refs like "Fig. 9.10a" or "Table 4.1,"
    text = re.sub(rf'\b(?:{fig_core}|{tab_core})(?:\s*[:.,;])?', '', text, flags=re.IGNORECASE)

    # 2) Remove bracketed inline refs like "(Fig. 9.10a)" or "(Figure 3(b))"
    text = re.sub(rf'\(\s*(?:{fig_core}|{tab_core})\s*\)(?:\s*[:.,;])?', '', text, flags=re.IGNORECASE)

    # 3) Remove full caption lines starting with these tokens (with or without brackets)
    text = re.sub(rf'(?mi)^\s*(?:\(\s*)?(?:{fig_core}|{tab_core})(?:\s*\))?\s+.*$', '', text, flags=re.IGNORECASE)


    # 1) Dates like 12/10/2021 or 12-10-21
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', '', text)

    # 2) Time like 12:30 or 12:30:45
    text = re.sub(r'\b\d{1,2}:\d{2}(?::\d{2})?\b', '', text)

    # 3) Weird ::0066::3366 blobs (keep the colons, drop number blobs between them)
    text = re.sub(r'::\d+::', '::', text)

    # 4) Standalone page-like counters with lots of slashes (//2288// style)
    text = re.sub(r'/{2,}\d+/{2,}', ' ', text)

    # 5) Hyphenated index codes like 2020-001-33 at line edges (optional, be careful)
    text = re.sub(r'\b\d{4}-\d{1,3}-\d{1,3}\b', '', text)


    # Collapse multiple punctuation (.... → . , ??? → ? , !!! → !)
    text = re.sub(r'([.?!])\1+', r'\1', text)

    text = re.sub(r"\(\)", "", text)

    # Remove bullets/list markers
    text = re.sub(r'[\\•\·\●\-\–\—\»\"z]', '', text)

    # Normalize ligatures
    text = text.replace('ﬁ', 'fi').replace('ﬂ', 'fl')

    return text.strip()


def pdf_to_sentences(pdf_path: str):
    """Extract sentences from PDF and clean them."""
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_text = page.extract_text()
            if not raw_text:
                continue
            # Clean and split into sentences
            cleaned = clean_text(raw_text)
            sentences = sent_tokenize(cleaned)
            for s in sentences:
                s = s.strip()
                if len(s.split()) > 5:  # skip very tiny fragments
                    all_sentences.append(s)
    return all_sentences

In [17]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [53]:
ch9_sentences = pdf_to_sentences("/content/drive/MyDrive/ColabContent/ch9.pdf")

In [54]:
import faiss
import numpy as np

ch9_embeddings = model.encode(ch9_sentences, convert_to_tensor=False) # Convert to numpy array for FAISS

# create a FAISS index
d = ch9_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(ch9_embeddings)

print(f"Created FAISS index with {index.ntotal} vectors.")

Created FAISS index with 293 vectors.


In [39]:
my_question = "If you have a block of iron and a piece of wood that are the same size, which one is denser? Why?"
my_question_embd = model.encode(my_question, convert_to_tensor=True)

In [57]:
import torch

# Encode the question and convert to numpy
my_question_embd_np = model.encode([my_question], convert_to_tensor=False)

# Search the FAISS index
top_k = 3
distances, indices = index.search(my_question_embd_np, top_k)

print("Question:", my_question)
print(f"Top {top_k} Matches:")
for i in range(top_k):
    print(f"- {ch9_sentences[indices[0][i]]}")

Question: If you have a block of iron and a piece of wood that are the same size, which one is denser? Why?
Top 3 Matches:
- A block of iron has a mass of 600 g and a density of 7.9 g/cm³.
- When we say that iron while others sink in water is heavier than wood, we are referring to a special property known as density, which describes the heaviness of an object.
- A wooden stick and an iron rod may be of the same sie,  Some objects float yet the iron rod feels much heavier.
