<a href="https://colab.research.google.com/github/LuckyBoy587/SchoolaNova/blob/main/BERT_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pdfplumber
!pip install faiss-cpu
!pip install nltk

from sentence_transformers import SentenceTransformer, util

# 1. Load embedding model (pretrained on similarity tasks)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# 2. Sample NCERT-like paragraphs
docs = [
    "Photosynthesis is the process by which green plants make food using sunlight, carbon dioxide, and water.",
    "Respiration is the process by which living organisms release energy from food molecules.",
    "Chlorophyll is a green pigment in plants that helps capture sunlight for photosynthesis."
]

# 3. Encode the paragraphs
doc_embeddings = model.encode(docs, convert_to_tensor=True)

# 4. Encode a sample question
question = "How do plants cook themselves?"
query_embedding = model.encode(question, convert_to_tensor=True)

# 5. Compute similarity
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)
print(cosine_scores)

# 6. Retrieve best paragraph
best_idx = cosine_scores.argmax()
print("Question:", question)
print("Best Match:", docs[best_idx])



Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
import pdfplumber, re
from nltk.tokenize import sent_tokenize

def clean_text(text: str) -> str:
    """Aggressively clean NCERT/School-book style PDF text with OCR noise."""
    # Normalize spaces/newlines
    text = re.sub(r'\s+', ' ', text)

    # Remove OCR garbled "CChhaapptteerr" like strings
    text = re.sub(r'C+H*A+P+T+E+R+.*?\d+', '', text, flags=re.IGNORECASE)

    # Remove lines with .indd and timestamps (OCR file tags)
    text = re.sub(r'\.?i+n+d+d+\s*\d+.*?(AM|PM)?', '', text, flags=re.IGNORECASE)

    # Remove publisher footer lines
    text = re.sub(r'Curiosity.*?(Grade|Gr\.a\.d\.e)', '', text, flags=re.IGNORECASE)

    # Remove "Chapter ..." repeated headers (even broken ones)
    text = re.sub(r'Chapter\s+The.*?Solutions', '', text, flags=re.IGNORECASE)

    # Remove random numbers/dates like //2288//22002255 ::0066::3366
    text = re.sub(r'[/:\d]+', '', text)

    # Collapse multiple punctuation (.... → . , ??? → ? , !!! → !)
    text = re.sub(r'([.?!])\1+', r'\1', text)

    # Remove bullets/list markers
    text = re.sub(r'[\\•\·\●\-\–\—\»\"z]', '', text)

    # Normalize ligatures
    text = text.replace('ﬁ', 'fi').replace('ﬂ', 'fl')

    return text.strip()


def pdf_to_sentences(pdf_path: str):
    """Extract sentences from PDF and clean them."""
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_text = page.extract_text()
            if not raw_text:
                continue
            # Clean and split into sentences
            cleaned = clean_text(raw_text)
            sentences = sent_tokenize(cleaned)
            for s in sentences:
                s = s.strip()
                if len(s.split()) > 5:  # skip very tiny fragments
                    all_sentences.append(s)
    return all_sentences

In [10]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
ch9_sentences = pdf_to_sentences("/content/drive/MyDrive/ColabContent/ch9.pdf")

In [12]:
import faiss
import numpy as np

ch9_embeddings = model.encode(ch9_sentences, convert_to_tensor=False) # Convert to numpy array for FAISS

# create a FAISS index
d = ch9_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(ch9_embeddings)

print(f"Created FAISS index with {index.ntotal} vectors.")

Created FAISS index with 323 vectors.


In [13]:
my_question = "What is a solute and a solvent?"
my_question_embd = model.encode(my_question, convert_to_tensor=True)

In [14]:
import torch

# Encode the question and convert to numpy
my_question_embd_np = model.encode([my_question], convert_to_tensor=False)

# Search the FAISS index
top_k = 5
distances, indices = index.search(my_question_embd_np, top_k)

print("Question:", my_question)
print(f"Top {top_k} Matches:")
for i in range(top_k):
    print(f"- {ch9_sentences[indices[0][i]]}")

Question: What is a solute and a solvent?
Top 5 Matches:
- What Are Solute, Solvent, and Solution?
- In a solution formed by mixing two liquids, the component present in less quantity is known as solute and the other component is called solvent.
- Solvent Solute + Solvent Solution When a solution is formed by mixing two liquids, it is not always clear which substance is dissolving the other.
- Magnified schematic picture of a cases, the substance present in smaller solute evenly distributed in a solvent amount is called the solute, while the one in larger amount is called the solvent.
- Whenever a solid is mixed with a liquid to form a solution, the solid component is called the solute, and the liquid component is called the solvent.
