<a href="https://colab.research.google.com/github/LuckyBoy587/SchoolaNova/blob/main/BERT_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by every retrieval cell in this notebook.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Small in-memory corpus of NCERT-like paragraphs to search over.
docs = [
    "Photosynthesis is the process by which green plants make food using sunlight, carbon dioxide, and water.",
    "Respiration is the process by which living organisms release energy from food molecules.",
    "Chlorophyll is a green pigment in plants that helps capture sunlight for photosynthesis."
]

# Sample question to match against the corpus.
question = "How do plants cook themselves?"

# Embed the question and the paragraphs as tensors.
query_embedding = model.encode(question, convert_to_tensor=True)
doc_embeddings = model.encode(docs, convert_to_tensor=True)

# Cosine similarity between the question and each paragraph.
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)
print(cosine_scores)

# Report the paragraph with the highest similarity score.
best_idx = cosine_scores.argmax()
print("Question:", question)
print("Best Match:", docs[best_idx])


In [None]:
def split_into_chunks(text: str) -> list[str]:
    """Split *text* into one chunk per non-blank line.

    Blank and whitespace-only lines are dropped: they carry no content, yet
    each would otherwise be embedded and could be returned as a match.

    Args:
        text: The full text to split.

    Returns:
        The non-blank lines of *text*, in order, without line terminators.
    """
    return [line for line in text.splitlines() if line.strip()]

In [None]:
file_chunks = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("/content/drive/MyDrive/ColabContent/photosynthesis.txt", encoding="utf-8") as f:
    file_chunks = split_into_chunks(f.read())

# Show the chunks, one per line (the file is already closed at this point).
print(*file_chunks, sep='\n')

In [None]:
# Question that is matched against the chunks read from photosynthesis.txt.
# NOTE(review): "photosysnthesis" is misspelled and "came" should read "come";
# confirm whether the rough wording is intentional before correcting it.
query = "Where did the name photosysnthesis came from?"

In [None]:
# Embed the query and every file chunk as tensors.
query_embedding = model.encode(query, convert_to_tensor=True)
chunk_embeddings = model.encode(file_chunks, convert_to_tensor=True)

# Score the query against each chunk and keep the position of the top score.
cosine_scores = util.cos_sim(query_embedding, chunk_embeddings)
best_idx = cosine_scores.argmax()

print("Question:", query)
print("Best Match:", file_chunks[best_idx])

In [None]:
# Install pdfplumber, which the PDF-extraction cell imports.
!pip install pdfplumber

In [None]:
import pdfplumber, re
from nltk.tokenize import sent_tokenize

# Patterns are compiled once at import time; clean_text runs once per PDF page.
_WHITESPACE_RE = re.compile(r'\s+')
# OCR-doubled "CChhaapptteerr ..." strings, up to and including the next number.
_GARBLED_CHAPTER_RE = re.compile(r'C+H*A+P+T+E+R+.*?\d+', re.IGNORECASE)
# ".indd <number>" file tags, plus a directly following date/time stamp that
# ends in AM/PM. The optional tail only spans date/time characters so it can
# never swallow ordinary prose.
_INDD_TAG_RE = re.compile(
    r'\.?i+n+d+d+\s*\d+(?:[\s\d/:.\-]*(?:A+M+|P+M+)\b)?',
    re.IGNORECASE,
)
_PUBLISHER_FOOTER_RE = re.compile(r'Curiosity.*?(Grade|Gr\.a\.d\.e)', re.IGNORECASE)
_RUNNING_HEADER_RE = re.compile(r'Chapter\s+The.*?Solutions', re.IGNORECASE)
_DIGIT_RUN_RE = re.compile(r'[/:\d]+')
_REPEATED_PUNCT_RE = re.compile(r'([.?!])\1+')
_LIST_MARKER_RE = re.compile(r'[\\•·●\-–—»"]')
# A lone "z" token (presumably how the PDF's bullet glyph is extracted).
# Matching it only when it stands alone keeps the letter inside real words.
_BULLET_Z_RE = re.compile(r'(?<!\w)z(?!\w)')


def clean_text(text: str) -> str:
    """Aggressively clean NCERT/School-book style PDF text with OCR noise.

    The steps run in a fixed order: whitespace is collapsed to single spaces,
    header/footer/file-tag patterns are removed, digits/slashes/colons are
    stripped, repeated sentence punctuation is collapsed, list markers are
    removed and the "fi"/"fl" ligatures are expanded.

    Args:
        text: Raw text of one page (or any string).

    Returns:
        The cleaned text with single spaces and no leading/trailing whitespace.
    """
    # Normalize spaces/newlines so every later pattern sees a single line.
    text = _WHITESPACE_RE.sub(' ', text)

    # Remove OCR garbled "CChhaapptteerr" like strings
    text = _GARBLED_CHAPTER_RE.sub('', text)

    # Remove .indd file tags together with their timestamp
    text = _INDD_TAG_RE.sub('', text)

    # Remove publisher footer lines
    text = _PUBLISHER_FOOTER_RE.sub('', text)

    # Remove "Chapter ..." repeated headers (even broken ones)
    text = _RUNNING_HEADER_RE.sub('', text)

    # Remove random numbers/dates like //2288//22002255 ::0066::3366.
    # NOTE(review): this strips every digit, slash and colon, including
    # quantities in the body text -- confirm that this loss is acceptable.
    text = _DIGIT_RUN_RE.sub('', text)

    # Collapse multiple punctuation (.... -> . , ??? -> ? , !!! -> !)
    text = _REPEATED_PUNCT_RE.sub(r'\1', text)

    # Remove bullets/list markers (symbol characters, then a stand-alone "z")
    text = _LIST_MARKER_RE.sub('', text)
    text = _BULLET_Z_RE.sub('', text)

    # Normalize ligatures (U+FB01 "fi", U+FB02 "fl")
    text = text.replace('\ufb01', 'fi').replace('\ufb02', 'fl')

    # The removals above leave runs of spaces behind; collapse them again.
    return _WHITESPACE_RE.sub(' ', text).strip()


def pdf_to_sentences(pdf_path: str, max_fragment_words: int = 5) -> list[str]:
    """Extract cleaned sentences from every page of a PDF.

    Each page is cleaned with ``clean_text`` and split with ``sent_tokenize``
    independently, so a sentence that continues across a page break is
    returned as two pieces.

    Args:
        pdf_path: Path of the PDF file to read.
        max_fragment_words: Sentences with this many whitespace-separated
            words or fewer are treated as fragments and dropped.

    Returns:
        The kept sentences, stripped of surrounding whitespace, in page order.
    """
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_text = page.extract_text()
            if not raw_text:
                # Nothing was extracted for this page; skip it.
                continue
            # Clean and split into sentences
            stripped = (s.strip() for s in sent_tokenize(clean_text(raw_text)))
            # Skip very tiny fragments
            all_sentences.extend(
                s for s in stripped if len(s.split()) > max_fragment_words
            )
    return all_sentences

In [None]:
# Show every sentence extracted from ch9.pdf, one per line.
print("\n".join(pdf_to_sentences("/content/drive/MyDrive/ColabContent/ch9.pdf")))

In [None]:
import nltk
# Download the 'punkt_tab' NLTK resource.
# NOTE(review): presumably the tokenizer data that sent_tokenize relies on; if
# so, this cell has to run before any call to pdf_to_sentences.
nltk.download('punkt_tab')

In [None]:
# Cleaned sentences of ch9.pdf; these are the texts that get embedded and indexed.
ch9_sentences = pdf_to_sentences("/content/drive/MyDrive/ColabContent/ch9.pdf")

In [None]:
import faiss
import numpy as np

# Embed every sentence; convert_to_tensor=False is passed so the embeddings
# are returned in the NumPy form that is handed to FAISS below.
ch9_embeddings = model.encode(ch9_sentences, convert_to_tensor=False)

# Build a flat L2 index whose dimensionality matches the embedding width,
# then add all sentence vectors to it.
d = ch9_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(ch9_embeddings)

print(f"Created FAISS index with {index.ntotal} vectors.")

In [None]:
# Question to look up in the FAISS index.
my_question = "What is a solute and a solvent?"
# Tensor embedding of the question.
# NOTE(review): the visible search cell re-encodes the question as NumPy and
# does not read this variable -- confirm whether it is still needed.
my_question_embd = model.encode(my_question, convert_to_tensor=True)

In [None]:
import torch

# Encode the question as a one-element batch in NumPy form for index.search.
my_question_embd_np = model.encode([my_question], convert_to_tensor=False)

# Search the FAISS index
top_k = 5
distances, indices = index.search(my_question_embd_np, top_k)

print("Question:", my_question)
print(f"Top {top_k} Matches:")
# FAISS fills the result with -1 when the index holds fewer than top_k
# vectors; skip those ids, otherwise ch9_sentences[-1] would silently print
# the last sentence as if it were a match.
for sentence_idx in indices[0]:
    if sentence_idx < 0:
        continue
    print(f"- {ch9_sentences[sentence_idx]}")

In [None]:
# Install the CPU build of FAISS.
# NOTE(review): this cell appears after the cell that runs `import faiss`; on a
# fresh runtime it presumably has to be executed before that import.
!pip install faiss-cpu