In [None]:
import torch
# Print whether PyTorch reports a usable CUDA device (True / False).
print(torch.cuda.is_available())



In [None]:
import os
# !pip install PyMuPDF
# !pip install tqdm
# !pip install accelerate
# !pip install bitsandbytes
# !pip install flash-attn --no-build-isolation

In [None]:

# !pip install -U transformers sentence-transformers

In [None]:
import os
import fitz  
import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm

# Point pytesseract at the default Windows install location, but only when that
# binary actually exists. On any other machine the assignment is skipped, so
# pytesseract keeps its own default command instead of a path that cannot work.
_WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD

pdf_folder = "data"  # directory scanned for *.pdf files
all_text = ""        # accumulates the text of every processed PDF

def extract_text_pymupdf(pdf_path):
    """Return the embedded text layer of a PDF, concatenated over all pages.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The plain text of every page joined in page order, with leading and
        trailing whitespace stripped.
    """
    # The context manager closes the document even if a page fails to extract,
    # so the file handle is not leaked across the many PDFs processed in a run.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "".join(page_texts).strip()

def extract_text_ocr(pdf_path):
    """OCR every page of a PDF and return the recognised English text.

    The PDF is rasterised at 300 dpi with ``convert_from_path`` and each page
    image is passed to ``pytesseract.image_to_string``. The per-page strings are
    concatenated in order and the result is stripped of surrounding whitespace.
    """
    page_images = convert_from_path(pdf_path, dpi=300)
    recognised = [pytesseract.image_to_string(image, lang="eng") for image in page_images]
    return "".join(recognised).strip()

# Every entry of the data folder whose name ends in ".pdf" (case-sensitive match).
pdf_files = [name for name in os.listdir(pdf_folder) if name.endswith(".pdf")]

for pdf in tqdm(pdf_files, desc="Processing PDFs"):
    path = os.path.join(pdf_folder, pdf)
    print(f"\n Processing {pdf}...")

    text = extract_text_pymupdf(path)

    # A very short text layer, or one containing the "❈" glyph, is treated as
    # corrupted; such files are re-read through OCR instead.
    text_layer_unusable = len(text) < 1000 or "❈" in text
    if text_layer_unusable:
        print(" Using OCR for this file...")
        text = extract_text_ocr(path)

    # Separate documents with a banner carrying the file name.
    all_text = all_text + f"\n\n========== {pdf} ==========\n\n{text}"

print("\n All PDFs combined!")
print(all_text[:200])  # preview


In [None]:
import sys
# Show the path of the Python interpreter this kernel is running.
print(sys.executable)


In [None]:
import fitz
import os
import pytesseract
from pdf2image import convert_from_path
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Flatten a block of text onto one line.

    Every newline character is replaced by a single space, then leading and
    trailing whitespace is removed. Runs of spaces inside the text are kept.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def extract_text_ocr(pdf_path):
    """OCR a whole PDF and return the raw recognised English text.

    Pages are rasterised at 300 dpi and read with pytesseract; the per-page
    strings are concatenated in page order. Unlike the earlier definition of the
    same name in this notebook, the result is NOT stripped.
    """
    return "".join(
        pytesseract.image_to_string(image, lang="eng")
        for image in convert_from_path(pdf_path, dpi=300)
    )

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF page by page and return per-page text plus size statistics.

    Each page's text layer is extracted and flattened with ``text_formatter``.
    When the result is shorter than 50 characters or contains the "❈" glyph,
    that single page is rasterised at 300 dpi and read with Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One dict per page, in page order, with the keys ``page_number``
        (zero-based index within this file), ``page_char_count``,
        ``page_word_count``, ``page_sentence_count`` (number of pieces after
        splitting on ". "), ``page_token_count`` (``len(text) / 4`` estimate)
        and ``text``.
    """
    pdf_name = os.path.basename(pdf_path)
    pages_and_text = []

    # Context manager: the document handle is released even when extraction or
    # OCR raises part-way through, instead of staying open for the session.
    with fitz.open(pdf_path) as doc:
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())

            # If text looks corrupted → fallback to OCR for that page only.
            if len(text) < 50 or "❈" in text:
                print(f"OCR used on page {page_number+1} of {pdf_name}")
                # pdf2image page numbers are 1-based, hence the +1.
                images = convert_from_path(
                    pdf_path, dpi=300, first_page=page_number+1, last_page=page_number+1
                )
                text = text_formatter(pytesseract.image_to_string(images[0], lang="eng"))

            pages_and_text.append({
                'page_number': page_number,
                'page_char_count': len(text),
                'page_word_count': len(text.split()),
                'page_sentence_count': len(text.split('. ')),
                'page_token_count': len(text) / 4,
                'text': text
            })

    return pages_and_text


In [None]:
pdf_folder = "data"
all_pages = []

# Read every "*.pdf" entry of the data folder and pool the per-page records of
# all files into one flat list.
for file in (name for name in os.listdir(pdf_folder) if name.endswith(".pdf")):
    pdf_path = os.path.join(pdf_folder, file)
    print(f"\nProcessing {file}")
    all_pages += open_and_read_pdf(pdf_path)

all_pages[:2]


In [None]:
import random
# Spot-check two randomly chosen page records (unseeded, so the pick varies per run).
random.sample(all_pages, 2)

In [None]:
import pandas as pd
# One row per page record; columns are the keys of the dicts in `all_pages`.
df  = pd.DataFrame(all_pages)
df.head(3)

In [None]:
# Summary statistics of the numeric per-page columns, rounded to 2 decimals.
df.describe().round(2)

In [None]:
 # Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram with KDE overlay for each per-page size metric of `df`, on a 2x2 grid.
cols = ["page_char_count", "page_word_count", "page_sentence_count", "page_token_count"]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, col in zip(axes.flat, cols):
    sns.histplot(df[col], kde=True, bins=20, ax=ax)
    ax.set_title(col.replace("_", " ").title())
fig.tight_layout()
plt.show()


In [None]:
# Box plot of the estimated token count per page, to expose outlier pages.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=df["page_token_count"], ax=ax)
ax.set_title("Page Token Count Distribution")
plt.show()


In [None]:
# Top 5 longest pages by estimated token count (page_token_count = len(text) / 4).
df.nlargest(5, "page_token_count")[["page_number", "page_token_count", "page_word_count", "page_char_count"]]


In [None]:
# Top 5 shortest pages by estimated token count (page_token_count = len(text) / 4).
df.nsmallest(5, "page_token_count")[["page_number", "page_token_count", "page_word_count", "page_char_count"]]


In [None]:
import numpy as np
def estimate_chunks(tokens, chunk_size=300, overlap=50):
    """Estimate how many overlapping chunks are needed to cover `tokens` tokens.

    Consecutive chunks start ``chunk_size - overlap`` tokens apart, so ``k``
    chunks cover ``chunk_size + (k - 1) * (chunk_size - overlap)`` tokens.

    Args:
        tokens: Length of the text in tokens (may be fractional).
        chunk_size: Maximum tokens per chunk.
        overlap: Tokens shared between consecutive chunks.

    Returns:
        0 for empty input, 1 for any non-empty input that fits in one chunk,
        otherwise the smallest chunk count that covers the whole text.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    if tokens <= 0:
        return 0
    # Any non-empty text that fits in one chunk needs exactly one chunk. The bare
    # formula below would report 0 whenever tokens <= overlap.
    if tokens <= chunk_size:
        return 1
    return int(np.ceil((tokens - overlap) / step))

# Add one estimated-chunk-count column per (chunk_size, overlap) setting, in tokens.
for size, ovl in [(300, 50), (250, 40), (350, 60)]:
    df[f"est_chunks_{size}_{ovl}"] = df["page_token_count"].apply(estimate_chunks, args=(size, ovl))

df[["page_number", "page_token_count", "est_chunks_300_50", "est_chunks_250_40", "est_chunks_350_60"]].head()


In [None]:
# Compare the distribution of estimated chunks per page across the three settings.
df[["est_chunks_300_50", "est_chunks_250_40", "est_chunks_350_60"]].describe()


In [None]:
# Chunking Strategy
# 5 types of chunking
# 1 Fixed-Size Chunking
# 2 Semantic Chunking
# 3 Recursive
# 4 Structural
# 5 LLM Chunking




In [None]:
# Fixed-Size Chunking
# Define a fixed size, e.g. 200
# Fast processing
# It loses some information: semantic meaning breaks and context is lost

In [None]:
# Semantic Chunking
# First define the level of organization:
#           1. Sentence level: compare sentence 1 with sentence 2, and so on
#               sentence 1 is converted to a vector embedding
#               sentence 2 is converted to a vector embedding
#               check the similarity score between these two embeddings against a threshold value, e.g. > 0.7
#               if the similarity score is greater than the defined threshold, add that sentence to chunk 1
#               otherwise start the 2nd chunk, and so on
#        Advantage: Improved quality
#        Disadvantage: High complexity and compute, threshold sensitivity



In [None]:
# 4 Structural Chunking (only if the data is structured)
# Split the report into chunks
# Chunk A: Letter to shareholders
# Chunk B: Introduction
# Chunk C: Company Overview, and so on
# Advantage: Fast, good for structured data
# Disadvantage: Huge chunks, hallucination problems

In [None]:
# Recursive Chunking:
#       chunk structurally first; if a chunk's token count is greater than the defined max size, chunk it again sentence-wise or paragraph-wise
#

In [None]:
# LLM Chunking
#

# Testing Chunking Strategies

In [None]:
def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200) -> list[str]:
    """Split text into overlapping fixed-size character windows.

    Whitespace is normalised first (any run of whitespace becomes one space).
    Windows are ``chunk_size`` characters long and start ``chunk_size - overlap``
    characters apart; each window is stripped and empty windows are dropped.
    Character length is used as a rough stand-in for token count.

    Args:
        text: Text to split.
        chunk_size: Window length in characters; must be positive.
        overlap: Characters shared by consecutive windows; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for empty/blank text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check, ``overlap >= chunk_size`` would never advance the
            window and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    text = " ".join(text.split())  # normalize whitespace
    n = len(text)
    step = chunk_size - overlap

    chunks = []
    for start in range(0, n, step):
        end = start + chunk_size
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Once a window reaches the end of the text, later windows would only
        # repeat its tail.
        if end >= n:
            break

    return chunks


def chunk_pdf_pages(pages_and_text: list, chunk_size: int = 1200, overlap: int = 200) -> list[dict]:
    """Chunk every page record and return one flat list of chunk records.

    Each page's ``"text"`` is split with ``chunk_text``; chunks never span two
    pages. Every chunk record carries the page's ``page_number``, the chunk's
    position within that page (``chunk_index``, restarting at 0 per page), its
    character and word counts, a ``len / 4`` token estimate and the chunk text.
    """

    def _records_for(page):
        # Build the chunk records of a single page.
        page_number = page["page_number"]
        pieces = chunk_text(page["text"], chunk_size=chunk_size, overlap=overlap)
        return [
            {
                "page_number": page_number,
                "chunk_index": position,
                "chunk_char_count": len(piece),
                "chunk_word_count": len(piece.split()),
                "chunk_token_count": len(piece) / 4,  # rough estimate
                "chunk_text": piece,
            }
            for position, piece in enumerate(pieces)
        ]

    all_chunks = []
    for page in pages_and_text:
        all_chunks.extend(_records_for(page))
    return all_chunks


# Roughly 300 tokens per chunk (≈ 1200 chars) with about 50 tokens (≈ 200 chars) of overlap.
chunked_pages = chunk_pdf_pages(all_pages, chunk_size=1200, overlap=200)
print("Total chunks:", len(chunked_pages))

first_chunk = chunked_pages[0]
print("First chunk page:", first_chunk["page_number"], first_chunk["chunk_text"][:200])


In [None]:
# ================================
# CHUNK ANALYSIS, VISUALIZATION
# + RANDOM CHUNK TEXT PREVIEW
# ================================

import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import random
import re

# ----------------
# Convert to DataFrame (one row per chunk record)
# ----------------
df_chunks = pd.DataFrame(chunked_pages)

# ----------------
# Basic sanity checks
# ----------------
# NOTE(review): page_number restarts at 0 for every PDF, so with several input
# files "Total Pages" counts distinct page indices, not distinct pages — confirm intent.
print("Total Chunks:", len(df_chunks))
print("Total Pages:", df_chunks["page_number"].nunique())
print("Empty chunks:", df_chunks["chunk_text"].str.strip().eq("").sum())

# ----------------
# Aggregate Metrics (column sums, truncated to int)
# ----------------
total_chars, total_words, total_tokens = (
    int(df_chunks[column].sum())
    for column in ("chunk_char_count", "chunk_word_count", "chunk_token_count")
)

print("\n--- Overall Metrics ---")
print("Total Characters:", total_chars)
print("Total Words:", total_words)
print("Total Tokens (approx):", total_tokens)

# ----------------
# Statistical Summary
# ----------------
print("\n--- Chunk Statistics ---")
size_columns = ["chunk_char_count", "chunk_word_count", "chunk_token_count"]
display(df_chunks[size_columns].describe())

# ----------------
# Distribution Plots: one 30-bin histogram per chunk size metric
# ----------------
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
histogram_specs = [
    ("chunk_char_count", "Characters per Chunk"),
    ("chunk_word_count", "Words per Chunk"),
    ("chunk_token_count", "Tokens per Chunk"),
]
for ax, (column, title) in zip(axes, histogram_specs):
    ax.hist(df_chunks[column], bins=30)
    ax.set_title(title)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# ----------------
# Chunk Index vs Word Count
# ----------------
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(df_chunks["chunk_index"], df_chunks["chunk_word_count"], alpha=0.6)
ax.set_xlabel("Chunk Index")
ax.set_ylabel("Word Count")
ax.set_title("Chunk Index vs Word Count")
plt.show()

# ----------------
# Word Frequency Analysis
# ----------------
# A small hand-picked stopword list keeps function words out of the ranking.
stopwords = {
    "the", "and", "a", "an", "of", "to", "in", "is", "it", "for", "on", "that", "this",
    "with", "as", "are", "was", "be", "by", "or", "from", "at", "which", "we", "you"
}

# Lower-case the whole corpus and keep alphabetic runs of at least two letters.
corpus = " ".join(df_chunks["chunk_text"]).lower()
tokens = [word for word in re.findall(r"[a-zA-Z]{2,}", corpus) if word not in stopwords]

word_freq = Counter(tokens)
word_freq_df = pd.DataFrame(word_freq.items(), columns=["word", "count"])
word_freq_df = word_freq_df.sort_values(by="count", ascending=False)

print("\n--- Top 20 Frequent Words (cleaned) ---")
display(word_freq_df.head(20))

# ----------------
# Top Words Visualization
# ----------------
top_words = word_freq_df.head(20)

fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(top_words["word"], top_words["count"])
ax.set_xlabel("Frequency")
ax.set_title("Top 20 Words Across All Chunks")
ax.invert_yaxis()  # most frequent word at the top
plt.show()

# =================================================
# RANDOM CHUNK TEXT INSPECTION
# =================================================
def show_random_chunks(df, num_chunks=3, max_sentences=2, seed=42):
    """Print a short, reproducible preview of randomly selected chunk rows.

    Args:
        df: Frame with ``chunk_text``, ``page_number``, ``chunk_index``,
            ``chunk_char_count`` and ``chunk_word_count`` columns.
        num_chunks: Rows to preview (capped at ``len(df)``).
        max_sentences: Leading sentences printed per chunk.
        seed: Seed of the private ``random.Random`` used for the selection, so
            the same rows are shown on every call.

    Returns:
        None; everything is written to stdout.
    """
    print("\n--- Sample Chunk Content Preview ---\n")

    row_count = len(df)
    picked_positions = random.Random(seed).sample(range(row_count), min(num_chunks, row_count))
    divider = "\n" + "-" * 60 + "\n"

    for position in picked_positions:
        row = df.iloc[position]
        body = row["chunk_text"]

        # Sentences end at ., ! or ? followed by whitespace.
        leading_sentences = re.split(r"(?<=[.!?])\s+", body)[:max_sentences]
        leading_words = body.split()[:20]

        print(f"Chunk ID: {position}")
        print(f"Page: {row['page_number']} | Chunk Index: {row['chunk_index']}")
        print(f"Chars: {row['chunk_char_count']} | Words: {row['chunk_word_count']}")
        print("\nExample Sentences:")
        for sentence in leading_sentences:
            print("-", sentence)
        print("\nExample Words:")
        print(leading_words)
        print(divider)

# ----------------
# Show random chunk samples (seeded inside the function, so the same 3 rows
# are previewed on every run)
# ----------------
show_random_chunks(df_chunks, num_chunks=3)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the
# kernel session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# NOTE: rebinds `df` — until here it held the per-page frame; from now on it
# holds one row per chunk.
df = pd.DataFrame(chunked_pages)
df.describe().round(2)

# Embedding

In [None]:
import torch
# Report name and total memory (GiB) of CUDA device 0. Querying device 0 raises
# when no CUDA device is present, so check first and keep the cell runnable on
# CPU-only machines.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print(torch.cuda.get_device_properties(0).total_memory / 1024**3, "GB")
else:
    print("CUDA is not available; no GPU details to report.")


In [None]:
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available and fall back to CPU otherwise, instead of
# failing on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("intfloat/e5-base-v2", device=embedding_device)

# Documents are embedded with the "passage: " prefix; the retrieval cells below
# embed questions with the matching "query: " prefix.
passages = ["passage: " + c["chunk_text"] for c in chunked_pages]

# normalize_embeddings=True yields unit-length vectors, so the inner-product
# index built later ranks by cosine similarity.
embeddings = model.encode(
    passages,
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True
)


# Save Embeddings to File

In [None]:
import numpy as np
import faiss
import pandas as pd

# build index
# The embeddings are converted to a float32 array before being added; the
# flat inner-product index is sized from the array's second dimension.
embeddings_np = np.array(embeddings, dtype="float32")
dim = embeddings_np.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings_np)

print("FAISS index size:", index.ntotal)

# save index + metadata
# Row i of chunks.csv describes vector i of the index: both follow the order of
# `chunked_pages`, which is what lets search() map a FAISS id back to its chunk.
faiss.write_index(index, "faiss.index")
pd.DataFrame(chunked_pages).to_csv("chunks.csv", index=False)


In [None]:
import faiss
import pandas as pd

index = faiss.read_index("faiss.index")

# keep_default_na=False: by default read_csv turns cells such as "NA" or "null"
# into float NaN, which would break the string slicing done on chunk_text later.
chunked_pages = pd.read_csv("chunks.csv", keep_default_na=False).to_dict("records")

# FAISS ids are positions in `chunked_pages`, so the two files must describe
# the same number of chunks; a mismatch means stale files on disk.
assert index.ntotal == len(chunked_pages), (
    f"faiss.index holds {index.ntotal} vectors but chunks.csv has {len(chunked_pages)} rows"
)

print("FAISS index size:", index.ntotal)


In [None]:
import faiss
import pandas as pd

# load index + chunks
index = faiss.read_index("faiss.index")

# keep_default_na=False: by default read_csv turns cells such as "NA" or "null"
# into float NaN, which would break the string slicing done on chunk_text later.
chunked_pages = pd.read_csv("chunks.csv", keep_default_na=False).to_dict("records")

# FAISS ids are positions in `chunked_pages`, so both files must have the same
# number of entries; a mismatch means stale files on disk.
assert index.ntotal == len(chunked_pages), (
    f"faiss.index holds {index.ntotal} vectors but chunks.csv has {len(chunked_pages)} rows"
)

# now you can use search() and answer_question()


In [None]:
import pandas as pd
import json

# Store each embedding as a JSON-encoded list next to its chunk metadata.
df = pd.DataFrame(chunked_pages).assign(
    embedding=[json.dumps(vector.tolist()) for vector in embeddings]
)

df.to_csv("chunks_with_embeddings.csv", index=False)

# load + preview
df_loaded = pd.read_csv("chunks_with_embeddings.csv")

# decode the JSON strings back into Python lists of floats
df_loaded["embedding"] = df_loaded["embedding"].map(json.loads)

df_loaded.sample(5)


# Convert Embeddings to Tensors

# 5 Retrieval Step

In [None]:
def search(query, top_k=5):
    """Return the chunks most similar to `query`, best match first.

    The query is embedded with the E5 "query: " prefix and normalisation, then
    looked up in the module-level FAISS `index`. Hit ids are positions in the
    module-level `chunked_pages` list.

    Args:
        query: Natural-language question.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with ``score``, ``page``, ``chunk_index`` and ``text``
        (first 400 characters of the chunk). It may be shorter than ``top_k``
        when the index holds fewer vectors.
    """
    # E5 requires query prefix
    q_emb = model.encode(["query: " + query], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")

    scores, idxs = index.search(q_emb, top_k)

    results = []
    for score, idx in zip(scores[0], idxs[0]):
        # FAISS pads missing hits with id -1; as a Python list index that would
        # silently select the LAST chunk, so padded slots are skipped.
        if idx < 0:
            continue
        item = chunked_pages[idx]
        results.append({
            "score": float(score),
            "page": item["page_number"],
            "chunk_index": item["chunk_index"],
            "text": item["chunk_text"][:400]
        })
    return results


In [None]:
# Retrieval smoke test: print score, location and text of the top 5 hits.
results = search("What will happen if we kill all the organisms in one trophic level?", top_k=5)
separator = "-" * 60
for r in results:
    print(f"{r['score']} page {r['page']} chunk {r['chunk_index']}")
    print(r["text"])
    print(separator)


# Part 6 Generation Of RAG

In [None]:
# import sys
# !{sys.executable} -m pip install openai

# import sys
# !{sys.executable} -m pip install python-dotenv


from openai import OpenAI
from dotenv import load_dotenv
from openai import OpenAI
import os

# Load variables from a .env file into the environment before the client is
# created (presumably this is where the API key comes from — confirm).
load_dotenv() 

# No credentials are passed explicitly; the client is built from its defaults.
client = OpenAI()





In [None]:
def search(query, top_k=5):
    """Return the chunks most similar to `query`, best match first.

    The query is embedded with the E5 "query: " prefix and normalisation, then
    looked up in the module-level FAISS `index`. This definition replaces the
    earlier `search`: it also returns each hit's position in `chunked_pages`
    (``idx``) and the full chunk text instead of a 400-character preview.

    Args:
        query: Natural-language question.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with ``idx``, ``score``, ``page``, ``chunk_index`` and
        ``text``. It may be shorter than ``top_k`` when the index holds fewer
        vectors.
    """
    q_emb = model.encode(["query: " + query], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")

    scores, idxs = index.search(q_emb, top_k)
    results = []
    for score, idx in zip(scores[0], idxs[0]):
        # FAISS pads missing hits with id -1; as a Python list index that would
        # silently select the LAST chunk (and neighbour expansion would then
        # work from idx -1), so padded slots are skipped.
        if idx < 0:
            continue
        item = chunked_pages[idx]
        results.append({
            "idx": int(idx),
            "score": float(score),
            "page": item["page_number"],
            "chunk_index": item["chunk_index"],
            "text": item["chunk_text"]
        })
    return results


def expand_with_neighbors(results, neighbor=1):
    """Widen a hit list with the chunks stored right before/after each hit.

    For every hit, the positions ``idx - neighbor .. idx + neighbor`` in the
    module-level `chunked_pages` list are collected. Positions outside the list
    are dropped, duplicates are merged, and the result is ordered by position
    (not by score; the returned records carry no ``score``).

    Returns:
        A list of dicts with ``idx``, ``page``, ``chunk_index`` and ``text``.
    """
    wanted = set()
    for hit in results:
        center = hit["idx"]
        wanted.add(center)
        wanted.update(
            center + delta
            for distance in range(1, neighbor + 1)
            for delta in (distance, -distance)
        )

    total = len(chunked_pages)
    in_range = sorted(position for position in wanted if 0 <= position < total)

    expanded = []
    for position in in_range:
        record = chunked_pages[position]
        expanded.append({
            "idx": position,
            "page": record["page_number"],
            "chunk_index": record["chunk_index"],
            "text": record["chunk_text"]
        })
    return expanded


def answer_question(query, top_k=5, neighbor=1, model_name="gpt-4o-mini"):
    """Answer `query` from retrieved notebook chunks using an OpenAI model.

    Steps: retrieve `top_k` chunks with `search`, widen them with
    `expand_with_neighbors`, number the chunks into a context string, and send
    the question plus context through the module-level `client`.

    Args:
        query: The user's question.
        top_k: Hits requested from the vector index.
        neighbor: Adjacent chunks added on each side of every hit.
        model_name: Model identifier passed to the API.

    Returns:
        ``(answer_text, sources)`` where ``sources`` is the expanded chunk list
        in the order used for the ``[n]`` citation numbers.
    """
    # retrieve + expand
    results = expand_with_neighbors(search(query, top_k=top_k), neighbor=neighbor)

    # build context: each block is "[n] (page P, chunk C)" followed by the chunk text
    context = "\n\n---\n\n".join(
        f"[{i}] (page {r['page']}, chunk {r['chunk_index']})" + "\n" + r["text"]
        for i, r in enumerate(results, 1)
    )

    system_msg = (
        "You are a helpful assistant for a class 10 science notebook. "
        "Answer only from the provided context. "
        "Use inline citations like [1], [2]. "
        "If the answer is not in the context, say: 'I don't know from the provided notes.'"
    )
    user_msg = f"Question: {query}\n\nContext:\n{context}"

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    response = client.responses.create(model=model_name, input=messages)

    return response.output_text, results


In [None]:
# End-to-end check: retrieve 5 hits, add one neighbouring chunk on each side,
# and print the generated answer. `sources` keeps the chunks behind the [n] citations.
answer, sources = answer_question(
    "What will happen if we kill all the organisms in one trophic level?",
    top_k=5,
    neighbor=1
)
print(answer)
