In [None]:
from unstructured.partition.pdf import partition_pdf
from sentence_transformers import SentenceTransformer
import sqlite3
import json
import sqlite3
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
import torch
import sounddevice as sd
from transformers import pipeline
import openai
import my_secrets
import subprocess
import os

# STEP-1
This code extracts text and metadata from a PDF file using the `unstructured` library.

1. **PDF Partitioning**: It uses `partition_pdf` with the `"hi_res"` strategy to read and parse the provided PDF file (`"BOOK.pdf"`).
2. **Extract Elements**: It extracts the elements from the PDF into a list of objects, each representing a distinct part of the PDF (such as text or images).
3. **Chunk Creation**: For each element, a dictionary is created containing:
   - `type`: The class name of the element (e.g., title, narrative text).
   - `text`: The actual text content of the element.
   - `page_number`: The page number where the element is located (if available).
4. **Output**: Finally, it prints the total number of chunks and up to the first 100 chunks to verify the extracted data.


# STEP-2
This code generates embeddings for each chunk of text using a pre-trained Sentence-BERT model.

1. **Model Loading**: The `SentenceTransformer` class is used to load the pre-trained model `"all-MiniLM-L6-v2"`, which is a lightweight model for generating sentence embeddings.
2. **Embedding Generation**: For each chunk in the `chunks` list, the text is passed through the model to generate a vector (embedding) that represents the semantic meaning of the text.
3. **Storing Embeddings**: The resulting embeddings, which are in the form of a NumPy array, are converted into a list (`tolist()`) to make them suitable for storage or further processing.


# STEP-3
This code connects to an SQLite database and stores chunk data, including text, type, page number, and embeddings, into a table.

1. **Database Connection**: The `sqlite3.connect()` method is used to connect to the SQLite database named `"BOOK_VISION_CHUNKS.db"`. If the database does not exist, it will be created.
2. **Table Creation**: A `chunks` table is created if it does not already exist. The table has columns for `id`, `type`, `text`, `page_number`, and `embedding`. The `embedding` column stores the embedding as a JSON string.
3. **Data Insertion**: For each chunk in the `chunks` list, the `INSERT INTO` SQL statement is executed to store the chunk's data (type, text, page number, and embedding) into the table. The `embedding` is stored as a JSON string using `json.dumps()`.
4. **Commit and Close**: After inserting the data, the changes are committed to the database, and the connection is closed.



# STEP-4
This code demonstrates how to use FAISS (Facebook AI Similarity Search) to index and store text embeddings for efficient similarity search.
### 📌 FAISS Indexing for Text Chunk Retrieval

This script loads text chunk embeddings from an SQLite database (`BOOK_VISION_CHUNKS.db`), converts them into a NumPy array, and indexes them using **FAISS** for efficient similarity search. 
It L2-normalizes the embeddings and indexes them with an **inner-product** index (`IndexFlatIP`), so search scores are cosine similarities, and it maps embeddings to their corresponding chunk IDs. 
The FAISS index is then saved to a file (`BOOK_VISION_FAISS_INDEX.bin`) and reloaded for future queries. 
Finally, it prints the total indexed embeddings and their shape.

### Upload PDF

In [None]:
# Parse the PDF into a list of elements using the "hi_res" strategy.
elements = partition_pdf("BOOK.pdf", strategy="hi_res")

# Build one dictionary per element: its class name, its text, and the page
# number from its metadata ("Unknown" when the metadata has no page number).
chunks = [
    {
        'type': type(element).__name__,
        'text': element.text,
        'page_number': element.metadata.to_dict().get("page_number", "Unknown"),
    }
    for element in elements
]

In [None]:
# Sanity check: report how many chunks were extracted and show at most the first 100.
print("Number of chunks : ", len(chunks))
for preview in chunks[:100]:
    print(preview)

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")

print("Number of chunks ",len(chunks))
# Generate embeddings for all chunks in one batched call instead of one
# model.encode() call per chunk; the model batches internally, which avoids
# the per-call overhead on documents with many chunks.
texts = [chunk["text"] for chunk in chunks]
embeddings = model.encode(texts) if texts else []

# Attach each embedding to its chunk, converted to a plain list so it can be
# serialised (e.g. with json.dumps) later.
for chunk, vector in zip(chunks, embeddings):
    chunk["embedding"] = vector.tolist()

In [None]:
# Connect to SQLite database (or create one if it doesn't exist)
conn = sqlite3.connect("BOOK_VISION_CHUNKS.db")
try:
    cursor = conn.cursor()

    # Create table to store chunks and embeddings
    cursor.execute("""
CREATE TABLE IF NOT EXISTS chunks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    type TEXT,
    text TEXT,
    page_number INTEGER,
    embedding TEXT  -- Store as JSON string
)
""")

    # Insert all chunk rows in one executemany call; embeddings are stored as JSON text.
    # NOTE: the table is only created when missing, so running this cell again
    # appends a second copy of every chunk to the existing table.
    cursor.executemany("""
    INSERT INTO chunks (type, text, page_number, embedding) VALUES (?, ?, ?, ?)
    """, (
        (chunk["type"], chunk["text"], chunk["page_number"], json.dumps(chunk["embedding"]))
        for chunk in chunks
    ))

    conn.commit()
finally:
    # Always release the connection, even when table creation or an insert fails;
    # uncommitted rows are discarded in that case.
    conn.close()

In [None]:
# Read every stored (id, embedding) pair back from SQLite.
connection = sqlite3.connect('BOOK_VISION_CHUNKS.db')  # Reopen the connection if it was closed
cursor = connection.cursor()
rows = cursor.execute("SELECT id, embedding FROM chunks").fetchall()

# Split the rows into parallel lists: database ids and decoded embedding vectors.
ids = [row_id for row_id, _ in rows]
embedding_list = [json.loads(serialized) for _, serialized in rows]  # JSON text -> list of floats

# FAISS works on float32 vectors and int64 ids.
embedding_array = np.array(embedding_list, dtype=np.float32)
ids_array = np.array(ids, dtype=np.int64)  # actual chunk ids from the table

# Normalise in place so inner-product search below behaves as cosine similarity.
faiss.normalize_L2(embedding_array)

# Flat inner-product index, wrapped so that vectors are stored under the chunk ids.
embedding_dimension = embedding_array.shape[1]
index = faiss.IndexFlatIP(embedding_dimension)
index_with_ids = faiss.IndexIDMap(index)
index_with_ids.add_with_ids(embedding_array, ids_array)

# Persist the index, then reload it from disk; `index` now refers to the reloaded index.
faiss.write_index(index_with_ids, "BOOK_VISION_FAISS_INDEX.bin")
index = faiss.read_index("BOOK_VISION_FAISS_INDEX.bin")

print(index)

print(f"FAISS Index Size: {index.ntotal}")

print("Database embedding shape:", embedding_array.shape)

### Search Methods

In [None]:
def filter_chunks_by_word_count(chunks, min_words=5):
    """Return the chunks whose "text" has at least `min_words` whitespace-separated words.

    The input order is preserved and the input list is not modified.
    """
    return [entry for entry in chunks if len(entry["text"].split()) >= min_words]

def filter_chunks_by_type(chunks, exclude_types=("title", "heading")):
    """Return the chunks whose "type" is not listed in `exclude_types`.

    Type names are compared case-insensitively. The chunk types in this file
    are element class names (see where `chunks` is built), so a lowercase
    entry such as "title" would otherwise never match a stored "Title".

    Args:
        chunks: Iterable of dictionaries that each have a "type" key.
        exclude_types: Type names to drop. May be any iterable of strings,
            a single string, or None/empty to exclude nothing. The default is
            an immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        A new list with the remaining chunks in their original order.
    """
    if exclude_types is None:
        exclude_types = ()
    elif isinstance(exclude_types, str):
        # A bare string means one type name, not a collection of characters.
        exclude_types = (exclude_types,)

    excluded = {str(name).casefold() for name in exclude_types}
    return [chunk for chunk in chunks if str(chunk["type"]).casefold() not in excluded]

def sort_chunks_by_length(chunks):
    """Return a new list of chunks ordered from the most words to the fewest.

    Word count is the number of whitespace-separated tokens in each chunk's
    "text". The input is left untouched.
    """
    def word_count(entry):
        return len(entry["text"].split())

    ordered = list(chunks)
    ordered.sort(key=word_count, reverse=True)
    return ordered


def search_similar_chunks(query, top_k=50, min_words=5, exclude_types=("title", "heading")):
    """Retrieve the chunks most similar to `query` via FAISS, then filter them.

    Relies on the module-level `model` (sentence encoder), `index` (FAISS
    index) and `cursor` (open SQLite cursor on the chunks database).

    Args:
        query: Free-text query to embed and search for.
        top_k: Maximum number of nearest neighbours requested from FAISS.
        min_words: Minimum word count a chunk must have to be kept.
        exclude_types: Chunk types to drop (an immutable default is used so
            it cannot be mutated across calls).

    Returns:
        A list of dictionaries with "id", "type", "text" and "page_number",
        in FAISS ranking order, after the word-count and type filters. It can
        contain fewer than `top_k` items.
    """
    # Step 1: Convert query to a contiguous float32 row vector, as FAISS expects
    query_vector = np.ascontiguousarray(model.encode(query), dtype=np.float32).reshape(1, -1)

    # Normalize the query embedding (in place) to match the normalized index vectors
    faiss.normalize_L2(query_vector)

    print("Query embedding shape:", query_vector.shape)

    # Step 2: Use FAISS to find nearest embeddings (scores themselves are not used)
    _, indices = index.search(query_vector, top_k)

    # Step 3: Retrieve corresponding chunks from SQLite.
    # The index was built with add_with_ids(), so the values FAISS returns are
    # the chunk ids from the database themselves. They must be looked up by id,
    # not used as a row OFFSET (which would select the wrong, shifted rows).
    results = []
    for chunk_id in indices[0]:
        if chunk_id == -1:
            continue  # FAISS pads with -1 when fewer than top_k vectors exist

        cursor.execute(
            "SELECT id, type, text, page_number FROM chunks WHERE id=?",
            (int(chunk_id),),  # numpy int64 -> plain int for sqlite3
        )
        row = cursor.fetchone()
        if row:
            results.append({"id": row[0], "type": row[1], "text": row[2], "page_number": row[3]})

    # Step 4: Filter chunks by word count and type
    filtered_results = filter_chunks_by_word_count(results, min_words=min_words)
    filtered_results = filter_chunks_by_type(filtered_results, exclude_types=exclude_types)

    return filtered_results


### Hybrid Search
# Perform hybrid search
def hybrid_search(query, top_k=20, min_words=5, exclude_types=()):
    """Retrieve up to `top_k` chunks by combining FAISS and BM25 search.

    Relies on the module-level `chunks` list (the in-memory corpus) and on
    `search_similar_chunks` for the semantic part.

    Args:
        query: Free-text query.
        top_k: Number of candidates taken from each search and the maximum
            number of chunks returned.
        min_words: Minimum word count a chunk must have to be kept.
        exclude_types: Chunk types to drop; nothing is excluded by default.

    Returns:
        A list of at most `top_k` chunk dictionaries ("id", "type", "text",
        "page_number"), sorted by word count in descending order. For
        FAISS hits "id" is the database id; for BM25-only hits it is the
        position of the chunk in `chunks`.
    """
    # Step 1: Semantic search with FAISS
    faiss_results = search_similar_chunks(query, top_k, min_words=min_words, exclude_types=exclude_types)

    # Step 2: Keyword search with BM25.
    # BM25Okapi needs each document as a list of tokens; given raw strings it
    # would treat every character as a term. Corpus and query are tokenized
    # the same way (lowercased, whitespace split) so that terms can match.
    bm25_results = []
    tokenized_query = query.lower().split()
    tokenized_corpus = [chunk["text"].lower().split() for chunk in chunks]
    if tokenized_query and any(tokenized_corpus):  # BM25 cannot be built on an empty corpus
        bm25 = BM25Okapi(tokenized_corpus)
        bm25_scores = bm25.get_scores(tokenized_query)
        bm25_indices = np.argsort(bm25_scores)[-top_k:][::-1]  # Best-scoring first

        # Step 3: Fetch BM25 results with the same structure as FAISS results
        for idx in bm25_indices:
            if bm25_scores[idx] <= 0:
                continue  # chunk shares no term with the query; not a real hit
            chunk = chunks[idx]
            bm25_results.append({
                "id": int(idx),  # position in `chunks`, not the database id
                "type": chunk["type"],
                "text": chunk["text"],
                "page_number": chunk["page_number"]
            })

    # Step 4: Combine results (FAISS hits first so their database ids are kept)
    combined_results = faiss_results + bm25_results

    # Step 5: Remove duplicates by content. The two sources use different id
    # spaces (database ids vs. list positions), so comparing ids would both
    # miss real duplicates and drop unrelated chunks that share a number.
    unique_results = []
    seen = set()
    for result in combined_results:
        key = (result["text"], result["page_number"])
        if key not in seen:
            seen.add(key)
            unique_results.append(result)

    # Step 6: Filter chunks by word count and type
    filtered_results = filter_chunks_by_word_count(unique_results, min_words=min_words)
    filtered_results = filter_chunks_by_type(filtered_results, exclude_types=exclude_types)

    # Step 7: Sort chunks by length and return at most top_k of them
    sorted_results = sort_chunks_by_length(filtered_results)
    return sorted_results[:top_k]

### Input prompt

In [None]:
def record_audio(duration, sample_rate):
    """Record `duration` seconds of single-channel float32 audio at `sample_rate`.

    Returns the recorded buffer after `np.squeeze`, i.e. with length-one axes removed.
    """
    print("Recording... Speak now!")
    frame_count = int(duration * sample_rate)
    recording = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # called before the buffer is used, so the recording has finished
    print("Recording complete.")
    return np.squeeze(recording)

def get_Voice_Input():
    """Record 8 seconds of audio at 16 kHz and return its Whisper transcription text.

    The speech-recognition pipeline ("openai/whisper-small") is created on
    every call, on CUDA when available and on the CPU otherwise.
    """
    SAMPLE_RATE = 16000
    DURATION = 8

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    whisper_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)

    audio_data = record_audio(DURATION, SAMPLE_RATE)
    payload = {"sampling_rate": SAMPLE_RATE, "raw": audio_data}
    result = whisper_pipeline(payload, generate_kwargs={"language": "en"})
    return result['text']

# Ask how the question will be supplied; anything other than "t" or "v"
# falls back to typed input after a warning.
choice = input("Choose input method - Text (T) or Voice (V): ").strip().lower()
if choice == 'v':
    user_input = get_Voice_Input()
else:
    if choice != 't':
        print("Invalid choice! Defaulting to text input.")
    user_input = input("Enter your text: ")

print("Your input:", user_input)

# The search functions read the module-level `cursor`, so open it before searching.
connection = sqlite3.connect('BOOK_VISION_CHUNKS.db')  # Reopen the connection if it was closed
cursor = connection.cursor()
try:
    similar_chunks = hybrid_search(user_input)

    print("\n🔍 **Top Relevant Chunks:**")
    print(len(similar_chunks))
    for chunk in similar_chunks:
        print(f"📄 ID {chunk['id']} - Page {chunk['page_number']} - {chunk['type']}: {chunk['text']}")
finally:
    # Release the database handles even when the search raises.
    cursor.close()
    connection.close()

In [None]:

def load_openai_key():
    """Return the OpenAI API key stored as `OPEN_AI_SECRET_KEY` in the `my_secrets` module."""
    api_key = my_secrets.OPEN_AI_SECRET_KEY
    return api_key

def call_openai_chat(
    prompt,
    model="gpt-3.5-turbo",
    system_prompt="You are an expert in physics.",
    max_tokens=4000,
    temperature=0.7,
):
    """Send `prompt` to the OpenAI chat completions API and return the reply text.

    Args:
        prompt: Content of the user message.
        model: Chat model name.
        system_prompt: Content of the system message. Defaults to the physics
            persona, but can be overridden for non-physics tasks such as
            LaTeX generation or cleanup.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The content of the first choice's message. If the request fails, the
        exception is not raised; a string of the form
        "An error occurred: <details>" is returned instead, so callers that
        need to detect failure must check for that text.
    """
    api_key = load_openai_key()
    client = openai.OpenAI(api_key=api_key)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"An error occurred: {e}"

def save_to_file(filename, content):
    """Write `content` to `filename` as UTF-8 text, replacing any existing file.

    The encoding is fixed to UTF-8 so that non-ASCII text (symbols, accented
    letters, emoji) is written the same way regardless of the platform's
    default locale encoding.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)

def _strip_code_fences(text):
    """Remove a surrounding Markdown code fence (```lang ... ```) from `text`, if present."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return text
    lines = stripped.splitlines()[1:]  # drop the opening fence line (may carry a language tag)
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]  # drop the closing fence line
    return "\n".join(lines)


def clean_latex_code(latex_content):
    """Ask the chat model to repair `latex_content` and return the cleaned LaTeX.

    After the model replies, a surrounding Markdown code fence is removed
    (it would not compile as LaTeX), and every "\\maketitle", "\\clearpage"
    and "\\newpage" command is deleted from the text.

    Returns:
        The cleaned LaTeX source. On any exception a string of the form
        "An error occurred during LaTeX cleaning: <details>" is returned
        instead of raising.
    """
    try:
        openai.api_key = load_openai_key()
        prompt = (
            "You are an expert in LaTeX. Your task is to clean and fix the given LaTeX code, ensuring it compiles without any errors or warnings.\n\n"
            "### Instructions:\n"
            "- Fix any syntax issues while **preserving the original structure and content**.\n"
            "- Ensure all required LaTeX packages and dependencies are included.\n"
            "- Remove any unnecessary whitespace, extra newlines, or redundant commands.\n"
            "- Do **not** add, remove, or alter content unless necessary for compilation.\n"
            "- Ensure that Beamer presentations have correctly structured frames and do not include unwanted blank pages.\n"
            "- Provide **only the cleaned and corrected LaTeX code**, with no explanations or comments.\n\n"
            "### LaTeX Code to Clean and Fix:\n"
            f"{latex_content}"
        )

        response = call_openai_chat(prompt)
        response = _strip_code_fences(response)

        # Backslashes are escaped explicitly: "\m" and "\c" are not valid
        # string escape sequences and trigger warnings on recent Python versions.
        for command in ("\\maketitle", "\\clearpage", "\\newpage"):
            response = response.replace(command, "")  # Remove title command and page breaks
        return response
    except Exception as e:
        return f"An error occurred during LaTeX cleaning: {e}"


def generate_pdf_from_latex(latex_code, output_filename):
    """Compile `latex_code` with pdflatex and save the resulting PDF as `output_filename`.

    The source is written to "temp_presentation.tex" in the current working
    directory and compiled there. Compilation problems are reported with
    print() rather than raised. Temporary LaTeX files are removed afterwards
    whether or not compilation succeeded.

    Args:
        latex_code: Complete LaTeX document source.
        output_filename: Destination path for the generated PDF; an existing
            file at that path is replaced.
    """
    temp_tex_file = "temp_presentation.tex"
    save_to_file(temp_tex_file, latex_code)

    try:
        # Run pdflatex twice so cross-references and navigation data written
        # during the first pass are picked up by the second.
        for _ in range(2):
            result = subprocess.run(
                ["pdflatex", "-interaction=nonstopmode", temp_tex_file],
                check=True,
                capture_output=True,
                text=True,
                errors="replace",  # tool output may contain bytes that do not decode cleanly
            )
        print("✅ PDFLaTeX Output:\n", result.stdout)

        # Check if the PDF was created
        if os.path.exists("temp_presentation.pdf"):
            # os.replace overwrites an existing destination on every platform.
            os.replace("temp_presentation.pdf", output_filename)
            print(f"✅ PDF successfully generated and saved as {output_filename}.")
        else:
            print("⚠️ PDF was not generated. Checking LaTeX logs.")

    except subprocess.CalledProcessError as e:
        print("🚨 LaTeX Compilation Error:")
        print("STDOUT:\n", e.stdout)  # Print LaTeX standard output
        print("STDERR:\n", e.stderr)  # Print LaTeX error messages
    except Exception as e:
        print(f"❌ General error during PDF generation: {e}")

    finally:
        # Cleanup LaTeX temporary files, including the extra files Beamer writes
        for ext in ["aux", "log", "out", "tex", "nav", "snm", "toc", "vrb"]:
            if os.path.exists(f"temp_presentation.{ext}"):
                os.remove(f"temp_presentation.{ext}")



# Prompt for the answer-generation call. It embeds the user's question
# (`user_input`) and the retrieved chunks (`similar_chunks`); the chunks are
# interpolated with their default string form, i.e. the printed form of the
# list of result dictionaries.
refinement_prompt = (
    "You are a highly knowledgeable physics expert. Your task is to generate a clear, detailed, and well-structured response to my question, ensuring a strong conceptual understanding and a professional, high-quality impression.\n\n"
    f"### My Question:\n{user_input}\n\n"
    "### Instructions:\n"
    "- Carefully analyze the provided reference text and use only **the most relevant parts** that closely relate to the question.\n"
    "- **Ignore any chunks** that are unrelated or do not contribute meaningfully to the response.\n"
    "- Craft a well-structured response that explains the core concept in **simple, precise, and engaging language**.\n"
    "- Use **real-world examples** to illustrate the topic effectively.\n"
    "- Identify and describe any **types, categories, or variations** relevant to the concept.\n"
    "- Incorporate **relevant formulas and equations**, ensuring each variable is clearly defined.\n"
    "- Provide **step-by-step derivations** where necessary to enhance clarity.\n"
    "- Include **key insights, interesting facts, or historical context** to make the explanation more engaging.\n"
    "- Ensure the response is **comprehensive, professional, and insightful** to leave a strong impression.\n"
    "- Conclude with a set of **engaging follow-up questions** based on the explanation, ensuring that each question aligns with a concept already covered in the text.\n\n"
    f"### Reference Text:\n{similar_chunks}\n\n"
    "### Final Output:\n"
    "1. A **refined, structured response** incorporating deep explanations, examples, formulas, and derivations.\n"
    "2. A set of **relevant, thought-provoking tidbit questions** for the student to test their understanding, ensuring each question aligns with a concept already explained in the response."
)



# Generate the answer. call_openai_chat returns an "An error occurred: ..."
# string instead of raising, so `refined_answer` may hold an error message.
refined_answer = call_openai_chat(refinement_prompt)
print("\nGenerated Answer:")
print(refined_answer)

# Save the answer text to disk.
output_filename = "text_response.txt"
save_to_file(output_filename, refined_answer)

print(f"\nRefined answer saved to {output_filename}.")
# Prompt for converting the generated answer into Beamer presentation source.
slide_prompt = (
    "You are an expert in LaTeX and Beamer. Your task is to generate a complete, error-free Beamer presentation based on the given text.\n\n"
    "### Instructions:\n"
    "- Ensure the **LaTeX code is fully compilable** with no errors.\n"
    "- Always include the necessary **\\documentclass{beamer}** and **\\begin{document}** tags.\n"
    "- Structure the presentation into **logically divided frames** for clarity.\n"
    "- Use **visual aids**, such as diagrams, equations, or bullet points, where relevant.\n"
    "- Highlight key points using **bold text, colors, or overlays** where appropriate.\n"
    "- Include **proper slide titles** for each frame to maintain structure.\n"
    "- Ensure mathematical expressions are correctly formatted using **LaTeX math mode**.\n"
    "- Use relevant **blocks (e.g., theorem, definition, example)** where necessary.\n\n"
    "### Content to Convert into Slides:\n"
    f"{refined_answer}\n\n"
    "Generate a complete and structured Beamer LaTeX code that is ready for direct compilation."
)

# Ask the model for the slide source and keep the raw reply on disk.
slide_content = call_openai_chat(slide_prompt)

slide_filename = "presentation.tex"
save_to_file(slide_filename, slide_content)
print(f"\nPresentation slides saved to {slide_filename}.")
# Second model pass that repairs the LaTeX; the cleaned version is saved separately.
cleaned_slide_content = clean_latex_code(slide_content)

cleaned_slide_filename = "cleaned_presentation.tex"
save_to_file(cleaned_slide_filename, cleaned_slide_content)
print(f"\nCleaned slide content saved to {cleaned_slide_filename}.")

# Compile the cleaned LaTeX code into the final PDF.
pdf_filename = "MyResponse.pdf"
generate_pdf_from_latex(cleaned_slide_content, pdf_filename)