In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file and returns it as a string.

    Pages are read in document order. The plain text of every page is
    followed by two newline characters, so page boundaries stay visible
    as blank lines in the combined output.

    :param pdf_path: Path to the PDF file
    :return: Extracted text as a string
    """
    # The context manager closes the document (and its underlying file
    # handle) even when extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Build the result with a single join instead of growing a string
        # page by page.
        return "".join(page.get_text("text") + "\n\n" for page in doc)

# Example usage
# NOTE(review): this is a machine-specific absolute path; point it at your own copy of the PDF before running.
pdf_path = "/mnt/nvme_disk2/User_data/nb57077k/project/2019BurkovTheHundred-pageMachineLearning.pdf"  # Change to your actual PDF file path
extracted_text = extract_text_from_pdf(pdf_path)

# Save to a text file for verification
# The relative file name means the file is written into the kernel's current working directory.
with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(extracted_text)

print("✅ PDF text extraction complete! Check extracted_text.txt for output.")


✅ PDF text extraction complete! Check extracted_text.txt for output.


In [2]:
import re

def chunk_text(text, max_words=100):
    """
    Splits text into chunks while preserving structure.

    - Keeps headings as separate chunks.
    - Ensures meaningful splits by using paragraph structure.

    The text is split on newline characters and blank lines are skipped.
    A line is treated as a heading when it has fewer than 8 words and
    consists only of uppercase ASCII letters and spaces. All other lines
    are packed greedily: the current chunk is closed as soon as adding the
    next line would push it past ``max_words``. Lines are never split, so
    a single line longer than ``max_words`` becomes one oversized chunk.

    As a side effect, prints the number of newline-separated lines found
    in ``text`` (blank ones included).

    :param text: The extracted text (e.g. from a PDF).
    :param max_words: Maximum words per chunk.
    :return: List of non-empty text chunks.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    paragraphs = text.split("\n")
    print(f"Total paragraphs: {len(paragraphs)}")

    for para in paragraphs:
        para = para.strip()
        if not para:  # Skip empty lines
            continue

        words = para.split()

        # Treat headings as separate chunks (assumes headings are short)
        if len(words) < 8 and re.match(r'^[A-Z ]+$', para):
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0
            chunks.append(para)  # Store heading as its own chunk
            continue

        # If adding this line would exceed max_words, save the current chunk.
        # The `current_chunk` guard matters: without it, an over-long line
        # arriving while the buffer is empty (first line, or right after a
        # heading) would emit an empty-string chunk.
        if current_chunk and current_length + len(words) > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Add line to the chunk being built
        current_chunk.append(para)
        current_length += len(words)

    # Save the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Re-load the text saved by the extraction cell. That cell writes
# "extracted_text.txt" relative to the working directory, so read it back from
# the same relative location instead of a machine-specific absolute path;
# otherwise a stale or missing file could be picked up when the kernel runs
# from a different directory.
extracted_text_file = "extracted_text.txt"

# Read the text from the file
with open(extracted_text_file, "r", encoding="utf-8") as file:
    extracted_text = file.read()  # The full text content as one string

# Now, process the text with chunking
chunks = chunk_text(extracted_text, max_words=100)

# Display total chunks
print(f"✅ Total Chunks: {len(chunks)}\n")

# Display the first few chunks
for i, chunk in enumerate(chunks[:5]):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*50}")


Total paragraphs: 6151
✅ Total Chunks: 661

Chunk 1:
The Hundred- Page Machine Learning Book Andriy Burkov “All models are wrong, but some are useful.” — George Box The book is distributed on the “read ﬁrst, buy later” principle. Andriy Burkov The Hundred-Page Machine Learning Book - Draft Preface Let’s start by telling the truth: machines don’t learn. What a typical “learning machine” does, is ﬁnding a mathematical formula, which, when applied to a collection of inputs (called “training data”), produces the desired outputs. This mathematical formula also generates the correct outputs for most other inputs (distinct from the training data) on the condition that
--------------------------------------------------
Chunk 2:
those inputs come from the same or a similar statistical distribution as the one the training data was drawn from. Why isn’t that learning? Because if you slightly distort the inputs, the output is very likely to become completely wrong. It’s not how learning in animals

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load a pre-trained Sentence-BERT model once at notebook level.
# `generate_embeddings` reads this global, and the same object is later passed
# to `query_faiss`, so chunks and queries are encoded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient & fast

def generate_embeddings(chunks):
    """
    Encode text chunks with the notebook-level Sentence-BERT ``model``.

    :param chunks: List of text chunks
    :return: Whatever ``model.encode`` yields when called with
             ``convert_to_numpy=True`` (a NumPy array of embeddings)
    """
    # Relies on the global `model` defined earlier in the notebook.
    return model.encode(chunks, convert_to_numpy=True)



# Generate embeddings for every chunk produced by the chunking cell.
# NOTE(review): later lookups of the form chunks[idx] assume row i of
# chunk_embeddings corresponds to chunks[i] — confirm the encoder preserves input order.
chunk_embeddings = generate_embeddings(chunks)

# Sanity check: one embedding per chunk, and the per-chunk vector size.
print(f"✅ Generated {len(chunk_embeddings)} embeddings!")
print(f"Example embedding shape: {chunk_embeddings[0].shape}")  # Should be (384,)


  from .autonotebook import tqdm as notebook_tqdm
2025-03-10 19:54:43.656771: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 19:54:43.667972: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741616683.681823  539665 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741616683.685909  539665 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 19:54:43.700464: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

✅ Generated 661 embeddings!
Example embedding shape: (384,)


In [4]:
# Display the embedding vector of the first chunk (the whole vector is shown as the cell output).
chunk_embeddings[0]

array([-4.81511764e-02, -1.42902024e-02, -1.35886082e-02, -7.83646386e-03,
        1.54385704e-03,  5.02703227e-02, -4.08427864e-02, -3.52234989e-02,
       -1.31944343e-02,  2.67965458e-02, -1.41763762e-02,  1.09248295e-01,
        1.11532860e-01, -7.28432313e-02, -9.96976420e-02,  7.75088230e-03,
       -9.77185220e-02,  5.15553579e-02, -1.36999916e-02, -9.19218212e-02,
        9.28739086e-03,  3.78383212e-02, -2.86149830e-02,  5.61336316e-02,
       -5.88891022e-02,  2.74046548e-02, -7.71183567e-03, -5.30428486e-03,
        7.60442903e-03, -1.59289353e-02, -4.41234522e-02,  1.03170034e-02,
        3.97144742e-02,  3.88945383e-03, -6.35966659e-02, -1.04297167e-02,
       -2.87223551e-02,  2.33911872e-02,  2.89282706e-02,  5.80605119e-02,
        4.17400673e-02, -6.20848462e-02, -3.35728228e-02,  5.58815077e-02,
        1.35030404e-01,  5.79331890e-02, -5.01126684e-02, -7.08804801e-02,
        1.15623726e-02, -6.14022138e-04, -1.32548496e-01,  3.53116840e-02,
       -2.33557355e-02,  

In [5]:
import faiss

def store_embeddings_faiss(embeddings):
    """
    Stores embeddings in a FAISS index.

    The input is converted to a C-contiguous float32 matrix before it is
    added, because that is the only layout FAISS's Python bindings accept.
    Arrays that already have this layout are used as-is (no copy).

    :param embeddings: 2-D array-like of shape (n_vectors, dim)
    :return: FAISS ``IndexFlatL2`` index containing the vectors
    :raises ValueError: If ``embeddings`` is not two-dimensional
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dim), got shape {vectors.shape}"
        )

    d = vectors.shape[1]  # Get embedding dimension
    # Exact (brute-force) nearest-neighbour search by L2 distance. Ranking by
    # L2 distance matches ranking by cosine similarity only when the vectors
    # are unit-normalised.
    index = faiss.IndexFlatL2(d)
    index.add(vectors)  # Add embeddings to index
    return index

# Store embeddings in FAISS
# `faiss_index` is the notebook-level index that the query cells search against.
faiss_index = store_embeddings_faiss(chunk_embeddings)

print("✅ Embeddings stored in FAISS!")


✅ Embeddings stored in FAISS!


In [6]:
def query_faiss(query_text, model, index, chunks, top_k=2):
    """
    Finds the most relevant text chunks for a given query.

    The query is encoded with ``model``, the index is searched for the
    ``top_k`` nearest vectors, and the returned positions are mapped back
    to entries of ``chunks``.

    :param query_text: User's query
    :param model: Sentence-BERT model (must provide ``encode``)
    :param index: FAISS index (must provide ``search``)
    :param chunks: Original text chunks, in the order their embeddings were
                   added to ``index``
    :param top_k: Maximum number of results to return
    :return: List of retrieved chunks in the order the index returned them;
             may be shorter than ``top_k`` if the index holds fewer vectors
    """
    query_embedding = model.encode([query_text], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)  # Search FAISS index

    # FAISS pads the result with label -1 when it finds fewer than top_k
    # neighbours. Drop those: chunks[-1] would otherwise silently return the
    # last chunk as if it were a real hit.
    return [chunks[idx] for idx in indices[0] if idx >= 0]

# Example query
query = "What is Machine learning?"
# top_k is left at its default of 2, so at most two chunks are retrieved.
retrieved_chunks = query_faiss(query, model, faiss_index, chunks)

# Print the retrieved chunks, numbered from 1 in the order they were returned.
print("\n🔍 Top Retrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks):
    print(f"{i+1}. {chunk}\n")



🔍 Top Retrieved Chunks:
1. which, to be useful, rely on a collection of examples of some phenomenon. These examples can come from nature, be handcrafted by humans or generated by another algorithm. Machine learning can also be deﬁned as the process of solving a practical problem by 1) gathering a dataset, and 2) algorithmically building a statistical model based on that dataset. That statistical model is assumed to be used somehow to solve the practical problem. To save keystrokes, I use the terms “learning” and “machine learning” interchangeably. 1.2 Types of Learning Learning can be supervised, semi-supervised, unsupervised and reinforcement. 1.2.1 Supervised Learning

2. As you can see, just like artiﬁcial intelligence is not intelligence, machine learning is not learning. However, machine learning is a universally recognized term that usually refers to the science and engineering of building machines capable of doing various useful things without being explicitly programmed to do 

In [22]:
import llama_cpp
import os

# Diagnostic only: show the directory the llama_cpp package was imported from.
# NOTE(review): llama_cpp is not used by any other cell visible here (generation goes
# through the Ollama HTTP API) — consider removing this cell.
print("Llama-CPP Installed At:", os.path.dirname(llama_cpp.__file__))


Llama-CPP Installed At: /mnt/nvme_disk2/User_data/nb57077k/miniconda3/envs/neeraj/lib/python3.9/site-packages/llama_cpp


In [29]:
import json
import requests
def query_ollama_rag(query, retrieved_chunks,
                     model_name="llama3.3:70b-instruct-q6_K",
                     url="http://localhost:11434/api/generate",
                     timeout=(10, 600)):
    """
    Sends a retrieval-augmented query to the running LLaMA 3 model via Ollama API.

    The retrieved chunks are joined with blank lines into a context string,
    embedded in a fixed instruction prompt together with the query, and
    POSTed to the Ollama ``generate`` endpoint as a non-streaming request.

    :param query: User's question
    :param retrieved_chunks: Retrieved text chunks from FAISS
    :param model_name: Name of the Ollama model to run
    :param url: URL of the Ollama ``generate`` endpoint
    :param timeout: Timeout passed to ``requests.post``: either a single number
                    or a ``(connect, read)`` tuple, in seconds. Pass ``None`` to
                    wait indefinitely.
    :return: AI-generated structured response (the ``"response"`` field of the
             JSON reply)
    :raises requests.HTTPError: If the server answers with an error status
    :raises KeyError: If the JSON reply has no ``"response"`` field
    """

    # Combine retrieved chunks into a context string
    context = "\n\n".join(retrieved_chunks)

    # Define structured prompt for RAG retrieval
    prompt = f"""
    You are an AI assistant. Answer the following query using the provided context. 
    Ensure that the answer is structured, detailed, and well-explained.

    Query:{query}

    Context: {context}
    """
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }

    # `json=` serialises the payload and sets the JSON Content-Type header.
    # The timeout keeps the cell from hanging forever if the server is down
    # or stalls.
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP failures directly instead of as a confusing
    # KeyError('response') on the error body.
    response.raise_for_status()
    return response.json()["response"]


# Example Usage
# End-to-end run: retrieve the closest chunks for the question, then ask the LLM to answer from them.
# NOTE(review): `query` and `retrieved_chunks` are reassigned here, replacing the values
# set by the earlier retrieval example.
query = "What is Machine Learning?"
retrieved_chunks = query_faiss(query, model, faiss_index, chunks)
structured_answer = query_ollama_rag(query, retrieved_chunks)
print(structured_answer)


**Introduction to Machine Learning**

Machine learning is a subfield of artificial intelligence that involves the development of algorithms and statistical models that enable machines to perform tasks without being explicitly programmed. The term "machine learning" is often used interchangeably with "learning," although it is essential to note that machine learning is not literally learning, but rather a process that draws inspiration from the way animals learn.

**Definition of Machine Learning**
--------------------------------

Machine learning can be defined as the process of solving practical problems by:

1. **Gathering a dataset**: Collecting examples of a phenomenon, which can come from nature, be handcrafted by humans, or generated by another algorithm.
2. **Algorithmically building a statistical model**: Using the collected dataset to build a statistical model that can be used to solve the practical problem.

**Types of Machine Learning**
---------------------------

There ar