!pip install streamlit faiss-cpu numpy sentence-transformers pymupdf openai pickle-mixin


# Generate and store embeddings

In [15]:
import fitz  # PyMuPDF for PDF processing
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter  #chunk overlap,chunk size
import pickle
import re
import time

# Set directory to save embeddings and text chunks
EMBEDDING_DIR = "/home/harish/Agentic_AI/embeddings"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/text_chunks"  # Directory for text chunks
os.makedirs(EMBEDDING_DIR, exist_ok=True)
os.makedirs(TEXT_CHUNKS_DIR, exist_ok=True)

# Load SBERT model
def load_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Instantiate the SentenceTransformer used to embed text.

    Parameters:
    - model_name: Model identifier handed to SentenceTransformer. Defaults to
      "all-MiniLM-L6-v2", so existing zero-argument calls behave as before.

    Returns:
    - The SentenceTransformer instance created for `model_name`.
    """
    return SentenceTransformer(model_name)

# Load the model before running the code
embedding_model = load_embedding_model()

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    """Return the plain text of a PDF, one page after another.

    Parameters:
    - pdf_file: The raw PDF content, passed to `fitz.open` as its `stream`.

    Returns:
    - A single string with each page's text joined by a newline.
    """
    doc = fitz.open(stream=pdf_file, filetype="pdf")
    try:
        return "\n".join(page.get_text("text") for page in doc)
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()

# Function to generate embeddings using SBERT (batch processing)
def get_embeddings_batch(texts):
    """Encode `texts` with the module-level `embedding_model`.

    The encoder is called with a batch size of 16 and its progress bar
    enabled; whatever it returns is handed back unchanged.
    """
    return embedding_model.encode(texts, batch_size=16, show_progress_bar=True)

# Function to improve chunking based on topics and subtopics (headings)
def split_text_by_headings(text):
    """Split text into chunks, starting a new chunk at every heading line.

    A heading is a line that begins with an uppercase letter, continues with
    letters, digits, whitespace or hyphens, and ends with a colon
    (e.g. "Introduction:"). Trailing whitespace on a line, such as a stray
    space or carriage return, is ignored when testing for a heading; the
    line itself is stored in the chunk unchanged.

    Parameters:
    - text: The document text, with lines separated by newline characters.

    Returns:
    - A list of chunk strings, each being the lines of one section joined by
      newlines. Text that precedes the first heading forms its own chunk.
    """
    heading_pattern = re.compile(r"^[A-Z][A-Za-z0-9\s\-]+:$")  # Simple heading pattern (e.g., "Introduction:")
    chunks = []
    current_chunk = []

    for line in text.split('\n'):
        # Match on the right-stripped line so "Heading: " still counts as a heading.
        if heading_pattern.match(line.rstrip()):
            if current_chunk:
                chunks.append("\n".join(current_chunk))
            current_chunk = [line]  # Start new chunk with heading
        else:
            current_chunk.append(line)

    if current_chunk:
        chunks.append("\n".join(current_chunk))  # Add remaining text

    return chunks

# Function to process the PDF files
def process_pdf(file_path):
    """Extract, chunk, embed and persist the text of one PDF.

    The file's bytes are read and converted to text, the text is split at
    headings, and every section is re-split with a
    RecursiveCharacterTextSplitter (chunk_size=500, chunk_overlap=100). The
    resulting pieces are embedded in one batch and saved next to their text.

    Parameters:
    - file_path: Path to the PDF on disk.

    Side effects:
    - Writes "<basename>.npy" (float32 embeddings) into EMBEDDING_DIR.
    - Writes "<basename>_chunks.pkl" (pickled list of chunk strings) into
      TEXT_CHUNKS_DIR.
    - Prints timing and progress messages.
    - When no chunks are produced, prints a notice and writes nothing.
    """
    # Read the PDF
    with open(file_path, 'rb') as f:
        text = extract_text_from_pdf(f.read())

    # Split the text based on headings or topics/subtopics
    chunks = split_text_by_headings(text)

    # The splitter's settings never change, so build it once rather than per section.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_chunks = []
    for chunk in chunks:
        all_chunks.extend(text_splitter.split_text(chunk))

    pdf_name = os.path.basename(file_path)
    if not all_chunks:
        # Nothing to embed: skip saving so no empty embedding/chunk files are left behind.
        print(f"No text chunks extracted from {pdf_name}; nothing saved")
        return

    # Batch processing of embeddings
    start_time = time.time()
    embeddings = get_embeddings_batch(all_chunks)
    print(f"Embedding generation took {time.time() - start_time:.2f} seconds")

    embeddings = np.array(embeddings, dtype=np.float32)

    # Save embeddings to .npy file
    save_path = os.path.join(EMBEDDING_DIR, f"{pdf_name}.npy")
    np.save(save_path, embeddings)
    print(f"Embeddings saved for {pdf_name}")

    # Save text chunks to .pkl file
    text_chunks_path = os.path.join(TEXT_CHUNKS_DIR, f"{pdf_name}_chunks.pkl")
    with open(text_chunks_path, 'wb') as f:
        pickle.dump(all_chunks, f)
    print(f"Text chunks saved for {pdf_name}")

# Process PDF files (adjust the paths to the PDFs)
# Each call below writes "<basename>.npy" into EMBEDDING_DIR and
# "<basename>_chunks.pkl" into TEXT_CHUNKS_DIR for the given PDF.
pdf_files = ["/home/harish/Agentic_AI/books/Current_Essentials_of_Medicine.pdf", "/home/harish/Agentic_AI/books/MedicalDiagnosis_and_Treatment_Methods_in_Basic_Medical_Sciences.pdf"]  # Example PDF file paths
for pdf_file in pdf_files:
    process_pdf(pdf_file)


Batches: 100%|██████████| 162/162 [02:47<00:00,  1.03s/it]


Embedding generation took 167.76 seconds
Embeddings saved for Current_Essentials_of_Medicine.pdf
Text chunks saved for Current_Essentials_of_Medicine.pdf


Batches: 100%|██████████| 51/51 [00:53<00:00,  1.06s/it]

Embedding generation took 54.02 seconds
Embeddings saved for MedicalDiagnosis_and_Treatment_Methods_in_Basic_Medical_Sciences.pdf
Text chunks saved for MedicalDiagnosis_and_Treatment_Methods_in_Basic_Medical_Sciences.pdf





In [16]:
!pip install streamlit faiss-cpu numpy sentence-transformers pymupdf openai pickle-mixin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
import os
import numpy as np
import faiss

# Directory where embeddings are stored
EMBEDDING_DIR = "/home/harish/Agentic_AI/embeddings"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/text_chunks"  # Directory for text chunks

def load_embeddings():
    """
    Loads stored embeddings (.npy) and their text chunks (.pkl) from disk.

    Every "<name>.npy" file in EMBEDDING_DIR is paired with
    "<name>_chunks.pkl" in TEXT_CHUNKS_DIR. A pair is used only when both
    files load and contain the same number of entries, so row i of the
    returned array always corresponds to text_chunks[i].

    Returns:
    - embeddings: All usable embedding arrays stacked vertically.
    - text_chunks: The text chunks, in the same order as the rows of `embeddings`.

    Raises:
    - FileNotFoundError: If EMBEDDING_DIR contains no `.npy` files.
    - ValueError: If no embedding/chunk pair could be loaded.
    """
    # Imported here because this cell does not import pickle itself.
    import pickle

    embeddings_list = []
    text_chunks = []  # This will hold the corresponding text chunks

    # Sorted so the row order does not depend on the directory listing order.
    files_found = sorted(f for f in os.listdir(EMBEDDING_DIR) if f.endswith(".npy"))

    if not files_found:
        raise FileNotFoundError("⚠️ No `.npy` embedding files found in the specified directory!")

    for file in files_found:
        file_path = os.path.join(EMBEDDING_DIR, file)

        # Swap only the trailing ".npy" for the chunk-file suffix.
        text_file = file[:-len(".npy")] + "_chunks.pkl"
        text_path = os.path.join(TEXT_CHUNKS_DIR, text_file)

        try:
            embed = np.load(file_path)

            # NOTE: pickle.load can execute arbitrary code; only load chunk
            # files that this notebook produced.
            with open(text_path, "rb") as f:
                texts = pickle.load(f)

            if len(embed) != len(texts):
                raise ValueError(f"{len(embed)} embeddings but {len(texts)} text chunks")
        except Exception as e:
            print(f"❌ Error loading {file}: {e}")
            continue  # Skip this file if there's an issue

        # Commit both halves together so vectors and texts cannot drift apart.
        embeddings_list.append(embed)
        text_chunks.extend(texts)

    if not embeddings_list:
        raise ValueError("❌ No valid embeddings found. Check your `.npy` files!")

    embeddings = np.vstack(embeddings_list)  # Stack embeddings into a single numpy array
    return embeddings, text_chunks

def create_faiss_index(embeddings):
    """
    Builds a flat L2 FAISS index over the given embedding matrix.

    The index dimensionality is taken from the second axis of `embeddings`,
    and all rows are added before the index is returned.
    """
    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(embeddings)
    return flat_index

# Load embeddings and create FAISS index
# NOTE: any failure is only printed, not re-raised. After an error this cell
# has not assigned `embeddings`, `text_chunks` or `faiss_index`.
try:
    embeddings, text_chunks = load_embeddings()
    faiss_index = create_faiss_index(embeddings)
    print("✅ FAISS index created with", embeddings.shape[0], "entries.")
except Exception as e:
    print(f"❌ Error: {e}")


✅ FAISS index created with 3397 entries.


In [18]:
def search_faiss_index(query, faiss_index, embeddings, text_chunks, k=5):
    """
    Search the FAISS index for the text chunks most similar to the query.

    Parameters:
    - query: The query string to search for
    - faiss_index: The FAISS index object
    - embeddings: Not used by the search; kept so existing callers keep working
    - text_chunks: Text chunks, positioned to match the vectors in the index
    - k: The number of nearest neighbors to request

    Returns:
    - top_k_chunks: The retrieved text chunks, nearest first
    - distances: The distance reported for each returned chunk

    Fewer than k results are returned when the index cannot supply k neighbors.
    """
    # Embed the query with the same module-level model used for the chunks.
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)

    # Perform the search on the FAISS index
    distances, indices = faiss_index.search(query_embedding, k)

    # FAISS fills missing neighbors with index -1. Indexing the list with -1
    # would silently return the last chunk, so those slots are dropped.
    found = indices[0] >= 0
    top_k_chunks = [text_chunks[i] for i in indices[0][found]]

    return top_k_chunks, distances[0][found]

# Example query search
query = "What are the treatments for diabetes?"
# k is left at the function's default, so up to five chunks come back.
top_k_chunks, distances = search_faiss_index(query, faiss_index, embeddings, text_chunks)

# Display results: rank, raw distance, then the unformatted chunk text.
for idx, (chunk, dist) in enumerate(zip(top_k_chunks, distances)):
    print(f"Rank {idx + 1}:")
    print(f"Distance: {dist}")
    print(chunk)
    print("\n" + "-"*50 + "\n")


Rank 1:
Distance: 0.8545056581497192
and cardiovascular disease
■Differential Diagnosis
•
Nondiabetic glycosuria (eg, Fanconi’s syndrome)
•
Diabetes insipidus
•
Acromegaly
•
Cushing’s disease or syndrome
•
Pheochromocytoma
•
Medications (eg, glucocorticoids, niacin)
■Treatment
•
Insulin treatment is required
•
Patient education is crucial, emphasizing dietary management,
intensive insulin therapy, self-monitoring of blood glucose, hypo-
glycemia awareness, foot and eye care
■Pearl

--------------------------------------------------

Rank 2:
Distance: 0.9506320357322693
•
Cushing’s disease or syndrome
•
Pheochromocytoma
•
Medications (eg, glucocorticoids, niacin)
•
Severe insulin resistance syndromes
•
Altered mental status due to other cause
■Treatment
•
Patient education is important, emphasizing dietary management,
exercise, weight loss, self-monitoring of blood glucose, hypo-
glycemia awareness, foot and eye care
•
Mild cases may be controlled initially with diet, exercise, and
weig

In [19]:
def display_search_results(top_k_chunks, distances, max_length=500):
    """
    Print each retrieved chunk as a ranked, cleaned and length-limited excerpt.

    Parameters:
    - top_k_chunks: List of the most similar text chunks
    - distances: The distance belonging to each chunk, in the same order
    - max_length: Number of characters kept before "..." is appended
    """
    separator = "\n" + "-"*50 + "\n"

    for rank, (raw_chunk, dist) in enumerate(zip(top_k_chunks, distances), start=1):
        excerpt = clean_text_formatting(raw_chunk)

        # Only excerpts longer than the limit are cut and marked with an ellipsis.
        if len(excerpt) > max_length:
            excerpt = excerpt[:max_length] + "..."

        print(f"Rank {rank}:")
        print(f"Distance: {dist:.4f}")
        print(f"Excerpt: {excerpt}")
        print(separator)

def clean_text_formatting(text):
    """
    Strip list markers from a chunk and flatten it to a single line.

    The "■" and "•" symbols are removed, newlines become spaces, the ends are
    trimmed, and every run of whitespace is collapsed to one space.
    """
    marker_table = {ord("■"): None, ord("•"): None, ord("\n"): " "}
    flattened = text.translate(marker_table).strip()
    return re.sub(r'\s+', ' ', flattened)


In [20]:
# Example query search (already done)
query = "What are the treatments for diabetes?"
# Same search as before; only the presentation of the results changes.
top_k_chunks, distances = search_faiss_index(query, faiss_index, embeddings, text_chunks)

# Display results with improved formatting
display_search_results(top_k_chunks, distances)


Rank 1:
Distance: 0.8545
Excerpt: and cardiovascular disease Differential Diagnosis Nondiabetic glycosuria (eg, Fanconi’s syndrome) Diabetes insipidus Acromegaly Cushing’s disease or syndrome Pheochromocytoma Medications (eg, glucocorticoids, niacin) Treatment Insulin treatment is required Patient education is crucial, emphasizing dietary management, intensive insulin therapy, self-monitoring of blood glucose, hypo- glycemia awareness, foot and eye care Pearl

--------------------------------------------------

Rank 2:
Distance: 0.9506
Excerpt: Cushing’s disease or syndrome Pheochromocytoma Medications (eg, glucocorticoids, niacin) Severe insulin resistance syndromes Altered mental status due to other cause Treatment Patient education is important, emphasizing dietary management, exercise, weight loss, self-monitoring of blood glucose, hypo- glycemia awareness, foot and eye care Mild cases may be controlled initially with diet, exercise, and weight loss

-------------------------------

In [24]:
def search_faiss_index(query, faiss_index, embeddings, text_chunks, top_k=5):
    """
    Retrieves the top-k most relevant text chunks based on the query using FAISS.

    Parameters:
    - query: The user's query.
    - faiss_index: The FAISS index to search for relevant embeddings.
    - embeddings: Not used by the search; kept so existing callers keep working.
    - text_chunks: The list of text chunks, positioned to match the index.
    - top_k: The number of top relevant chunks to request.

    Returns:
    - top_k_chunks: The most relevant text chunks, nearest first.
    - distances: The distance reported for each returned chunk.

    Fewer than top_k results are returned when the index cannot supply that many.
    """
    # Convert query to embedding with the module-level model
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)

    # Perform search in FAISS index
    distances, indices = faiss_index.search(query_embedding, top_k)

    # FAISS fills missing neighbors with index -1. Indexing the list with -1
    # would silently return the last chunk, so those slots are dropped.
    found = indices[0] >= 0
    top_k_chunks = [text_chunks[i] for i in indices[0][found]]

    return top_k_chunks, distances[0][found]


In [34]:
!pip install langchain
!pip install langchain-google-genai
!pip install langchain-community
!pip install
# pip install faiss-cpu  # or faiss-gpu if you're using GPU
# pip install pandas
!pip install python-dotenv
# pip install streamlit
# pip install PyPDF2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.9-py3-none-any.whl.metadata (3.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-generativeai<0.9.0,>=0.8.0 (from langchain-google-genai)
  Downloading google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai<0.9.0,>=0.8.0->langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai<0.9.0,>=0.8.0->langchain-google-genai)
  Downloading google_api_core-2.24.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai<0.9.0,>=0.8.0->langchain-google-genai)
  Downloading google_api_python_client-2.160.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth>=2.15.0 (from google-generativeai<

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-community
  Downloading langchain_community-0.3.16-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [46]:
import os
import numpy as np
import faiss
import pickle
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Directory for embeddings and text chunks
EMBEDDING_DIR = "/home/harish/Agentic_AI/embeddings"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/text_chunks"

# Load environment variables (API keys, etc.)
load_dotenv()
# os.getenv returns None when GROQ_API_KEY is not set, and that value is
# passed straight to ChatGroq below.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Initialize ChatGroq LLM
llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="Llama3-8b-8192", temperature=0)

def load_embeddings():
    """
    Loads stored embeddings (.npy) and text chunks from their respective directories.

    Every "<name>.npy" file in EMBEDDING_DIR is paired with
    "<name>_chunks.pkl" in TEXT_CHUNKS_DIR. A pair is used only when both
    files load and contain the same number of entries, so row i of the
    returned array always corresponds to text_chunks[i].

    Returns:
    - embeddings: All usable embedding arrays stacked vertically.
    - text_chunks: The text chunks, in the same order as the rows of `embeddings`.

    Raises:
    - FileNotFoundError: If EMBEDDING_DIR contains no `.npy` files.
    - ValueError: If no embedding/chunk pair could be loaded.
    """
    embeddings_list = []
    text_chunks = []

    # Sorted so the row order does not depend on the directory listing order.
    files_found = sorted(f for f in os.listdir(EMBEDDING_DIR) if f.endswith(".npy"))
    if not files_found:
        raise FileNotFoundError("⚠️ No `.npy` embedding files found!")

    for file in files_found:
        file_path = os.path.join(EMBEDDING_DIR, file)
        # Swap only the trailing ".npy" for the chunk-file suffix.
        text_file = file[:-len(".npy")] + "_chunks.pkl"
        text_path = os.path.join(TEXT_CHUNKS_DIR, text_file)

        try:
            embed = np.load(file_path)

            # NOTE: pickle.load can execute arbitrary code; only load chunk
            # files that this notebook produced.
            with open(text_path, "rb") as f:
                texts = pickle.load(f)

            if len(embed) != len(texts):
                raise ValueError(f"{len(embed)} embeddings but {len(texts)} text chunks")
        except Exception as e:
            print(f"❌ Error loading {file}: {e}")
            continue

        # Commit both halves together so vectors and texts cannot drift apart.
        embeddings_list.append(embed)
        text_chunks.extend(texts)

    if not embeddings_list:
        raise ValueError("❌ No valid embeddings found.")

    embeddings = np.vstack(embeddings_list)  # Stack embeddings
    return embeddings, text_chunks

def create_faiss_index(embeddings):
    """
    Builds a flat L2 FAISS index over the given embedding matrix.

    The index dimensionality is taken from the second axis of `embeddings`,
    and all rows are added before the index is returned.
    """
    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(embeddings)
    return flat_index

# Load embeddings and create FAISS index
# NOTE: any failure is only printed, not re-raised. After an error this cell
# has not assigned `embeddings`, `text_chunks` or `faiss_index`.
try:
    embeddings, text_chunks = load_embeddings()
    faiss_index = create_faiss_index(embeddings)
    print("✅ FAISS index created with", embeddings.shape[0], "entries.")
except Exception as e:
    print(f"❌ Error: {e}")

from sentence_transformers import SentenceTransformer

# Initialize your embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use the model that fits your use case

def search_faiss_index(query, faiss_index, embeddings, text_chunks, k=5):
    """
    Perform a search on the FAISS index for the chunks most similar to the query.

    Parameters:
    - query: The query string to search for
    - faiss_index: The FAISS index object
    - embeddings: Not used by the search; kept so existing callers keep working
    - text_chunks: Text chunks, positioned to match the vectors in the index
    - k: The number of nearest neighbors to request

    Returns:
    - top_k_chunks: The retrieved text chunks, nearest first
    - distances: The distance reported for each returned chunk

    Fewer than k results are returned when the index cannot supply k neighbors.
    """
    # Embed the query with the module-level embedding model
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)

    # Perform the FAISS search
    distances, indices = faiss_index.search(query_embedding, k)

    # FAISS fills missing neighbors with index -1. Indexing the list with -1
    # would silently return the last chunk, so those slots are dropped.
    found = indices[0] >= 0
    top_k_chunks = [text_chunks[i] for i in indices[0][found]]

    return top_k_chunks, distances[0][found]


def create_chatgroq_prompt(top_k_chunks, query):
    """
    Builds the prompt text sent to ChatGroq from the retrieved chunks.

    The chunks are joined with newlines and placed inside the <context> tags
    of the template; the user's question fills the `input` slot.
    """
    template = ChatPromptTemplate.from_template(
        """
        Answer the following question based on the provided context:
        <context>
        {context}
        </context>
        Question: {input}
        """
    )
    return template.format(context="\n".join(top_k_chunks), input=query)

def query_chatgroq_with_context(query, faiss_index, embeddings, text_chunks, top_k=5):
    """
    Answer a query with ChatGroq, grounded in chunks retrieved from FAISS.

    The top_k nearest chunks are fetched, turned into a prompt, and sent to
    the module-level `llm`. The raw response and its type are printed for
    debugging, and the response's `content` attribute is returned.
    """
    # Retrieval: the distances are not needed for building the prompt.
    retrieved_chunks, _ = search_faiss_index(query, faiss_index, embeddings, text_chunks, k=top_k)

    # Generation: build the prompt and invoke the model in one step.
    llm_response = llm.invoke(create_chatgroq_prompt(retrieved_chunks, query))

    print(f"Response from ChatGroq: {llm_response}")
    print(f"Response type: {type(llm_response)}")

    return llm_response.content

# Example query
query = "what do you know about Acute Liver Failure"

# Query ChatGroq with relevant context from FAISS
# (the call also prints the raw response object and its type before returning).
answer = query_chatgroq_with_context(query, faiss_index, embeddings, text_chunks, top_k=5)

print(f"Answer from ChatGroq: {answer}")

# unstructured.io

✅ FAISS index created with 3397 entries.
Response from ChatGroq: content="Based on the provided context, here's what I know about Acute Liver Failure:\n\n* Acute Liver Failure is a severe liver injury that occurs in a person with previously normal liver function, associated with the development of hepatic encephalopathy and evidence of hepatic synthetic dysfunction.\n* Patients often present with new-onset jaundice, anorexia, nausea, vomiting, flu-like symptoms, or altered mental status.\n* The etiologies of Acute Liver Failure include:\n\t+ Acetaminophen overdose\n\t+ Idiosyncratic drug reaction\n\t+ Acute viral hepatitis\n\t+ Exposure to hepatotoxins\n\t+ Autoimmune hepatitis\n\t+ Wilson's disease\n\t+ Complications of pregnancy\n\t+ Vascular disorders\n* The diagnosis of Acute Liver Failure is based on the presence of severe liver injury, hepatic encephalopathy, and evidence of hepatic synthetic dysfunction.\n* The treatment of Acute Liver Failure involves:\n\t+ Prompt recognition o

In [4]:
pip install --upgrade unstructured


Note: you may need to restart the kernel to use updated packages.


In [19]:
!pip install unstructured[local-inference]

Collecting unstructured.pytesseract>=0.3.12 (from unstructured[local-inference])
  Downloading unstructured.pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting python-pptx>=1.0.1 (from unstructured[local-inference])
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting unstructured-inference>=0.8.6 (from unstructured[local-inference])
  Downloading unstructured_inference-0.8.7-py3-none-any.whl.metadata (5.3 kB)
Collecting google-cloud-vision (from unstructured[local-inference])
  Downloading google_cloud_vision-3.9.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting effdet (from unstructured[local-inference])
  Downloading effdet-0.4.1-py3-none-any.whl.metadata (33 kB)
Collecting openpyxl (from unstructured[local-inference])
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting markdown (from unstructured[local-inference])
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting pdf2image (from unstructured[lo

In [20]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from unstructured.partition.pdf import partition_pdf  # Use partition_pdf for PDF processing
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import time

# Set directory to save embeddings and text chunks
EMBEDDING_DIR = "/home/harish/Agentic_AI/embeddings"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/text_chunks"  # Directory for text chunks
os.makedirs(EMBEDDING_DIR, exist_ok=True)
os.makedirs(TEXT_CHUNKS_DIR, exist_ok=True)

# Load SBERT model
def load_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Instantiate the SentenceTransformer used to embed text.

    Parameters:
    - model_name: Model identifier handed to SentenceTransformer. Defaults to
      "all-MiniLM-L6-v2", so existing zero-argument calls behave as before.

    Returns:
    - The SentenceTransformer instance created for `model_name`.
    """
    return SentenceTransformer(model_name)

# Load the model before running the code
embedding_model = load_embedding_model()

# Function to extract text from PDFs using partition_pdf
def extract_text_from_pdf(pdf_file):
    """Return the text of every element that `partition_pdf` finds in a PDF.

    The PDF is partitioned with the "fast" strategy. Elements without a
    `text` attribute are skipped; the remaining texts are returned as a list
    in element order.
    """
    texts = []
    for element in partition_pdf(pdf_file, strategy="fast"):
        if hasattr(element, 'text'):
            texts.append(element.text)
    return texts

# Function to generate embeddings using SBERT (batch processing)
def get_embeddings_batch(texts):
    """Encode `texts` with the module-level `embedding_model`.

    The encoder is called with a batch size of 16 and its progress bar
    enabled; whatever it returns is handed back unchanged.
    """
    return embedding_model.encode(texts, batch_size=16, show_progress_bar=True)

# Function to process the PDF files
def process_pdf(file_path):
    """Extract, chunk, embed and persist the text of one PDF.

    The element texts returned by `extract_text_from_pdf` are each re-split
    with a RecursiveCharacterTextSplitter (chunk_size=500, chunk_overlap=100).
    The resulting pieces are embedded in one batch and saved next to their text.

    Parameters:
    - file_path: Path to the PDF on disk.

    Side effects:
    - Writes "<basename>.npy" (float32 embeddings) into EMBEDDING_DIR.
    - Writes "<basename>_chunks.pkl" (pickled list of chunk strings) into
      TEXT_CHUNKS_DIR.
    - Prints timing and progress messages.
    - When no chunks are produced, prints a notice and writes nothing.
    """
    # Read the PDF using partition_pdf
    document = extract_text_from_pdf(file_path)

    # If chunks are too large, further split them using RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_chunks = []
    for chunk in document:
        all_chunks.extend(text_splitter.split_text(chunk))

    pdf_name = os.path.basename(file_path)
    if not all_chunks:
        # Nothing to embed: skip saving so no empty embedding/chunk files are left behind.
        print(f"No text chunks extracted from {pdf_name}; nothing saved")
        return

    # Batch processing of embeddings
    start_time = time.time()
    embeddings = get_embeddings_batch(all_chunks)
    print(f"Embedding generation took {time.time() - start_time:.2f} seconds")

    embeddings = np.array(embeddings, dtype=np.float32)

    # Save embeddings to .npy file
    save_path = os.path.join(EMBEDDING_DIR, f"{pdf_name}.npy")
    np.save(save_path, embeddings)
    print(f"Embeddings saved for {pdf_name}")

    # Save text chunks to .pkl file
    text_chunks_path = os.path.join(TEXT_CHUNKS_DIR, f"{pdf_name}_chunks.pkl")
    with open(text_chunks_path, 'wb') as f:
        pickle.dump(all_chunks, f)
    print(f"Text chunks saved for {pdf_name}")

# Process PDF files (adjust the paths to the PDFs)
# The output file names depend only on the PDF's basename, so this run writes
# "<basename>.npy" and "<basename>_chunks.pkl" for the PDF listed here.
pdf_files = ["/home/harish/Agentic_AI/books/Current_Essentials_of_Medicine.pdf"]  # Example PDF file paths
for pdf_file in pdf_files:
    process_pdf(pdf_file)


Batches: 100%|██████████| 697/697 [12:29<00:00,  1.08s/it]


Embedding generation took 750.21 seconds
Embeddings saved for Current_Essentials_of_Medicine.pdf
Text chunks saved for Current_Essentials_of_Medicine.pdf


In [23]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from unstructured.partition.pdf import partition_pdf  # Use partition_pdf for PDF processing
from unstructured.chunking.title import chunk_by_title  # Import chunking strategy (by title or by similarity)
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import time

# Set directory to save embeddings and text chunks
EMBEDDING_DIR = "/home/harish/Agentic_AI/embeddings"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/text_chunks"  # Directory for text chunks
os.makedirs(EMBEDDING_DIR, exist_ok=True)
os.makedirs(TEXT_CHUNKS_DIR, exist_ok=True)

# Load SBERT model
def load_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Instantiate the SentenceTransformer used to embed text.

    Parameters:
    - model_name: Model identifier handed to SentenceTransformer. Defaults to
      "all-MiniLM-L6-v2", so existing zero-argument calls behave as before.

    Returns:
    - The SentenceTransformer instance created for `model_name`.
    """
    return SentenceTransformer(model_name)

# Load the model before running the code
embedding_model = load_embedding_model()

# Function to extract text from PDFs using partition_pdf
def extract_text_from_pdf(pdf_file):
    """Return the text of every element that `partition_pdf` finds in a PDF.

    The PDF is partitioned with the "fast" strategy. Elements without a
    `text` attribute are skipped; the remaining texts are returned as a list
    in element order.
    """
    texts = []
    for element in partition_pdf(pdf_file, strategy="fast"):
        if hasattr(element, 'text'):
            texts.append(element.text)
    return texts

# Function to chunk text based on title or similarity
def chunk_text(elements, strategy="by_title"):
    """Group partitioned document elements into chunks.

    Parameters:
    - elements: The elements produced by partitioning a document.
    - strategy: "by_title" (default) chunks with `chunk_by_title`;
      "basic" chunks with `chunk_elements` using its default settings.

    Returns:
    - The chunks returned by the selected chunker.

    Raises:
    - NotImplementedError: For "by_similarity". `chunk_elements` takes no
      `strategy` or `similarity_threshold` arguments, so the previous call
      failed with a TypeError. Per the Unstructured documentation,
      similarity-based chunking is offered through the hosted API only.
    - ValueError: For any other strategy name.
    """
    if strategy == "by_title":
        from unstructured.chunking.title import chunk_by_title
        return chunk_by_title(elements)

    if strategy == "basic":
        from unstructured.chunking.basic import chunk_elements
        return chunk_elements(elements)

    if strategy == "by_similarity":
        raise NotImplementedError(
            "'by_similarity' chunking is not available from the local unstructured "
            "chunkers; use 'by_title' or 'basic' instead."
        )

    raise ValueError("Unknown chunking strategy. Choose 'by_title' or 'basic'.")

# Function to generate embeddings using SBERT (batch processing)
def get_embeddings_batch(texts):
    """Encode `texts` with the module-level `embedding_model`.

    The encoder is called with a batch size of 16 and its progress bar
    enabled; whatever it returns is handed back unchanged.
    """
    return embedding_model.encode(texts, batch_size=16, show_progress_bar=True)

# Function to process the PDF files
def process_pdf(file_path, chunking_strategy="by_title"):
    """Partition, chunk, embed and persist the text of one PDF.

    The PDF is partitioned with the "fast" strategy and the elements are
    grouped by `chunk_text`. Each group's text is re-split with a
    RecursiveCharacterTextSplitter (chunk_size=500, chunk_overlap=100), and
    the pieces are embedded in one batch and saved next to their text.

    Parameters:
    - file_path: Path to the PDF on disk.
    - chunking_strategy: Strategy name forwarded to `chunk_text`.

    Side effects:
    - Writes "<basename>.npy" (float32 embeddings) into EMBEDDING_DIR.
    - Writes "<basename>_chunks.pkl" (pickled list of chunk strings) into
      TEXT_CHUNKS_DIR.
    - Prints timing and progress messages.
    - When no chunks are produced, prints a notice and writes nothing.
    """
    # Read the PDF using partition_pdf
    elements = partition_pdf(file_path, strategy="fast")  # Use partition_pdf to extract text

    # Chunk the extracted elements with the requested strategy
    chunks = chunk_text(elements, strategy=chunking_strategy)

    # If chunks are too large, further split them using RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_chunks = []
    for chunk in chunks:
        all_chunks.extend(text_splitter.split_text(chunk.text))

    pdf_name = os.path.basename(file_path)
    if not all_chunks:
        # Nothing to embed: skip saving so no empty embedding/chunk files are left behind.
        print(f"No text chunks extracted from {pdf_name}; nothing saved")
        return

    # Batch processing of embeddings
    start_time = time.time()
    embeddings = get_embeddings_batch(all_chunks)
    print(f"Embedding generation took {time.time() - start_time:.2f} seconds")

    embeddings = np.array(embeddings, dtype=np.float32)

    # Save embeddings to .npy file
    save_path = os.path.join(EMBEDDING_DIR, f"{pdf_name}.npy")
    np.save(save_path, embeddings)
    print(f"Embeddings saved for {pdf_name}")

    # Save text chunks to .pkl file
    text_chunks_path = os.path.join(TEXT_CHUNKS_DIR, f"{pdf_name}_chunks.pkl")
    with open(text_chunks_path, 'wb') as f:
        pickle.dump(all_chunks, f)
    print(f"Text chunks saved for {pdf_name}")

# Process PDF files (adjust the paths to the PDFs)
# The output file names depend only on the PDF's basename, so this run writes
# "<basename>.npy" and "<basename>_chunks.pkl" for the PDF listed here.
pdf_files = ["/home/harish/Agentic_AI/books/Current_Essentials_of_Medicine.pdf"]  # Example PDF file paths
for pdf_file in pdf_files:
    process_pdf(pdf_file, chunking_strategy="by_title")  # Strategy name is forwarded to chunk_text


Batches: 100%|██████████| 167/167 [02:45<00:00,  1.01it/s]


Embedding generation took 165.52 seconds
Embeddings saved for Current_Essentials_of_Medicine.pdf
Text chunks saved for Current_Essentials_of_Medicine.pdf
