In [None]:
#extract and store embedding

In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from unstructured.partition.pdf import partition_pdf  # Use partition_pdf for PDF processing
from unstructured.chunking.title import chunk_by_title  # Import chunking strategy (by title or by similarity)
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import time

# Set directory to save embeddings and text chunks
EMBEDDING_DIR = "/home/harish/Agentic_AI/embeddings"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/text_chunks"  # Directory for text chunks
os.makedirs(EMBEDDING_DIR, exist_ok=True)
os.makedirs(TEXT_CHUNKS_DIR, exist_ok=True)

# Load SBERT model
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")

# Load the model before running the code
embedding_model = load_embedding_model()

# Function to extract text from PDFs using partition_pdf
def extract_text_from_pdf(pdf_file):
    # Partition the PDF document using partition_pdf (with "fast" strategy for extractable text)
    elements = partition_pdf(pdf_file, strategy="fast")  # Adjust strategy if needed ("hi_res", "ocr_only")
    
    # Extract the text content from the elements
    document_text = [element.text for element in elements if hasattr(element, 'text')]
    return document_text

# Function to chunk text based on title or similarity
def chunk_text(elements, strategy="by_title"):
    # Choose the chunking strategy (by title or by similarity)
    if strategy == "by_title":
        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements)
    elif strategy == "by_similarity":
        from unstructured.chunking.basic import chunk_elements
        chunks = chunk_elements(elements, strategy="by_similarity", similarity_threshold=0.7)
    else:
        raise ValueError("Unknown chunking strategy. Choose 'by_title' or 'by_similarity'.")
    
    return chunks

# Function to generate embeddings using SBERT (batch processing)
def get_embeddings_batch(texts):
    embeddings = embedding_model.encode(texts, show_progress_bar=True, batch_size=16)  # Batch processing
    return embeddings

# Function to process the PDF files
def process_pdf(file_path, chunking_strategy="by_title"):
    # Read the PDF using partition_pdf
    elements = partition_pdf(file_path, strategy="fast")  # Use partition_pdf to extract text
    
    # Chunk the extracted elements by title or similarity
    chunks = chunk_text(elements, strategy=chunking_strategy)
    
    # If chunks are too large, further split them using RecursiveCharacterTextSplitter
    all_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    for chunk in chunks:
        sub_chunks = text_splitter.split_text(chunk.text)
        all_chunks.extend(sub_chunks)
    
    # Batch processing of embeddings
    start_time = time.time()
    embeddings = get_embeddings_batch(all_chunks)
    print(f"Embedding generation took {time.time() - start_time:.2f} seconds")
    
    embeddings = np.array(embeddings, dtype=np.float32)
    
    # Save embeddings to .npy file
    save_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(file_path)}.npy")
    np.save(save_path, embeddings)
    print(f"Embeddings saved for {os.path.basename(file_path)}")
    
    # Save text chunks to .pkl file
    text_chunks_path = os.path.join(TEXT_CHUNKS_DIR, f"{os.path.basename(file_path)}_chunks.pkl")
    with open(text_chunks_path, 'wb') as f:
        pickle.dump(all_chunks, f)
    print(f"Text chunks saved for {os.path.basename(file_path)}")

# Process PDF files (adjust the paths to the PDFs)
pdf_files = ["/home/harish/Agentic_AI/books/Current_Essentials_of_Medicine.pdf"]  # Example PDF file paths
for pdf_file in pdf_files:
    process_pdf(pdf_file, chunking_strategy="by_title")  # Use either chunking_strategy'by_title' or 'by_similarity'


#search and enhancement via llm

In [5]:
import os
import numpy as np
import faiss
import pickle
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Directory for embeddings and text chunks
EMBEDDING_DIR = "/home/harish/Agentic_AI/embedding_chunk_bytitle"
TEXT_CHUNKS_DIR = "/home/harish/Agentic_AI/chunked_be_title"

# Load environment variables (API keys, etc.)
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Initialize ChatGroq LLM
llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="Llama3-8b-8192", temperature=0)

def load_embeddings():
    """
    Loads stored embeddings (.npy) and text chunks from their respective directories.
    """
    embeddings_list = []
    text_chunks = []

    files_found = [f for f in os.listdir(EMBEDDING_DIR) if f.endswith(".npy")]
    if not files_found:
        raise FileNotFoundError("⚠️ No `.npy` embedding files found!")

    for file in files_found:
        file_path = os.path.join(EMBEDDING_DIR, file)
        text_file = file.replace(".npy", "_chunks.pkl")
        text_path = os.path.join(TEXT_CHUNKS_DIR, text_file)

        try:
            embed = np.load(file_path)
            embeddings_list.append(embed)

            with open(text_path, "rb") as f:
                texts = pickle.load(f)
                text_chunks.extend(texts)
        except Exception as e:
            print(f"❌ Error loading {file}: {e}")
            continue

    if not embeddings_list:
        raise ValueError("❌ No valid embeddings found.")

    embeddings = np.vstack(embeddings_list)  # Stack embeddings
    return embeddings, text_chunks

def create_faiss_index(embeddings):
    """
    Creates a FAISS index for fast similarity search.
    """
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance metric for FAISS
    index.add(embeddings)  # Add embeddings to FAISS
    return index

# Load embeddings and create FAISS index
try:
    embeddings, text_chunks = load_embeddings()
    faiss_index = create_faiss_index(embeddings)
    print("✅ FAISS index created with", embeddings.shape[0], "entries.")
except Exception as e:
    print(f"❌ Error: {e}")

from sentence_transformers import SentenceTransformer

# Initialize your embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use the model that fits your use case

def search_faiss_index(query, faiss_index, embeddings, text_chunks, k=5):
    """
    Perform a search on the FAISS index for the most similar embeddings to the query.
    """
    # Convert query to embedding using the same model
    query_embedding = embedding_model.encode([query])  # Make sure embedding_model is defined

    # Perform the FAISS search
    distances, indices = faiss_index.search(np.array(query_embedding).astype(np.float32), k)
    
    # Retrieve the most similar text chunks
    top_k_chunks = [text_chunks[i] for i in indices[0]]
    
    return top_k_chunks, distances[0]


# def create_chatgroq_prompt(top_k_chunks, query):
#     """
#     Creates a formatted prompt for ChatGroq using retrieved chunks.
#     """
#     context = "\n".join(top_k_chunks)
#     prompt = ChatPromptTemplate.from_template(
#         """
#         Answer the following question based on the provided context:
#         <context>
#         {context}
#         </context>
#         Question: {input}
#         """
#     )
    
#     formatted_prompt = prompt.format(context=context, input=query)
#     return formatted_prompt
def create_chatgroq_prompt(top_k_chunks, query):
    """
    Creates a formatted prompt for ChatGroq using retrieved chunks and mandatory instructions.
    """
    # Join the top K chunks into a single context string
    context = "\n".join(top_k_chunks)
    
    # Construct the prompt with mandatory instructions for MedBot
    prompt = ChatPromptTemplate.from_template(
        """
        Instructions:
        - Provide accurate, clear, and medically relevant information based on the provided context.
        - If the information is unclear or missing, indicate that clearly.
        - Avoid giving personal medical advice; the response should be informative and factual.
        - Ensure medical terminology is explained where necessary, using simple language when possible.
        - Always clarify when a response is based on general information and not professional medical advice.
        
        Context:
        {context}

        Question: {input}

        Please provide your response below:
        """
    )
    
    # Format the prompt with the provided context and input query
    formatted_prompt = prompt.format(context=context, input=query)
    return formatted_prompt



def query_chatgroq_with_context(query, faiss_index, embeddings, text_chunks, top_k=5):
    """
    Query ChatGroq with the enhanced context retrieved from FAISS search.
    """
    # Step 1: Retrieve the relevant chunks from FAISS
    top_k_chunks, distances = search_faiss_index(query, faiss_index, embeddings, text_chunks, k=top_k)

    # Step 2: Prepare the prompt for ChatGroq
    formatted_prompt = create_chatgroq_prompt(top_k_chunks, query)

    # Step 3: Query ChatGroq with the enhanced prompt
    response = llm.invoke(formatted_prompt)  # Use the correct method to invoke

    # Debugging: Print the response and its type
    print(f"Response from ChatGroq: {response}")
    print(f"Response type: {type(response)}")  # Check the type of response

    # Assuming the answer is stored in the 'content' attribute
    return response.content  # Or adjust if a different attribute is used

# Example query
query = "give me information like( Essentials of Diagnosis, Differential Diagnosis, Treatment) Amphetamines, Ecstasy, Cocaine"

# Query ChatGroq with relevant context from FAISS
answer = query_chatgroq_with_context(query, faiss_index, embeddings, text_chunks, top_k=5)

print(f"Answer from ChatGroq: {answer}")



✅ FAISS index created with 2668 entries.
Response from ChatGroq: content="Here is the information you requested for Amphetamines, Ecstasy, and Cocaine:\n\n**Essentials of Diagnosis**\n\n* Substance intoxication and/or withdrawal (e.g., amphetamines, ecstasy, cocaine) can present with a wide range of symptoms, including:\n\t+ Psychiatric symptoms: agitation, anxiety, paranoia, hallucinations, and mood disturbances\n\t+ Neurological symptoms: tremors, muscle rigidity, and seizures\n\t+ Cardiovascular symptoms: tachycardia, hypertension, and cardiac arrhythmias\n\t+ Gastrointestinal symptoms: nausea, vomiting, and abdominal pain\n\t+ Respiratory symptoms: tachypnea and respiratory depression\n\n**Differential Diagnosis**\n\n* For amphetamines:\n\t+ Substance intoxication and/or withdrawal (e.g., cocaine, ecstasy)\n\t+ Medication use (e.g., methylphenidate, thyroxine)\n\t+ Endocrinopathies (e.g., hyperthyroidism, Cushing's syndrome)\n\t+ Central nervous system neoplasm\n\t+ Complex partial