# In [None]:
import os
import faiss
import numpy as np
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate


# Load DeepSeek Model via Hugging Face Hub
repo_id = "deepseek-ai/DeepSeek-V3"
# The Hugging Face API token is read from the HUGGINGFACEHUB_API_TOKEN
# environment variable so the secret never has to be written into this file.
# Export it before running, e.g. `export HUGGINGFACEHUB_API_TOKEN=hf_...`.
client = HuggingFaceHub(
    repo_id=repo_id,
    model_kwargs={"temperature": 0.5, "top_k": 10},
    huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
)

# Load the PDF
def load_pdf(pdf_path):
    """Load the PDF at ``pdf_path`` and return its text as one string.

    The documents produced by ``PyPDFLoader`` are concatenated in order,
    separated by a single newline.
    """
    pages = PyPDFLoader(pdf_path).load()
    page_texts = (page.page_content for page in pages)
    return "\n".join(page_texts)

# Split text into chunks
def split_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The full text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Function to embed text using a pre-trained sentence transformer
def embed_text(chunks):
    """Embed ``chunks`` with the ``all-MiniLM-L6-v2`` sentence transformer.

    Returns:
        A ``(embeddings, model)`` tuple: the encoder output (requested as a
        tensor) and the model instance, so the caller can reuse the same
        model to embed queries.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")  # small & fast embedding model
    chunk_embeddings = encoder.encode(chunks, convert_to_tensor=True)
    return chunk_embeddings, encoder

# Create and save FAISS Index
def build_faiss_index(embeddings, index_path="faiss_index"):
    """Build a flat L2 FAISS index from ``embeddings`` and write it to disk.

    Args:
        embeddings: Tensor-like 2-D embeddings supporting
            ``.cpu().detach().numpy()``; one row per chunk.
        index_path: File the index is written to with ``faiss.write_index``.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    # Coerce to a C-contiguous float32 matrix so the stored vectors use the
    # same dtype as the query vectors, which are cast to float32 before search.
    vectors = np.ascontiguousarray(
        embeddings.cpu().detach().numpy(), dtype=np.float32
    )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, index_path)  # Persist so later runs can reload it
    return index

# Load FAISS Index if available
def load_faiss_index(file_path="faiss_index"):
    """Read a FAISS index from ``file_path`` and return it."""
    stored_index = faiss.read_index(file_path)
    return stored_index

# Search for relevant chunks using FAISS
import numpy as np

def retrieve_relevant_chunks(query, index, text_chunks, embed_model, top_k=3, min_length=30, max_words=100, similarity_threshold=0.5):
    """
    Retrieve the text chunks closest to ``query``, filtering out weak or short matches.

    Args:
        query: Question text to embed and search for.
        index: FAISS-style index; only ``ntotal`` and ``search`` are used.
        text_chunks: Chunks indexed by the labels that ``index.search`` returns.
        embed_model: Object whose ``encode([query], convert_to_tensor=True)``
            result supports ``.cpu().detach().numpy()``.
        top_k: Number of neighbours requested from the index.
        min_length: A chunk is kept only if its stripped length exceeds this.
        max_words: Each returned chunk is truncated to this many words.
        similarity_threshold: Minimum ``1 / (1 + distance)`` score to keep a chunk.

    Returns:
        A list of truncated chunk strings, or ``["No relevant context found."]``
        when the index is empty or nothing passes the filters.
    """
    no_context = ["No relevant context found."]
    if index.ntotal == 0:
        return no_context

    # Get query embedding as float32 for the index search
    query_embedding = embed_model.encode([query], convert_to_tensor=True).cpu().detach().numpy().astype(np.float32)

    # FAISS search
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i, distance in zip(indices[0], distances[0]):
        # FAISS pads the result with label -1 when it finds fewer than top_k
        # neighbours. A negative label must be rejected explicitly, otherwise
        # it would wrap around and select the last chunk.
        if not 0 <= i < len(text_chunks):
            continue

        chunk = text_chunks[i].strip()

        # Lower distance = higher similarity; maps distance 0 to score 1
        similarity_score = 1 / (1 + distance)

        if similarity_score >= similarity_threshold and len(chunk) > min_length:
            # Limit chunk to `max_words`
            relevant_chunks.append(" ".join(chunk.split()[:max_words]))

    return relevant_chunks if relevant_chunks else no_context



# Generate answer using DeepSeek-LLM
def generate_answer(question, index, text_chunks, embed_model):
    """Answer ``question`` from retrieved document context via the LLM client.

    A fixed set of greetings gets a canned reply without any retrieval. If
    retrieval yields nothing usable, a fixed "no relevant information"
    message is returned instead of calling the model. Otherwise the top two
    retrieved chunks are placed in a prompt, and the text after the last
    "Answer:" marker in the model output is returned, stripped.
    """
    # Greetings are answered directly
    normalized_question = question.lower().strip()
    if normalized_question in (
        "hello",
        "hi",
        "hey",
        "how are you?",
        "what's up?",
        "good morning",
        "good evening",
    ):
        return "Hello! How can I assist you today?"

    retrieved = retrieve_relevant_chunks(question, index, text_chunks, embed_model, top_k=2)

    # Without meaningful context, refuse rather than let the model guess
    if not retrieved or "No relevant context found." in retrieved:
        return "I don't have relevant information in the document to answer this question."

    # Only the two best chunks go into the prompt
    context = "\n".join(retrieved[:2])

    # Prompt text assembled line by line; whitespace is part of the prompt
    prompt = (
        "\n"
        "    You are an AI assistant. Use the provided context to answer the question concisely and accurately.\n"
        "    \n"
        "    Context: \n"
        f"    {context}\n"
        "    \n"
        f"    Question: {question}\n"
        "    \n"
        "    Answer:"
    )

    completion = client.invoke(prompt)

    # Keep only what follows the final "Answer:" marker
    answer_text = completion.split("Answer:")[-1]
    return answer_text.strip()



# File path to the PDF
# Path of the PDF to index: taken from the PDF_PATH environment variable,
# otherwise the fallback below is used (edit it to point at your PDF).
pdf_path = os.environ.get("PDF_PATH", "document.pdf")  # add your pdf path here

# Load and process the PDF
text = load_pdf(pdf_path)
text_chunks = split_text(text)

# Reuse the saved FAISS index only if it matches the current chunks.
# Index labels are positions in `text_chunks`, so an index built from a
# different document (or different chunking) would return the wrong chunks.
index_path = "faiss_index"
faiss_index = None
if os.path.exists(index_path):
    faiss_index = load_faiss_index(index_path)
    if faiss_index.ntotal != len(text_chunks):
        faiss_index = None  # Stale index: fall through and rebuild it

if faiss_index is None:
    embeddings, embed_model = embed_text(text_chunks)
    faiss_index = build_faiss_index(embeddings, index_path)
else:
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # Ensure embed_model is available


# Define chatbot function for Gradio
def chatbot(query, history=None):
    """Gradio chat callback: answer ``query`` from the indexed PDF.

    ``history`` is accepted so the function can be called with a second
    argument without raising a TypeError; it is not used.
    """
    return generate_answer(
        question=query,
        index=faiss_index,
        text_chunks=text_chunks,
        embed_model=embed_model,
    )

# Create Gradio Chatbot Interface
interface = gr.ChatInterface(
    fn=chatbot,
    title="PDF-based Chatbot with DeepSeek",
    description="Ask questions based on the provided PDF book. The chatbot will only answer from the book's content.",
)

# Start the web UI; share=True asks Gradio for a shareable link
interface.launch(share=True)

