# In [None]:
from typing import List
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from sklearn.metrics.pairwise import cosine_similarity
import pdfplumber
import random
import numpy as np
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser, PydanticToolsParser
import chardet
import re
from io import BytesIO
from docx import Document as DocxDocument

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF, joined by newlines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. No header or footer removal is performed.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, separated by ``'\\n'``; an empty string
        when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once; text extraction is the expensive
        # step, so do not call it a second time just to filter empty pages.
        page_texts = (page.extract_text() for page in pdf.pages)
        # The generator must be consumed while the PDF is still open.
        return '\n'.join(text for text in page_texts if text)

def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed a list of texts with the ``BAAI/bge-large-en`` HuggingFace model.

    Args:
        texts: Strings to embed.

    Returns:
        The result of ``HuggingFaceBgeEmbeddings.embed_documents``: one
        embedding vector per input text, as a list of float lists (not a
        NumPy array). Embeddings are not normalized. An empty input returns
        an empty list.
    """
    if not texts:
        # Nothing to embed: avoid loading the model for no work.
        return []
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        encode_kwargs={'normalize_embeddings': False},
    )
    return embedding_model.embed_documents(texts)

def create_vector_store_from_pdf(pdf_path: str):
    """Build a FAISS vector store over the text chunks of a single PDF.

    The PDF text is extracted, split into chunks of 500 characters with a
    50-character overlap, wrapped in ``Document`` objects and indexed using
    the ``BAAI/bge-large-en`` embedding model (embeddings not normalized).
    """
    raw_text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunk_docs = [
        Document(page_content=piece) for piece in splitter.split_text(raw_text)
    ]
    embedder = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        encode_kwargs={'normalize_embeddings': False},
    )
    return FAISS.from_documents(chunk_docs, embedder)

def retrieve(query: str, pdf_path: str):
    """Retrieve the 5 chunks of the given PDF most similar to ``query``.

    Args:
        query: Free-text query to search for.
        pdf_path: Path of the PDF to index and search.

    Returns:
        The documents returned by the FAISS store's similarity search
        (at most 5).
    """
    vector_store = create_vector_store_from_pdf(pdf_path)
    # The store embeds the query with the embedding model it was built with,
    # so there is no need to load a second copy of the model just to encode
    # the query before searching.
    return vector_store.similarity_search(query, k=5)

# Tool schema bound to the LLM via ``bind_tools`` in retrieve_results and used
# by PydanticToolsParser to parse the model's tool calls back into objects.
# NOTE(review): the docstring and the Field description below are presumably
# forwarded to the model as the tool/argument descriptions, so treat them as
# prompt text rather than ordinary documentation - confirm before rewording.
class SubQuery(BaseModel):
    """Extracts multiple sub-queries from a user query for retrieval."""
    # Sub-queries produced by the model; required field (no default).
    sub_queries: list[str] = Field(..., description="List of highly specific database queries.")

def _make_groq_llm(groq_api_key=None):
    """Build the Groq chat model used for query decomposition and answering."""
    if groq_api_key is None:
        # Backward compatibility: retrieve_results used to read a module-level
        # ``groq_api_key`` variable (e.g. one set in an earlier notebook cell).
        groq_api_key = globals().get("groq_api_key")
    llm_kwargs = {"model": "llama3-70b-8192", "temperature": 0.05}
    if groq_api_key is not None:
        llm_kwargs["groq_api_key"] = groq_api_key
    # With no explicit key, ChatGroq is left to resolve credentials itself
    # (it reads the GROQ_API_KEY environment variable).
    return ChatGroq(**llm_kwargs)


def retrieve_results(query: str, pdf_path: str, groq_api_key: "str | None" = None):
    """Decompose a user query into sub-queries and retrieve documents for each.

    Args:
        query: The user's question.
        pdf_path: Path of the PDF to index and search.
        groq_api_key: Groq API key. When omitted, a module-level
            ``groq_api_key`` variable is used if one exists; otherwise the
            client's own credential lookup applies.

    Returns:
        A dict mapping each sub-query to the documents retrieved for it
        (at most 5 per sub-query). If the model produces no usable sub-query,
        the original ``query`` is used as the only key.
    """
    system_prompt = """You are an expert at query decomposition.
    Your task is to break a user question into multiple highly specific sub-queries
    that must be answered to fully respond to the original question.
    Ensure sub-queries are specific and relevant to the context.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{question}"),
    ])

    llm = _make_groq_llm(groq_api_key)
    llm_with_tools = llm.bind_tools([SubQuery])
    parser = PydanticToolsParser(tools=[SubQuery])
    query_analyzer = prompt | llm_with_tools | parser
    response = query_analyzer.invoke({"question": query})

    # Flatten all tool calls, dropping blanks and duplicates while keeping
    # order (duplicates would collapse into one dict key anyway).
    sub_queries = list(dict.fromkeys(
        sub_query
        for item in response
        for sub_query in item.sub_queries
        if sub_query.strip()
    ))
    if not sub_queries:
        # The model returned no tool call: fall back to the raw question so
        # the caller still gets retrieval context.
        sub_queries = [query]

    # Build the index once and reuse it for every sub-query; rebuilding it
    # per sub-query would re-parse and re-embed the whole PDF each time.
    vector_store = create_vector_store_from_pdf(pdf_path)
    return {
        sub_query: vector_store.similarity_search(sub_query, k=5)
        for sub_query in sub_queries
    }


def get_answer(query: str, pdf_path: str, groq_api_key: str) -> str:
    """Answer a question about a PDF using sub-query retrieval as context.

    Args:
        query: The user's question.
        pdf_path: Path of the PDF to search.
        groq_api_key: Groq API key, used for both the query-decomposition
            call and the final answer call.

    Returns:
        The text produced by the LLM chain for the final answer.
    """
    retrieved_results = retrieve_results(query, pdf_path, groq_api_key=groq_api_key)
    # One section per sub-query, listing the non-blank retrieved chunks.
    doc_context = '\n---\n'.join([
        f"Sub-Query: {sub_query}\nResults:\n" + '\n'.join([doc.page_content for doc in docs if doc.page_content.strip()])
        for sub_query, docs in retrieved_results.items()
    ])

    prompt = PromptTemplate(template="""
        You are an intelligent chatbot answering legal document-related queries.
        Answer accurately using only the provided sub-queries and their corresponding answers given as contexts.
        If no relevant information is found, state that no relevant information is available.

        CONTEXT: {context}\nQUESTION: {question}\nFINAL ANSWER:
    """, input_variables=["context", "question"])

    llm = _make_groq_llm(groq_api_key)
    return LLMChain(llm=llm, prompt=prompt).run(context=doc_context, question=query)
