In [4]:
from dotenv import load_dotenv
import os

# Pull settings defined in a .env file into the process environment
load_dotenv()

# Groq API key taken from the environment (None when GROQ_API_KEY is unset)
groq_api_key = os.environ.get('GROQ_API_KEY')


In [5]:
pip install langchain langchain-community pdfplumber numpy scikit-learn faiss-cpu requests langchain-groq googlesearch-python beautifulsoup4 langchain-experimental sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:

from typing import List
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from sklearn.metrics.pairwise import cosine_similarity
import pdfplumber
import random
import numpy as np
import requests
from bs4 import BeautifulSoup
from googlesearch import search

# This function returns a list of relevant link addresses for the given query
def web_search(query: str, max_results: int = 3) -> List[str]:
    """Return up to ``max_results`` result links for ``query``.

    The lookup is best-effort: any exception raised while searching is
    printed and an empty list is returned instead of propagating.

    Args:
        query: Search phrase. Blank or whitespace-only queries are not sent
            to the search backend.
        max_results: Upper bound on the number of links returned. Values
            below 1 produce an empty list without performing a search.

    Returns:
        A list with at most ``max_results`` links, or ``[]`` when nothing
        was searched or the search failed.
    """
    links: List[str] = []
    try:
        if not query or not query.strip() or max_results < 1:
            return links
        # Stop consuming the result iterator as soon as the cap is reached,
        # rather than draining it completely and slicing afterwards.
        for link in search(query, num_results=max_results):
            links.append(link)
            if len(links) >= max_results:
                break
    except Exception as e:
        print(f"Web search error: {e}")
        return []
    return links



# This function extracts the text content of the page behind a link
def fetch_content_from_link(link: str) -> str:
    """Download a web page and return its visible text as one line.

    Best-effort: any failure (network error, HTTP error status, parsing
    problem) is printed and an empty string is returned.

    Args:
        link: Page address. ``https://`` is prepended when the link does not
            already start with ``http://`` or ``https://``. Blank links are
            not requested.

    Returns:
        The page text with every run of whitespace collapsed to a single
        space, or ``""`` when the link is blank or the fetch fails.
    """
    try:
        link = link.strip() if link else ""
        if not link:
            return ""
        if not link.startswith(('http://', 'https://')):
            link = f'https://{link}'
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Script/style bodies are code, not page content; drop them so they
        # do not end up in the extracted text.
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()
        # A separator keeps text from adjacent elements (e.g. table cells,
        # list items) from being fused into a single word.
        return ' '.join(soup.get_text(separator=' ').split())
    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return ""

# Function to extract the text from a given PDF
def extract_text_from_pdf(pdf_path: str, threshold: float = 0.71, max_margin_lines: int = 4) -> str:
    """Extract text from a PDF, removing repeated headers and footers.

    The number of header (footer) lines is estimated by comparing the n-th
    line from the top (bottom) across a sample of pages: while the average
    pairwise cosine similarity of their embeddings stays above ``threshold``
    the line is treated as a repeated margin line. That many lines are then
    stripped from the top and bottom of every page.

    Args:
        pdf_path: Path of the PDF file to read.
        threshold: Average pairwise cosine similarity above which a line
            position counts as a repeated header/footer line.
        max_margin_lines: Maximum number of lines treated as header and as
            footer.

    Returns:
        The remaining text of all pages, joined with newlines. Pages without
        extractable text contribute an empty string.
    """
    # Read every page once; the same lines feed both detection and output.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None/"" for pages with no extractable text
        # (depends on the pdfplumber version), so normalise to "no lines".
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    page_lines = [text.split('\n') if text else [] for text in page_texts]

    total_pages = len(page_lines)
    if total_pages >= 15:
        # Long documents: compare 10 random pages, never the first five.
        sampled_page_nos = random.sample(range(5, total_pages), 10)
    else:
        sampled_page_nos = list(range(total_pages))
    sampled = [page_lines[page_no] for page_no in sampled_page_nos]

    def count_repeated_lines(is_header: bool) -> int:
        """Count leading (or trailing) line positions that repeat across the sample."""
        count = 0
        while count < max_margin_lines:
            # Pages too short to have a line at this position are skipped
            # instead of raising IndexError.
            candidates = [
                lines[count] if is_header else lines[-(count + 1)]
                for lines in sampled
                if len(lines) > count
            ]
            if len(candidates) < 2:
                break  # nothing to compare against
            similarity = cosine_similarity(embed_texts(candidates))
            pairwise = similarity[np.triu_indices(len(candidates), k=1)]
            if not np.mean(pairwise) > threshold:
                break
            count += 1
        return count

    header_lines = count_repeated_lines(True)
    footer_lines = count_repeated_lines(False)

    body_pages = []
    for lines in page_lines:
        # Drop exactly `footer_lines` lines from the end (none when it is 0);
        # clamp so header and footer removal never overlap on short pages.
        end = max(len(lines) - footer_lines, header_lines)
        body_pages.append('\n'.join(lines[header_lines:end]))
    return '\n'.join(body_pages)


# Return the embeddings for the given texts
def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed a list of texts with the BAAI/bge-large-en HuggingFace model.

    The embedding model is created on the first non-empty call and reused
    afterwards, so repeated calls do not construct it again.

    Args:
        texts: Strings to embed.

    Returns:
        Whatever the model's ``embed_documents`` returns for ``texts`` (one
        embedding per input text), or ``[]`` for empty input.
    """
    if not texts:
        return []
    # Cache the model on the function object; building it per call is wasteful.
    model = getattr(embed_texts, "_model", None)
    if model is None:
        model = HuggingFaceBgeEmbeddings(
            model_name="BAAI/bge-large-en",
            encode_kwargs={'normalize_embeddings': False},
        )
        embed_texts._model = model
    return model.embed_documents(texts)

#retriever class
class WebEnhancedHydeRetriever:
    """Vector-store retriever whose query is first rewritten using web context.

    For each query the retriever searches the web, downloads the result
    pages, and asks ``hyde_chain`` to produce a text from the question plus
    that web context. The embedding of that text (or of the raw query when no
    usable web context exists) is used for the similarity search.
    """

    def __init__(self, vectorstore, embedding_model, hyde_chain, max_context_chars: int = 3000):
        """Store the collaborators.

        Args:
            vectorstore: Object exposing ``similarity_search_by_vector(vector, k=...)``.
            embedding_model: Object exposing ``embed_query(text)``.
            hyde_chain: Object whose ``invoke({"question", "context"})``
                returns a mapping with a ``"text"`` entry.
            max_context_chars: Maximum number of characters of web text
                passed to ``hyde_chain``.
        """
        self.vectorstore = vectorstore
        self.embedding_model = embedding_model
        self.hyde_chain = hyde_chain
        self.max_context_chars = max_context_chars

    def get_relevant_documents(self, query: str, k: int = 5):
        """Return the ``k`` stored documents most similar to the refined query.

        Args:
            query: User question.
            k: Number of documents requested from the vector store.

        Returns:
            The result of ``vectorstore.similarity_search_by_vector``.
        """
        links = web_search(query)
        pages = [fetch_content_from_link(link) for link in links]
        # Failed fetches come back as "". Joining them unfiltered would give a
        # newline-only (yet truthy) context and trigger a pointless LLM call.
        web_results = '\n'.join(page for page in pages if page.strip())[:self.max_context_chars]

        refined_query = query
        if web_results.strip():
            hypothetical = self.hyde_chain.invoke({"question": query, "context": web_results})["text"]
            # Keep the raw query if the chain produced nothing usable.
            if hypothetical and hypothetical.strip():
                refined_query = hypothetical
        return self.vectorstore.similarity_search_by_vector(self.embedding_model.embed_query(refined_query), k=k)

#retriever
def create_retriever(pdf_path: str, groq_api_key: str) -> WebEnhancedHydeRetriever:
    """Build a WebEnhancedHydeRetriever over the contents of one PDF.

    Args:
        pdf_path: Path of the PDF whose text is indexed.
        groq_api_key: API key handed to the Groq chat model used by the
            HyDE chain.

    Returns:
        A retriever wired to a FAISS index of the PDF chunks, the embedding
        model used to build that index, and the HyDE chain.
    """
    # Split the PDF text into overlapping chunks, ignoring blank ones.
    pdf_text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunk_docs = []
    for chunk in splitter.split_text(pdf_text):
        if chunk.strip():
            chunk_docs.append(Document(page_content=chunk))

    # Index the chunks with the BGE embedding model.
    bge_model = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        encode_kwargs={'normalize_embeddings': False},
    )
    index = FAISS.from_documents(chunk_docs, bge_model)

    # Chain that drafts a hypothetical answer from the question and web context.
    llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192", temperature=0.1)
    hyde_prompt = PromptTemplate(
        template="""WEB CONTEXT: {context}\nQUESTION: {question}\nHYPOTHETICAL ANSWER:""",
        input_variables=["question", "context"],
    )
    chain = LLMChain(llm=llm, prompt=hyde_prompt)

    return WebEnhancedHydeRetriever(index, bge_model, chain)

#answer
def get_answer(query: str, retriever: "WebEnhancedHydeRetriever", groq_api_key: str) -> str:
    """Retrieve the most relevant documents and generate an answer.

    Args:
        query: User question.
        retriever: Retriever whose ``get_relevant_documents(query)`` supplies
            the context documents.
        groq_api_key: API key handed to the Groq chat model.

    Returns:
        The text generated by the LLM for the question, given the retrieved
        context.
    """
    documents = retriever.get_relevant_documents(query)
    # Blank chunks add nothing to the prompt, so they are left out.
    doc_context = '\n---\n'.join(doc.page_content for doc in documents if doc.page_content.strip())
    prompt = PromptTemplate(template="""
        You are an intelligent chatbot answering legal document-related queries.
        Answer accurately using only the provided context.
        If no relevant information is found, state that no relevant information is available.

        CONTEXT: {context}\nQUESTION: {question}\nFINAL ANSWER:
    """, input_variables=["context", "question"])

    llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)
    # `invoke` replaces the deprecated `Chain.run` and matches how the HyDE
    # chain is called in this notebook; the generated answer is under "text".
    return LLMChain(llm=llm, prompt=prompt).invoke({"context": doc_context, "question": query})["text"]
