<a href="https://colab.research.google.com/github/HibaAp/RAG-KnowledgeBase-System/blob/main/DailyUpdates/24_02_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install langchain langchain-community pdfplumber numpy scikit-learn faiss-cpu requests langchain-groq googlesearch-python beautifulsoup4 langchain-experimental sentence_transformers

Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.2.4-py3-none-any.whl.metadata (3.0 kB)
Collecting googlesearch-python
  Downloading googlesearch_python-1.3.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain-experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.0-py3-none-any.whl.

In [None]:
import os

# SECURITY: never hard-code API keys in a notebook that is committed or shared.
# The key is read from the GROQ_API_KEY environment variable instead; set it
# before running this cell. Any key that was ever pasted into this cell should
# be treated as leaked and revoked.
groq_api_key = os.environ.get("GROQ_API_KEY", "")


In [None]:
from typing import List
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from sklearn.metrics.pairwise import cosine_similarity
import pdfplumber
import random
import numpy as np
import requests
from bs4 import BeautifulSoup
from googlesearch import search

# This function returns a list of relevant link addresses for the given query
def web_search(query: str, max_results: int = 3) -> List[str]:
    """Run ``search`` for ``query`` and return at most ``max_results`` results.

    Any exception raised while searching is printed and an empty list is
    returned instead, so callers never see the failure.
    """
    links: List[str] = []
    try:
        found = list(search(query, num_results=max_results))
        # Trim explicitly in case the search yields more than requested.
        links = found[:max_results]
    except Exception as err:
        print(f"Web search error: {err}")
    return links



# This function extracts the text content from a link
def fetch_content_from_link(link: str) -> str:
    """Fetch a web page and return its text with whitespace collapsed.

    Args:
        link: URL to fetch. ``https://`` is prepended when no scheme is given.

    Returns:
        The page text as one space-separated string, or ``""`` when the link
        is blank, the request fails, or the response is not text/HTML.
    """
    link = (link or "").strip()
    if not link:
        return ""
    try:
        if not link.startswith(('http://', 'https://')):
            link = f'https://{link}'
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        # Search results can point at binary files (PDF, DOCX, ...). Feeding
        # those bytes to the HTML parser only produces noise, so skip them.
        content_type = response.headers.get('Content-Type', '').lower()
        if content_type and 'html' not in content_type and not content_type.startswith('text/'):
            print(f"Skipping non-HTML content at {link}: {content_type}")
            return ""
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove elements whose contents are code or fallback markup rather
        # than readable page text.
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()
        return ' '.join(soup.get_text().split())
    except Exception as e:
        # Best-effort: a page that cannot be fetched contributes no context.
        print(f"Error fetching {link}: {e}")
        return ""

#Function to get live text from a given pdf
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF with repeated page headers and footers removed.

    The number of header (footer) lines is estimated on a sample of pages by
    comparing the n-th line from the top (bottom) across the sample: as long
    as the mean pairwise cosine similarity of the lines' embeddings (computed
    with ``embed_texts``) stays above 0.71, the line is counted as part of the
    header (footer). At most four lines are removed at each end of a page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The remaining lines of all pages joined with newlines; ``""`` for a
        PDF without pages.
    """
    similarity_threshold = 0.71
    max_boundary_lines = 4

    # Read every page once. extract_text() may give None/"" for pages that
    # have no text layer, so normalise to an empty string before splitting.
    with pdfplumber.open(pdf_path) as pdf:
        pages = [(page.extract_text() or "").split('\n') for page in pdf.pages]

    total_pages = len(pages)
    # Long documents: sample 10 pages, skipping the first five. Short
    # documents: use every page.
    if total_pages >= 15:
        sample_page_nos = random.sample(range(5, total_pages), 10)
    else:
        sample_page_nos = list(range(total_pages))

    def count_boundary_lines(is_header: bool) -> int:
        """Count leading (or trailing) lines that repeat across the sample."""
        # Pairwise similarity needs at least two pages; with fewer, nothing
        # can be identified as repeated.
        if len(sample_page_nos) < 2:
            return 0
        for depth in range(max_boundary_lines):
            index = depth if is_header else -(depth + 1)
            # Pages shorter than the probed depth contribute an empty line
            # instead of raising IndexError.
            candidates = [
                pages[page_no][index] if len(pages[page_no]) > depth else ""
                for page_no in sample_page_nos
            ]
            similarity = cosine_similarity(embed_texts(candidates))
            mean_similarity = np.mean(similarity[np.triu_indices(len(candidates), k=1)])
            if not mean_similarity > similarity_threshold:
                return depth
        return max_boundary_lines

    header_lines = count_boundary_lines(True)
    footer_lines = count_boundary_lines(False)

    stripped_pages = []
    for lines in pages:
        # Drop exactly `footer_lines` trailing lines. Slicing with
        # -(footer_lines + 1) would also discard one line of body text on
        # every page, even when no footer was detected.
        end = max(len(lines) - footer_lines, 0)
        stripped_pages.append('\n'.join(lines[header_lines:end]))
    return '\n'.join(stripped_pages)


#This will return embedding for a given text
from functools import lru_cache


@lru_cache(maxsize=1)
def _bge_embedding_model():
    """Build the BAAI/bge-large-en embedding model once and reuse it afterwards."""
    return HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", encode_kwargs={'normalize_embeddings': False})


def embed_texts(texts: List[str]) -> np.ndarray:
    """Embed a list of texts with the BAAI/bge-large-en HuggingFace model.

    The model is created on first use and cached, so repeated calls do not
    reload it each time.

    Args:
        texts: Strings to embed.

    Returns:
        The embeddings converted to a NumPy array, one row per input text.
    """
    # embed_documents() returns plain Python lists; convert so the result
    # matches the declared np.ndarray return type.
    return np.asarray(_bge_embedding_model().embed_documents(texts))

#retriever class
class WebEnhancedHydeRetriever:
    """Retriever that refines the query with web content before vector search.

    The query is sent to ``web_search``; the text of the returned links is
    passed with the question to ``hyde_chain``, and the chain's ``"text"``
    output is embedded and used for the similarity search. When no web text
    is available the original query is embedded instead.
    """

    def __init__(self, vectorstore, embedding_model, hyde_chain):
        """Store the vector store, embedding model and HyDE chain to use."""
        self.vectorstore = vectorstore
        self.embedding_model = embedding_model
        self.hyde_chain = hyde_chain

    def get_relevant_documents(self, query: str, k: int = 5):
        """Return the ``k`` documents most similar to the refined query.

        Args:
            query: The user's question.
            k: Number of documents to request from the vector store.

        Returns:
            Whatever ``vectorstore.similarity_search_by_vector`` returns.
        """
        links = web_search(query)
        pages = [fetch_content_from_link(link) for link in links]
        # Failed fetches come back as "". Drop blank pages so that a run of
        # failures does not yield a newline-only (truthy) context that would
        # trigger the HyDE call without any real web content.
        web_results = '\n'.join(page for page in pages if page.strip())[:3000]

        refined_query = query
        if web_results:
            hypothetical = self.hyde_chain.invoke({"question": query, "context": web_results})["text"]
            # Keep the original query if the chain produced nothing usable.
            if hypothetical and hypothetical.strip():
                refined_query = hypothetical

        query_vector = self.embedding_model.embed_query(refined_query)
        return self.vectorstore.similarity_search_by_vector(query_vector, k=k)

#retriever
def create_retriever(pdf_path: str, groq_api_key: str) -> WebEnhancedHydeRetriever:
    """Build a WebEnhancedHydeRetriever over the text of the given PDF.

    The PDF text is split into overlapping chunks, indexed in a FAISS vector
    store with BAAI/bge-large-en embeddings, and paired with a Groq-backed
    chain that writes a hypothetical answer from web context.
    """
    # Chunk the PDF text, skipping whitespace-only chunks.
    pdf_text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = []
    for chunk in splitter.split_text(pdf_text):
        if chunk.strip():
            docs.append(Document(page_content=chunk))

    # Index the chunks.
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        encode_kwargs={'normalize_embeddings': False},
    )
    vectorstore = FAISS.from_documents(docs, embedding_model)

    # Chain that drafts the hypothetical answer used as the search query.
    hyde_llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192", temperature=0.1)
    hyde_prompt = PromptTemplate(
        template="""WEB CONTEXT: {context}\nQUESTION: {question}\nHYPOTHETICAL ANSWER:""",
        input_variables=["question", "context"],
    )
    hyde_chain = LLMChain(llm=hyde_llm, prompt=hyde_prompt)

    return WebEnhancedHydeRetriever(vectorstore, embedding_model, hyde_chain)

#answer
def get_answer(query: str, retriever: WebEnhancedHydeRetriever, groq_api_key: str) -> str:
    """Answer ``query`` from the documents the retriever returns for it.

    The non-blank page contents are joined into one context string and sent,
    together with the question, to a Groq chat model through an LLMChain.
    """
    retrieved = retriever.get_relevant_documents(query)
    passages = [doc.page_content for doc in retrieved if doc.page_content.strip()]
    doc_context = '\n---\n'.join(passages)

    prompt = PromptTemplate(template="""
        You are an intelligent chatbot answering legal document-related queries.
        Answer accurately using only the provided context.
        If no relevant information is found, state that no relevant information is available.

        CONTEXT: {context}\nQUESTION: {question}\nFINAL ANSWER:
    """, input_variables=["context", "question"])

    answer_llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)
    answer_chain = LLMChain(llm=answer_llm, prompt=prompt)
    return answer_chain.run(context=doc_context, question=query)


In [None]:
# Sample question used by the cells below.
query = "How is the vertical inclination of the passing-beam verified?"

In [None]:
# Build the retriever over the regulation PDF.
retriever = create_retriever(pdf_path="/content/R048r12e.pdf", groq_api_key=groq_api_key)

  embedding_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", encode_kwargs={'normalize_embeddings': False})
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

  hyde_chain = LLMChain(llm=hyde_llm, prompt=PromptTemplate(template="""WEB CONTEXT: {context}\nQUESTION: {question}\nHYPOTHETICAL ANSWER:""", input_variables=["question", "context"]))


In [None]:
# Kept as a bare expression so the answer is shown as the cell output.
get_answer(query=query, retriever=retriever, groq_api_key=groq_api_key)



Error fetching https://wiki.unece.org/download/attachments/190087209/SLR-59-02-Rev.1_%28SLR%29_Draft%20GRE-2020-08-REV2%20adaptation%20to%20%27Red%20Box%27%20agreed%20at%20GRE87_SLR-59.docx?api=v2: 503 Server Error: Service Unavailable for url: https://wiki.unece.org/download/attachments/190087209/SLR-59-02-Rev.1_%28SLR%29_Draft%20GRE-2020-08-REV2%20adaptation%20to%20%27Red%20Box%27%20agreed%20at%20GRE87_SLR-59.docx?api=v2


  return LLMChain(llm=ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05), prompt=prompt).run(context=doc_context, question=query)


'Based on the provided context, the vertical inclination of the passing-beam is not explicitly mentioned. However, the vertical inclination of the dipped-beam is mentioned in several sections.\n\nAccording to paragraph 6.22.6.4, the measuring procedure for the dipped-beam inclination involves adjusting the initial setting of beam orientation, and the readings are accurate to within ±0.2 mrad (±0.02 per cent inclination).\n\nAdditionally, paragraph 2.2 defines the dipped-beam inclination as the angle, expressed in milliradians, between the direction of the beam towards a characteristic point on the horizontal part of the cut-off in the luminous distribution of the headlamp and the horizontal plane. Alternatively, it can be expressed as the tangent of that angle, in percentage inclination.\n\nNo relevant information is available regarding the verification of the vertical inclination of the passing-beam.'

In [None]:
def web_search(query: str, max_results: int = 3) -> List[str]:
    """Run ``search`` for ``query`` and return at most ``max_results`` results.

    Any exception raised while searching is printed and an empty list is
    returned instead, so callers never see the failure.
    """
    links: List[str] = []
    try:
        found = list(search(query, num_results=max_results))
        # Trim explicitly in case the search yields more than requested.
        links = found[:max_results]
    except Exception as err:
        print(f"Web search error: {err}")
    return links



# This function extracts the text content from a link
def fetch_content_from_link(link: str) -> str:
    """Fetch a web page and return its text with whitespace collapsed.

    Args:
        link: URL to fetch. ``https://`` is prepended when no scheme is given.

    Returns:
        The page text as one space-separated string, or ``""`` when the link
        is blank, the request fails, or the response is not text/HTML.
    """
    link = (link or "").strip()
    if not link:
        return ""
    try:
        if not link.startswith(('http://', 'https://')):
            link = f'https://{link}'
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        # Search results can point at binary files (PDF, DOCX, ...). Feeding
        # those bytes to the HTML parser only produces noise, so skip them.
        content_type = response.headers.get('Content-Type', '').lower()
        if content_type and 'html' not in content_type and not content_type.startswith('text/'):
            print(f"Skipping non-HTML content at {link}: {content_type}")
            return ""
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove elements whose contents are code or fallback markup rather
        # than readable page text.
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()
        return ' '.join(soup.get_text().split())
    except Exception as e:
        # Best-effort: a page that cannot be fetched contributes no context.
        print(f"Error fetching {link}: {e}")
        return ""

In [None]:
# Look up candidate links for the sample question.
links = web_search(query=query)

In [None]:
# Fetch each link, join the page texts with newlines, and keep the first 3000 characters.
web_results = '\n'.join(map(fetch_content_from_link, links))[:3000]



Error fetching https://wiki.unece.org/download/attachments/190087209/SLR-59-02-Rev.1_%28SLR%29_Draft%20GRE-2020-08-REV2%20adaptation%20to%20%27Red%20Box%27%20agreed%20at%20GRE87_SLR-59.docx?api=v2: 503 Server Error: Service Unavailable for url: https://wiki.unece.org/download/attachments/190087209/SLR-59-02-Rev.1_%28SLR%29_Draft%20GRE-2020-08-REV2%20adaptation%20to%20%27Red%20Box%27%20agreed%20at%20GRE87_SLR-59.docx?api=v2


In [None]:

# Mount Google Drive at /content/drive (uses the google.colab package, so this
# cell is specific to a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
def retrieve(query: str, vector_db_path: str = "/content/drive/MyDrive/vector_database", k: int = 5):
    """Return the ``k`` stored documents most similar to ``query``.

    Args:
        query: Question to embed and search for.
        vector_db_path: Directory of a FAISS index saved for the
            BAAI/bge-large-en embedding model.
        k: Number of documents to return (defaults to 5).

    Returns:
        The result of ``similarity_search_by_vector`` on the loaded index.
    """
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        encode_kwargs={'normalize_embeddings': False},
    )
    encoded_query = embedding_model.embed_query(query)
    # SECURITY: allow_dangerous_deserialization=True permits loading the
    # pickled part of the saved index, which can execute arbitrary code.
    # Only point vector_db_path at indexes you created yourself.
    vectorstore = FAISS.load_local(vector_db_path, embedding_model, allow_dangerous_deserialization=True)
    return vectorstore.similarity_search_by_vector(encoded_query, k=k)


In [None]:
# Query the saved vector database with the sample question.
retrieved_docs = retrieve(query=query)

In [None]:
# Bare expression: shows the retrieved documents as the cell output.
retrieved_docs

[Document(id='5a415781-c9c5-4710-a8ac-95bc24696bf5', metadata={}, page_content='according to Annex 10 of Regulation No. 123.\n6.22.6.4. Measuring procedure:\nAfter adjustment of the initial setting of beam orientation, the vertical\ninclination of the passing-beam or, when applicable, the vertical inclinations\nof all the different lighting units that provide or contribute to the cut-off(s)\naccording to paragraph 6.22.6.1.2.1. above of the basic passing-beam, shall\nbe verified for all loading conditions of the vehicle in accordance with the\nspecifications in paragraphs 6.2.6.3.1. and 6.2.6.3.2. of this Regulation.\n6.22.7. Electrical connections\n6.22.7.1. Main-beam lighting (if provided by the AFS)\n6.22.7.1.1. The lighting units for the main-beam may be activated either simultaneously\nor in pairs. For changing over from the dipped-beam to the main-beam at\nleast one pair of lighting units for the main-beam shall be activated. For\nchanging over from the main-beam to the dipped-be