<a href="https://colab.research.google.com/github/HibaAp/RAG-KnowledgeBase-System/blob/main/DailyUpdates/31_01_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
groq_api_key = "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"


In [2]:
pip install langchain langchain-community pdfplumber numpy scikit-learn faiss-cpu requests langchain-groq googlesearch-python beautifulsoup4 langchain-experimental sentence_transformers



In [3]:
!pip install sentence_transformers



In [4]:
from bs4 import BeautifulSoup
import requests
def get_retrievers(pdf_path):
    import warnings
    warnings.filterwarnings("ignore")
    import random
    import pdfplumber
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    from langchain.embeddings import HuggingFaceBgeEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.docstore.document import Document
    from langchain_community.vectorstores import FAISS


    embedding_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en",
                                               encode_kwargs={'normalize_embeddings': False})

    def embed_texts(texts):
        return embedding_model.embed_documents(texts)

    def get_header_footer(pdf_path, threshold=0.71):
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            if total_pages >= 15:
                random_page_nos = random.sample(range(5, total_pages), 10)
            else:
                random_page_nos = list(range(total_pages))

            avg_similarity = 1
            header_lines = -1
            while avg_similarity > threshold and header_lines < 4:
                header_lines += 1
                five_lines = []
                for page_no in random_page_nos:
                    lines = pdf.pages[page_no].extract_text().split('\n')
                    if len(lines) > header_lines:
                        five_lines.append(lines[header_lines])
                similarities = cosine_similarity(embed_texts(five_lines))
                avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])

            avg_similarity = 1
            footer_lines = -1
            while avg_similarity > threshold and footer_lines < 4:
                footer_lines += 1
                five_lines = []
                for page_no in random_page_nos:
                    lines = pdf.pages[page_no].extract_text().split('\n')
                    if len(lines) > footer_lines:
                        five_lines.append(lines[-(footer_lines + 1)])
                similarities = cosine_similarity(embed_texts(five_lines))
                avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])
            return header_lines, footer_lines

    def extract_text(pdf_path):
        header_lines, footer_lines = get_header_footer(pdf_path)
        with pdfplumber.open(pdf_path) as pdf:
            text = ''
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    lines = page_text.split('\n')
                    if lines:
                        page_text = '\n'.join(lines[header_lines:-(footer_lines + 1)])
                        text += page_text + '\n'
            return text

    text = extract_text(pdf_path)

    def get_vectorstore1():
        texts = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(text)
        docs = [Document(text) for text in texts if text.strip()]
        vectorstore = FAISS.from_documents(docs, embedding_model)
        return vectorstore

    def get_vectorstore2():
        texts = RecursiveCharacterTextSplitter(chunk_size=6000, chunk_overlap=400).split_text(text)
        docs = [Document(text) for text in texts if text.strip()]
        vectorstore = FAISS.from_documents(docs, embedding_model)
        return vectorstore

    retriever1 = get_vectorstore1().as_retriever(search_kwargs={"k": 6})
    retriever2 = get_vectorstore2().as_retriever(search_kwargs={"k": 3})
    return retriever1, retriever2
def web_search(query, max_results=3):
    """Perform actual web search using googlesearch-python"""
    from googlesearch import search
    # Use num_results instead of stop
    results = list(search(query, num_results=max_results))
    return results[:max_results]

def fetch_content_from_link(link):
    try:
        # Validate URL scheme
        if not link.startswith(('http://', 'https://')):
            link = f'https://{link}'  # Attempt to fix missing scheme
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        raw_text = soup.get_text()
        cleaned_text = ' '.join(raw_text.split())
        return cleaned_text
    except Exception as e:
        print(f"Error fetching {link}: {str(e)}")
        return ""  # Return empty string for failed requests



#print(results_str)


def get_answer(query, retriever1, retriever2, groq_api_key):
    from langchain.prompts import PromptTemplate
    from langchain_groq import ChatGroq
    from langchain.chains import LLMChain

    # 1. Perform web search
    links=web_search(query)
    web_results = results_str = "\n".join([f"{i+1}. {fetch_content_from_link(link)}" for i, link in enumerate(links)])


    # 2. Retrieve document content
    doc_results1 = retriever1.get_relevant_documents(query)
    doc_results2 = retriever2.get_relevant_documents(query)
    doc_context = "\n---\n".join([doc.page_content for doc in doc_results1 + doc_results2])

    # 3. Prepare combined context
    combined_context = f"""
    WEB SEARCH RESULTS:
    {web_results}

    DOCUMENT CONTENT:
    {doc_context}
    """
    if len(combined_context)>4000:
      combined_context=combined_context[:4000]

    # 4. Create LLM chain with combined context
    llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)

    prompt_template = """
    Analyze and synthesize information from both web results and document content to answer
    the question. Follow these steps:
    1. Identify relevant information from web results
    2. Find supporting/contradictory information in documents
    3. Combine insights from both sources
    4. If sources conflict, note this and prioritize document content
    5. Just give the final answer . I dont want you to restate the question, or web results
    CONTEXT:
    {context}

    QUESTION: {question}

    FINAL ANSWER:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(context=combined_context, question=query)

def compare_answers(query, retriever1, retriever2, retriever3, retriever4, groq_api_key):
    # Get answers from both document sets (automatically includes web search)
    answer1 = get_answer(query, retriever1, retriever2, groq_api_key)
    answer2 = get_answer(query, retriever3, retriever4, groq_api_key)

    # Generate comparison
    from langchain_groq import ChatGroq
    llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)

    comparison_prompt = f"""
    Compare two answers to the same question, considering both were generated using:
    - Web search results
    - Different document sets

    QUESTION: {query}

    ANSWER 1: {answer1}

    ANSWER 2: {answer2}

    Analyze:
    1. Key similarities and differences
    2. Potential reasons for discrepancies
    3. Which answer better synthesizes web+document info
    4. Any missing information in either answer
    """

    return llm.invoke(comparison_prompt).content

In [5]:
retriever1,retriever2=get_retrievers("/content/geometry1-projective.pdf")

  embedding_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en",


In [6]:
query="What is projective Space?"

In [7]:
get_answer(query,retriever1,retriever2,groq_api_key)

Error fetching https:///search?num=5: Invalid URL 'https:///search?num=5': No host supplied


  doc_results1 = retriever1.get_relevant_documents(query)
  chain = LLMChain(llm=llm, prompt=prompt)
  return chain.run(context=combined_context, question=query)


'A projective space is a mathematical concept that originated from the visual effect of perspective, where parallel lines seem to meet at infinity. It can be viewed as the extension of a Euclidean space or an affine space with points at infinity, in such a way that there is one point at infinity of each direction of parallel lines. Alternatively, it can be defined as the set of vector lines in a vector space of higher dimension, or as a sphere in which antipodal points are identified.'

In [8]:
def generate_HyDoc(query,groq_api_key):
    from langchain_groq import ChatGroq
    llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)


In [9]:
from bs4 import BeautifulSoup
import requests
def get_retrievers(pdf_path):
    import warnings
    warnings.filterwarnings("ignore")
    import random
    import pdfplumber
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    from langchain.embeddings import HuggingFaceBgeEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.docstore.document import Document
    from langchain_community.vectorstores import FAISS


    embedding_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en",
                                               encode_kwargs={'normalize_embeddings': False})

    def embed_texts(texts):
        return embedding_model.embed_documents(texts)

    def get_header_footer(pdf_path, threshold=0.71):
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            if total_pages >= 15:
                random_page_nos = random.sample(range(5, total_pages), 10)
            else:
                random_page_nos = list(range(total_pages))

            avg_similarity = 1
            header_lines = -1
            while avg_similarity > threshold and header_lines < 4:
                header_lines += 1
                five_lines = []
                for page_no in random_page_nos:
                    lines = pdf.pages[page_no].extract_text().split('\n')
                    if len(lines) > header_lines:
                        five_lines.append(lines[header_lines])
                similarities = cosine_similarity(embed_texts(five_lines))
                avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])

            avg_similarity = 1
            footer_lines = -1
            while avg_similarity > threshold and footer_lines < 4:
                footer_lines += 1
                five_lines = []
                for page_no in random_page_nos:
                    lines = pdf.pages[page_no].extract_text().split('\n')
                    if len(lines) > footer_lines:
                        five_lines.append(lines[-(footer_lines + 1)])
                similarities = cosine_similarity(embed_texts(five_lines))
                avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])
            return header_lines, footer_lines

    def extract_text(pdf_path):
        header_lines, footer_lines = get_header_footer(pdf_path)
        with pdfplumber.open(pdf_path) as pdf:
            text = ''
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    lines = page_text.split('\n')
                    if lines:
                        page_text = '\n'.join(lines[header_lines:-(footer_lines + 1)])
                        text += page_text + '\n'
            return text

    text = extract_text(pdf_path)

    hyde_prompt = PromptTemplate(
        input_variables=["question"],
        template="Generate a hypothetical legal answer to: {question} Include relevant laws and keywords."
    )

    # Initialize LLM for HyDE (add to existing code)
    hyde_llm = OpenAI(temperature=0.1)  # Add to top with other imports

    hyde_chain = LLMChain(llm=hyde_llm, prompt=hyde_prompt)

    # Wrap existing embeddings with HyDE
    hyde_embeddings = HypotheticalDocumentEmbedder(
        llm_chain=hyde_chain,
        base_embeddings=embedding_model
    )

    # Modify vectorstore creation
    def get_vectorstore1():
        texts = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(text)
        docs = [Document(text) for text in texts if text.strip()]
        # Use hyde_embeddings instead of base embeddings
        vectorstore = FAISS.from_documents(docs, hyde_embeddings)
        return vectorstore

    def get_vectorstore2():
        texts = RecursiveCharacterTextSplitter(chunk_size=6000, chunk_overlap=400).split_text(text)
        docs = [Document(text) for text in texts if text.strip()]
        # Use hyde_embeddings instead of base embeddings
        vectorstore = FAISS.from_documents(docs, hyde_embeddings)
        return vectorstore

    retriever1 = get_vectorstore1().as_retriever(search_kwargs={"k": 6})
    retriever2 = get_vectorstore2().as_retriever(search_kwargs={"k": 3})
    return retriever1, retriever2
def web_search(query, max_results=3):
    """Perform actual web search using googlesearch-python"""
    from googlesearch import search
    # Use num_results instead of stop
    results = list(search(query, num_results=max_results))
    return results[:max_results]

def fetch_content_from_link(link):
    try:
        # Validate URL scheme
        if not link.startswith(('http://', 'https://')):
            link = f'https://{link}'  # Attempt to fix missing scheme
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        raw_text = soup.get_text()
        cleaned_text = ' '.join(raw_text.split())
        return cleaned_text
    except Exception as e:
        print(f"Error fetching {link}: {str(e)}")
        return ""  # Return empty string for failed requests



#print(results_str)


def get_answer(query, retriever1, retriever2, groq_api_key):
    from langchain.prompts import PromptTemplate
    from langchain_groq import ChatGroq
    from langchain.chains import LLMChain

    # 1. Perform web search
    links=web_search(query)
    web_results = results_str = "\n".join([f"{i+1}. {fetch_content_from_link(link)}" for i, link in enumerate(links)])


    # 2. Retrieve document content
    doc_results1 = retriever1.get_relevant_documents(query)
    doc_results2 = retriever2.get_relevant_documents(query)
    doc_context = "\n---\n".join([doc.page_content for doc in doc_results1 + doc_results2])

    # 3. Prepare combined context
    combined_context = f"""
    WEB SEARCH RESULTS:
    {web_results}

    DOCUMENT CONTENT:
    {doc_context}
    """
    if len(combined_context)>4000:
      combined_context=combined_context[:4000]

    # 4. Create LLM chain with combined context
    llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)

    prompt_template = """
    Analyze and synthesize information from both web results and document content to answer
    the question. Follow these steps:
    1. Identify relevant information from web results
    2. Find supporting/contradictory information in documents
    3. Combine insights from both sources
    4. If sources conflict, note this and prioritize document content
    5. Just give the final answer . I dont want you to restate the question, or web results
    CONTEXT:
    {context}

    QUESTION: {question}

    FINAL ANSWER:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(context=combined_context, question=query)

def compare_answers(query, retriever1, retriever2, retriever3, retriever4, groq_api_key):
    # Get answers from both document sets (automatically includes web search)
    answer1 = get_answer(query, retriever1, retriever2, groq_api_key)
    answer2 = get_answer(query, retriever3, retriever4, groq_api_key)

    # Generate comparison
    from langchain_groq import ChatGroq
    llm = ChatGroq(groq_api_key=groq_api_key, model='llama3-70b-8192', temperature=0.05)

    comparison_prompt = f"""
    Compare two answers to the same question, considering both were generated using:
    - Web search results
    - Different document sets

    QUESTION: {query}

    ANSWER 1: {answer1}

    ANSWER 2: {answer2}

    Analyze:
    1. Key similarities and differences
    2. Potential reasons for discrepancies
    3. Which answer better synthesizes web+document info
    4. Any missing information in either answer
    """

    return llm.invoke(comparison_prompt).content

In [None]:
!pip install --upgrade langchain-experimental

from langchain_experimental.hypothetical_document_embedder import HypotheticalDocumentEmbedder

In [11]:
!pip install langchain-experimental
# This ensures that the langchain-experimental package is installed in your environment
# The `HypotheticalDocumentEmbedder` class is located in this package.



In [12]:
from langchain_experimental.hypothetical_document_embedder import HypotheticalDocumentEmbedder
# This line imports the necessary class from the correct module.

ModuleNotFoundError: No module named 'langchain_experimental.hypothetical_document_embedder'

In [13]:
from langchain_experimental.hypothetical_document_embedder import HypotheticalDocumentEmbedder
print("Module imported successfully!")


ModuleNotFoundError: No module named 'langchain_experimental.hypothetical_document_embedder'

In [14]:
!pip show langchain_experimental


Name: langchain-experimental
Version: 0.3.4
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain-experimental
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: langchain-community, langchain-core
Required-by: 


In [15]:
from langchain_experimental.document_embedders import HypotheticalDocumentEmbedder


ModuleNotFoundError: No module named 'langchain_experimental.document_embedders'

In [None]:
!pip install langchain_experimental


In [None]:
#!pip install --upgrade langchain


In [29]:
from bs4 import BeautifulSoup
import requests
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
from langchain_groq import ChatGroq

def get_retrievers(pdf_path, groq_api_key):
    import warnings
    warnings.filterwarnings("ignore")
    import random
    import pdfplumber
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    from langchain.embeddings import HuggingFaceBgeEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.docstore.document import Document
    from langchain_community.vectorstores import FAISS

    # Initialize HyDE components
    hyde_prompt = PromptTemplate(
        input_variables=["question"],
        template="""Generate a comprehensive hypothetical answer to: {question}
    Include key facts, concepts, and relevant context."""
    )

    hyde_llm = ChatGroq(
        groq_api_key=groq_api_key,
        model_name="llama3-70b-8192",
        temperature=0.1
    )
    hyde_chain = LLMChain(llm=hyde_llm, prompt=hyde_prompt)

    # Base embeddings model
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        encode_kwargs={'normalize_embeddings': False}
    )

    # Wrap with HyDE
    hyde_embeddings = HypotheticalDocumentEmbedder(
        llm_chain=hyde_chain,
        base_embeddings=embedding_model,
        include_original=True
    )

    def get_header_footer(pdf_path, threshold=0.71):
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            if total_pages >= 15:
                random_page_nos = random.sample(range(5, total_pages), 10)
            else:
                random_page_nos = list(range(total_pages))

            avg_similarity = 1
            header_lines = -1
            while avg_similarity > threshold and header_lines < 4:
                header_lines += 1
                five_lines = []
                for page_no in random_page_nos:
                    lines = pdf.pages[page_no].extract_text().split('\n')
                    if len(lines) > header_lines:
                        five_lines.append(lines[header_lines])
                similarities = cosine_similarity(embed_texts(five_lines))
                avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])

            avg_similarity = 1
            footer_lines = -1
            while avg_similarity > threshold and footer_lines < 4:
                footer_lines += 1
                five_lines = []
                for page_no in random_page_nos:
                    lines = pdf.pages[page_no].extract_text().split('\n')
                    if len(lines) > footer_lines:
                        five_lines.append(lines[-(footer_lines + 1)])
                similarities = cosine_similarity(embed_texts(five_lines))
                avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])
            return header_lines, footer_lines


    def extract_text(pdf_path):
        header_lines, footer_lines = get_header_footer(pdf_path)
        with pdfplumber.open(pdf_path) as pdf:
            text = ''
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    lines = page_text.split('\n')
                    if lines:
                        page_text = '\n'.join(lines[header_lines:-(footer_lines + 1)])
                        text += page_text + '\n'
            return text

    text = extract_text(pdf_path)

    def get_vectorstore1():
        texts = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(text)
        docs = [Document(text) for text in texts if text.strip()]
        vectorstore = FAISS.from_documents(docs, hyde_embeddings)  # Use HyDE embeddings
        return vectorstore

    def get_vectorstore2():
        texts = RecursiveCharacterTextSplitter(chunk_size=6000, chunk_overlap=400).split_text(text)
        docs = [Document(text) for text in texts if text.strip()]
        vectorstore = FAISS.from_documents(docs, hyde_embeddings)  # Use HyDE embeddings
        return vectorstore

    retriever1 = get_vectorstore1().as_retriever(search_kwargs={"k": 6})
    retriever2 = get_vectorstore2().as_retriever(search_kwargs={"k": 3})
    return retriever1, retriever2

def web_search(query, max_results=3):
    """Perform web search using googlesearch-python"""
    from googlesearch import search
    results = list(search(query, num_results=max_results))
    return results[:max_results]

def fetch_content_from_link(link):
    try:
        if not link.startswith(('http://', 'https://')):
            link = f'https://{link}'
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        raw_text = soup.get_text()
        cleaned_text = ' '.join(raw_text.split())
        return cleaned_text
    except Exception as e:
        #print(f"Error fetching {link}: {str(e)}")
        return ""

def get_answer(query, retriever1, retriever2, groq_api_key):
    # Web search integration
    links = web_search(query)
    web_results = "\n".join([f"{i+1}. {fetch_content_from_link(link)}" for i, link in enumerate(links)])

    # HyDE-enhanced document retrieval
    doc_results1 = retriever1.get_relevant_documents(query)
    doc_results2 = retriever2.get_relevant_documents(query)
    doc_context = "\n---\n".join([doc.page_content for doc in doc_results1 + doc_results2])

    # Context management
    combined_context = f"""
    WEB SEARCH RESULTS:
    {web_results}

    DOCUMENT CONTENT:
    {doc_context}
    """
    if len(combined_context) > 4000:
        combined_context = combined_context[:4000]

    # LLM response generation
    llm = ChatGroq(
        groq_api_key=groq_api_key,
        model_name="llama3-70b-8192",
        temperature=0.05
    )

    prompt_template = """
    Analyze and synthesize information from both web results and document content to answer
    the question. Follow these steps:
    1. Identify key facts from web results
    2. Find supporting information in documents
    3. Combine insights from both sources
    4. If sources conflict, note this and prioritize document content
    5. Provide a clear, concise answer
    6. Do not restate the question. Provide a direct comparison of the answers focusing only on:
    7. Give a final judgment on which answer is better and why, without using phrases like 'based on web results' or unnecessary explanations.

    CONTEXT:
    {context}

    QUESTION: {question}

    FINAL ANSWER:
    """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(context=combined_context, question=query)

def compare_answers(query, retriever1, retriever2, retriever3, retriever4, groq_api_key):
    answer1 = get_answer(query, retriever1, retriever2, groq_api_key)
    answer2 = get_answer(query, retriever3, retriever4, groq_api_key)

    comparison_prompt = f"""
    Compare the two answers given for the same question:

    QUESTION: {query}

    ANSWER 1: {answer1}

    ANSWER 2: {answer2}

    Do not restate the question. Provide a direct comparison of the answers focusing only on:
    1. Factual consistency
    2. Source reliability
    3. Completeness of information
    4. Clarity of presentation

    Give a final judgment on which answer is better and why, without using phrases like 'based on web results' or unnecessary explanations.
    """


    llm = ChatGroq(
        groq_api_key=groq_api_key,
        model_name="llama3-70b-8192",
        temperature=0.05
    )

    return llm.invoke(comparison_prompt).content

In [30]:
get_answer(query,retriever1,retriever2,groq_api_key)

'A projective space is a mathematical concept that extends a Euclidean space or affine space by adding "points at infinity" in such a way that there is one point at infinity for each direction of parallel lines. It can be defined as the set of vector lines in a vector space of higher dimension, or equivalently, as a sphere where antipodal points are identified.'

In [31]:
retriever5,retriever6=get_retrievers("/content/UN ECE R130.pdf",groq_api_key)
"""retiever7,retriever8=get_retievers("")"""

ValidationError: 1 validation error for HypotheticalDocumentEmbedder
include_original
  Extra inputs are not permitted [type=extra_forbidden, input_value=True, input_type=bool]
    For further information visit https://errors.pydantic.dev/2.10/v/extra_forbidden