In [52]:
!pip install chromadb
!pip install langchain_groq
!pip install unstructured
!pip install pdfminer.six
!pip install langchain
!pip install langchain_community



In [32]:
import os
from getpass import getpass

# Credentials must never be saved in the notebook. Use GROQ_API_KEY from the
# environment when it is already set, and otherwise prompt for it without
# echoing the value.
# NOTE(review): any key that was ever committed in this cell should be revoked.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

In [148]:
from langchain.document_loaders import WebBaseLoader
import bs4

# Load the Robotics article, asking BeautifulSoup to parse only the elements
# that carry the "mw-body-content" class.
web_loader = WebBaseLoader(
    web_paths=("https://en.wikipedia.org/wiki/Robotics",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="mw-body-content")},
)
web_documents = web_loader.load()
print(web_documents)



In [149]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded page into chunks of up to 1500 characters, with 200
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1500,
)
splits = text_splitter.split_documents(web_documents)
print(splits)
print(f"Your {len(web_documents)} documents have been split into {len(splits)} chunks")


Your 1 documents have been split into 97 chunks


In [157]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint name passed to both from_pretrained calls below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Hugging Face model for embeddings
# Module-level tokenizer and model; the embedding helpers defined below read
# these globals directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
def get_embedding(text):
    """Embed ``text`` by averaging the model's last hidden state.

    ``text`` is tokenized with truncation and padding enabled, run through the
    module-level ``model`` without gradient tracking, averaged over dimension 1
    of ``last_hidden_state``, squeezed, and returned as a plain Python list.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy().tolist()


class CustomEmbeddings:
    """Embedding adapter exposing ``embed_documents`` and ``embed_query``.

    Both methods go through the module-level ``get_embedding`` helper, so
    documents and queries are embedded in exactly the same way.
    """

    def embed_documents(self, texts):
        """Return one embedding per entry of ``texts``, in the same order.

        The strings passed in are what gets embedded. Reading a global chunk
        list here instead would attach unrelated vectors whenever a different
        set of documents (summaries, propositions, ...) is indexed.
        """
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding of ``text`` as produced by ``get_embedding``."""
        return get_embedding(text)


# Shared embeddings object; later cells pass it to Chroma.from_documents.
embedding_model = CustomEmbeddings()



In [30]:
from langchain_community.vectorstores import Chroma

# Directory handed to Chroma as its persistence location.
persist_directory = "db_storage"

# Index the chunks in Chroma, embedding them with `embedding_model`.
db = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    persist_directory=persist_directory,
)
db.persist()

In [31]:
from langchain_groq import ChatGroq
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate
import logging

In [32]:
logging.basicConfig()
# The multi-query retriever lives in the `multi_query` module (with an
# underscore); raising the level on any other logger name has no effect on it.
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [50]:
## Using multiquery vector

question = "When was robotics start getting used in industry "

llm = ChatGroq()
# Build a MultiQueryRetriever from the LLM on top of the vector store's
# default retriever.
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=db.as_retriever(),
)

In [38]:
# Documents the multi-query retriever returns for `question`.
unique_docs_multi_query = multi_query_retriever.get_relevant_documents(query = question)

In [46]:
# Template for the answering step; `{context}` and `{input}` are the two
# placeholders declared as input variables below.
prompt_template = """
Use the following context to answer the question at the end. If you don't know the answer, just say that you don't know, dont try to make up an answer.
<context>
{context}
</context>
Question: {input}
"""

PROMPT = PromptTemplate(
    input_variables=["context", "input"],
    template=prompt_template,
)

In [47]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Answering step: stuff the retrieved documents into PROMPT and send it to the LLM.
document_chain = create_stuff_documents_chain(ChatGroq(), PROMPT)

retriever = db.as_retriever()
# create_retrieval_chain fills "context" from the retriever it is built with,
# so a "context" value passed to invoke() is overwritten. For the answer to be
# based on the multi-query results, the multi-query retriever has to drive the
# chain, and only the question is passed in.
retrieval_chain = create_retrieval_chain(multi_query_retriever, document_chain)
response = retrieval_chain.invoke({"input": question})


In [48]:
# Display the whole chain output; the captured output below starts with its
# 'context' entry (the retrieved documents).
response

{'context': [Document(metadata={'source': 'https://en.wikipedia.org/wiki/Robotics'}, page_content='Manufacturing. Robots have been increasingly used in manufacturing since the 1960s. According to the Robotic Industries Association US data, in 2016 the automotive industry was the main customer of industrial robots with 52% of total sales.[5] In the auto industry, they can amount for more than half of the "labor". There are even "lights off" factories such as an IBM keyboard manufacturing factory in Texas that was fully automated as early as 2003.[6]\nAutonomous transport including airplane autopilot and self-driving cars\nDomestic robots including robotic vacuum cleaners, robotic lawn mowers, dishwasher loading[7] and flatbread baking.[8]\nConstruction robots. Construction robots can be separated into three types: traditional robots, robotic arm, and robotic exoskeleton.[9]\nAutomated mining.\nSpace exploration, including Mars rovers.\nEnergy applications including cleanup of nuclear co

In [49]:

# Show only the generated answer from the chain output.
response['answer']

'Robotics have been increasingly used in manufacturing since the 1960s, according to the provided context. This implies that the use of robotics in industry started around the 1960s and has been growing ever since.'

In [53]:
## Multi vector retrieval

from langchain.schema.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
import uuid

In [54]:
# Summarization chain built on `llm`; no other options are passed, so
# load_summarize_chain's defaults apply.
summarize_chain = load_summarize_chain(llm)

In [56]:
id_key = "doc_key"
summaries = []
# One summary Document per chunk; the chunk and its summary carry the same
# fresh UUID under `id_key` in their metadata.
for chunk in splits:
    unique_id = str(uuid.uuid4())
    summaries.append(
        Document(
            page_content=summarize_chain.run([chunk]),
            metadata={id_key: unique_id},
        )
    )
    chunk.metadata[id_key] = unique_id

In [58]:
# Report how many summaries and chunks exist (the loop that builds
# `summaries` appends one entry per chunk).
print(f"You have {len(summaries)} summaries to go along with {len(splits)} chunks")

You have 97 summaries to go along with 97 chunks


In [62]:
persist_directory_multi_vector = "db_storage_multi_vector"

# Index the chunk summaries in their own "summaries" collection.
db_multi_vector = Chroma.from_documents(
    documents=summaries,
    embedding=embedding_model,
    collection_name="summaries",
    persist_directory=persist_directory_multi_vector,
)
db_multi_vector.persist()

In [63]:
# Empty in-memory store; a later cell fills it with the original chunks.
docstore_multi_vector = InMemoryStore()

In [64]:
# Retriever wired to the summaries vector store and the in-memory docstore.
# `id_key` is the metadata field that summaries and chunks share.
retriever_multi_vector = MultiVectorRetriever(
    vectorstore=db_multi_vector,
    docstore=docstore_multi_vector,
    id_key=id_key
)

In [None]:
# retriever_multi_vector.vectorstore.add_documents(summaries)

In [75]:
# _similar_docs = retriever_multi_vector.vectorstore.similarity_search(
#     question
# )
# _similar_docs[0]

In [65]:
# Register every original chunk in the docstore under the id stored in its metadata.
retriever_multi_vector.docstore.mset(
    [(chunk.metadata[id_key], chunk) for chunk in splits]
)

In [67]:
# Retrieve through the multi-vector retriever and inspect the first hit.
docs_retrieved_multi_vector = retriever_multi_vector.get_relevant_documents(question)
first_hit = docs_retrieved_multi_vector[0]
print(first_hit.page_content)
print(first_hit.metadata)

Manufacturing. Robots have been increasingly used in manufacturing since the 1960s. According to the Robotic Industries Association US data, in 2016 the automotive industry was the main customer of industrial robots with 52% of total sales.[5] In the auto industry, they can amount for more than half of the "labor". There are even "lights off" factories such as an IBM keyboard manufacturing factory in Texas that was fully automated as early as 2003.[6]
Autonomous transport including airplane autopilot and self-driving cars
Domestic robots including robotic vacuum cleaners, robotic lawn mowers, dishwasher loading[7] and flatbread baking.[8]
Construction robots. Construction robots can be separated into three types: traditional robots, robotic arm, and robotic exoskeleton.[9]
Automated mining.
Space exploration, including Mars rovers.
Energy applications including cleanup of nuclear contaminated areas[a]; and cleaning solar panel arrays.
Medical robots and Robot-assisted surgery designed 

In [134]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


# Answering step for this section: stuff the retrieved documents into PROMPT.
docs_chain = create_stuff_documents_chain(ChatGroq(), PROMPT)
# Use the chain built in this cell rather than a same-purpose variable left
# over from an earlier cell, so the cell does not depend on hidden kernel state.
retrieval_chain_multi_vector = create_retrieval_chain(retriever_multi_vector, docs_chain)
# The chain fetches its own context through `retriever_multi_vector`; a
# "context" value passed here would be overwritten, so only the question goes in.
response = retrieval_chain_multi_vector.invoke({"input": question})

In [137]:
# Show the answer generated by the multi-vector retrieval chain.
response['answer']

"Robotics have been increasingly used in manufacturing since the 1960s, according to the provided context. The first example given of this is from 2003, with an IBM keyboard manufacturing factory in Texas that was fully automated. However, it's likely that the use of robotics in industry started before 2003 and has been increasing steadily since then. Additionally, the context states that in 2016, the automotive industry was the main customer of industrial robots with 52% of total sales, indicating that the use of robotics in industry was well established by that point."

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140083 sha256=eceb8d42069db1e9bf1ea23ae13ca9064efc0b3604748487752e60b2536f1e5b
  Stored in directory: /root/.cache/pip/wheels/4e/c1/68/f7bd0a8f514661f76b5cbe3b5f76e0033d79f1296012cbbf72
Success

In [6]:
!pip install pi_heif

Collecting pi_heif
  Downloading pi_heif-0.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading pi_heif-0.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (984 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m984.8/984.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pi_heif
Successfully installed pi_heif-0.18.0


In [8]:
!pip install unstructured_inference

Collecting unstructured_inference
  Downloading unstructured_inference-0.7.37-py3-none-any.whl.metadata (5.9 kB)
Collecting layoutparser (from unstructured_inference)
  Downloading layoutparser-0.3.4-py3-none-any.whl.metadata (7.7 kB)
Collecting python-multipart (from unstructured_inference)
  Downloading python_multipart-0.0.10-py3-none-any.whl.metadata (1.9 kB)
Collecting onnx (from unstructured_inference)
  Downloading onnx-1.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting timm (from unstructured_inference)
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting iopath (from layoutparser->unstructured_inference)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py

Collecting yolox
  Downloading yolox-0.3.0.tar.gz (79 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m71.7/80.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.0/80.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting loguru (from yolox)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting thop (from yolox)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting ninja (from yolox)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Collecting onnx==1.8.1 (from yolox)
  Downloading onnx-1.8.1.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m45.6 MB/s[0m eta [3

In [7]:
!sudo apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 2s (123 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package poppler-utils.
(Reading database ... 123605 

In [11]:
!pip install pytesseract
!sudo apt-get install tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 3s (1,753 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [None]:
!pip install onnx
!pip install pdf2image
!pip install pikepdf
!pip install pypdf
!pip install google-cloud-vision
!pip install effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
!pip install unstructured-inference==0.7.36
!pip install unstructured.pytesseract>=0.3.12

Collecting pikepdf
  Downloading pikepdf-9.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Downloading pikepdf-9.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pikepdf
Successfully installed pikepdf-9.2.1
Collecting google-cloud-vision
  Downloading google_cloud_vision-3.7.4-py2.py3-none-any.whl.metadata (5.2 kB)
Downloading google_cloud_vision-3.7.4-py2.py3-none-any.whl (467 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.5/467.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-vision
Successfully installed google-cloud-vision-3.7.4


Collecting effdet
  Downloading effdet-0.4.1-py3-none-any.whl.metadata (33 kB)
Collecting omegaconf>=2.0 (from effdet)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from omegaconf>=2.0->effdet)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading effdet-0.4.1-py3-none-any.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.5/112.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wh

In [1]:
!export OCR_AGENT=unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract


In [28]:
## PDF Text Splitters
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
filename = "pdf.pdf"

elements = partition_pdf(
    filename=filename,
    # Extraction settings.
    strategy="hi_res",
    infer_table_structure=True,
    # Chunking settings (character limits).
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)


In [35]:
!pip install langchain_core



In [58]:
!pip install pydantic



In [123]:
from typing import Union, List

from langchain import hub
from langchain.chains import create_extraction_chain_pydantic
from langchain_groq import ChatGroq
from pydantic import BaseModel, validator

# Prompt pulled from the LangChain hub, piped into the Groq chat model.
obj = hub.pull("wfh/proposal-indexing")
llm = ChatGroq()
runnable = obj | llm


# Extraction schema: a single field holding a list of sentence strings.
class Sentences(BaseModel):
    sentences: List[str]


extraction_chain = create_extraction_chain_pydantic(llm=llm, pydantic_schema=Sentences)

<ipython-input-123-595291215feb>:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  @validator('sentences', pre=True)


In [101]:
def get_propositions(text):
    """Run ``text`` through ``runnable`` and extract propositions from the reply.

    The content of the ``runnable`` reply is printed for debugging and then
    passed to ``extraction_chain.run``, whose result is returned.
    """
    llm_reply = runnable.invoke({'input': text}).content
    print("Runnable Output:", llm_reply)  # Debugging output
    return extraction_chain.run(llm_reply)


In [144]:
# Print the sentence lists extracted for each PDF element.
for element in elements:
    extracted = get_propositions(element.text)
    props = [item.sentences for item in extracted]
    print(props)

Runnable Output: [
"Sales prediction can be based on product titles and images",
"Deep learning approaches can be used for sales prediction",
"Product titles are a factor in sales prediction",
"Product images are a factor in sales prediction",
"Deep learning can be applied to product titles",
"Deep learning can be applied to product images",
"Deep learning is a type of machine learning",
"Machine learning can be used for sales prediction",
"Sales prediction is a use case for deep learning",
"Sales prediction is a use case for machine learning"
]
[['Sales prediction can be based on product titles and images', 'Product titles are a factor in sales prediction', 'Product images are a factor in sales prediction', 'Deep learning can be applied to product titles', 'Deep learning can be applied to product images', 'Deep learning is a type of machine learning', 'Machine learning can be used for sales prediction']]
Runnable Output: [
"Haishan Gao is an individual.",
"Zhaogiang Bai is an individu

KeyboardInterrupt: 

In [152]:
## Multivector with Proposition based retrieval

from pydantic import BaseModel, validator
from langchain.chains import create_extraction_chain_pydantic
filename = "pdf.pdf"
elements = partition_pdf(
    filename=filename,
    # Extraction settings.
    strategy="hi_res",
    infer_table_structure=True,
    # Chunking settings (character limits).
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)

# Language model, hub prompt piped into it, and the extraction chain.
llm = ChatGroq()
obj = hub.pull("wfh/proposal-indexing")
runnable = obj | llm


# Extraction schema: a single field holding a list of sentence strings.
class Sentences(BaseModel):
    sentences: List[str]


extraction_chain = create_extraction_chain_pydantic(llm=llm, pydantic_schema=Sentences)
def get_propositions(text):
    """Run ``text`` through ``runnable`` and extract propositions from the reply.

    The content of the ``runnable`` reply is passed to
    ``extraction_chain.run``, whose result is returned.
    """
    llm_reply = runnable.invoke({'input': text}).content
    return extraction_chain.run(llm_reply)




In [154]:
from langchain.schema import Document

id_key = "doc_key"
docstore_elements = []
vectorstore_propositions = []

for element in elements:
    element_text = element.text
    # Sentence lists extracted from this element's text.
    extraction = get_propositions(element_text)
    sentence_lists = [item.sentences for item in extraction]

    # One id shared by the element and all of its propositions.
    unique_id = str(uuid.uuid4())

    # The element itself goes to the document-store list ...
    docstore_elements.append(
        Document(page_content=element_text, metadata={id_key: unique_id})
    )

    # ... and every extracted sentence goes to the vector-store list.
    vectorstore_propositions.extend(
        Document(page_content=sentence, metadata={id_key: unique_id})
        for sentences in sentence_lists
        for sentence in sentences
    )


KeyboardInterrupt: 

In [159]:
from langchain.vectorstores import Chroma

persist_directory_multi_vector = "db_storage_multi_vector"

# Index the proposition documents in their own "propositions" collection.
db_multi_vector = Chroma.from_documents(
    documents=vectorstore_propositions,
    embedding=embedding_model,
    collection_name="propositions",
    persist_directory=persist_directory_multi_vector,
)
db_multi_vector.persist()

# Create and persist the document store
# docstore_multi_vector = Chroma.from_documents(docstore_elements, embedding_model, collection_name="elements", persist_directory="db_storage_docstore")
# docstore_multi_vector.persist()


NameError: name 'InMemoryStore' is not defined

In [163]:
from langchain.retrievers import MultiVectorRetriever
from langchain.storage import InMemoryStore
# Fresh in-memory store for the PDF elements.
docstore_multi_vector = InMemoryStore()

# Retriever wired to `db_multi_vector` and that store, sharing `id_key`.
retriever_multi_vector = MultiVectorRetriever(
    id_key=id_key,
    docstore=docstore_multi_vector,
    vectorstore=db_multi_vector,
)

# Register every PDF element under the id stored in its metadata.
element_pairs = [(doc.metadata[id_key], doc) for doc in docstore_elements]
retriever_multi_vector.docstore.mset(element_pairs)


In [164]:
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
# Compressor built from `llm`.
compressor = LLMChainExtractor.from_llm(llm)
# Retriever that combines the multi-vector retriever (as base retriever) with
# that compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever = retriever_multi_vector
)

In [165]:
# Question for the PDF-based retrievers; rebinds the `question` variable set
# earlier in the notebook.
question="What is the pdf about"

In [169]:
from langchain.chains.combine_documents import create_stuff_documents_chain


# Answering chain: stuffs documents into PROMPT for a Groq chat model.
docs_chain = create_stuff_documents_chain(ChatGroq() , PROMPT)
# Documents from the base (uncompressed) retriever, for comparison with the
# compressed retrieval in the next cell.
docs_retrieved_multi_vector = compression_retriever.base_retriever.get_relevant_documents(question)
# compressor.compress_documents(documents=docs_retrieved_multi_vector, query=question)

# retrieval_chain_multi_vector = create_retrieval_chain(retriever_multi_vector, document_chain)
# retriever_multi_vector.invoke(question)
# response = retrieval_chain_multi_vector.invoke({"context": docs_retrieved_multi_vector, "input": question})

  docs_retrieved_multi_vector = compression_retriever.base_retriever.get_relevant_documents(question)


In [170]:
# Same question through the compression retriever (base retriever plus compressor).
compressed_docs = compression_retriever.get_relevant_documents(question)


In [171]:
from langchain.prompts import PromptTemplate
# Template for the answering step; `{context}` and `{input}` are the two
# placeholders declared as input variables below.
prompt_template = """
Use the following context to answer the question at the end. If you don't know the answer, just say that you don't know, dont try to make up an answer.
<context>
{context}
</context>
Question: {input}
"""

PROMPT = PromptTemplate(
    input_variables=["context", "input"],
    template=prompt_template,
)

In [174]:
from langchain.chains.retrieval import create_retrieval_chain

# Drive the chain with the compression retriever so the answer is built from
# the compressed documents, as the chain's name says.
retrieval_chain_compressed = create_retrieval_chain(compression_retriever, docs_chain)
# The chain fills "context" from its retriever, so only the question is passed.
response = retrieval_chain_compressed.invoke({"input": question})

In [175]:
# Show the generated answer for the PDF question.
response['answer']

'The PDF appears to be about a analysis or study on sales prediction using deep learning approaches. The feature importance listed in the context suggests that various product attributes, such as countries shipped to, badges count, discount ratio, use of ad boosts, price, retail price, shipping options, product color, and inventory total, were considered in the predictive model. However, without the full document, this is a general interpretation based on the provided context.'