In [16]:
from langchain_community.document_loaders import PyPDFDirectoryLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
load_dotenv()


True

In [8]:
file = PyPDFDirectoryLoader(
    path="./data",
    glob="*.pdf"
)
document = file.load()

In [13]:
chunks = RecursiveCharacterTextSplitter(chunk_size=100,chunk_overlap=50).split_documents(document)


In [None]:
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

In [21]:
vector_store = FAISS.from_documents(embedding=embedding_model , documents=chunks)
vector_store.save_local(folder_path="./vector_store")

In [23]:
vector_store = FAISS.load_local(embeddings=embedding_model,folder_path="./vector_store",allow_dangerous_deserialization=True)

In [None]:
vector_store.similarity_search("gat test date")

[Document(id='7bb002e3-0f6f-4c3f-8aca-9ce26a050992', metadata={'producer': 'Skia/PDF m135', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'creationdate': '2025-05-06T08:08:24+00:00', 'title': 'NTS - Candidate (Portal)', 'moddate': '2025-05-06T08:08:24+00:00', 'source': 'data\\NTS - Candidate (Portal).pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Post  Name :\xa0 \xa0 GAT  - General\nPaper  Type :\xa0 \xa0 GAT - A\nTest  Date :\xa0 \xa0 Saturday  10th  May,  2025'),
 Document(id='a986b287-fc11-493b-b32e-6de59fa03846', metadata={'producer': 'Skia/PDF m135', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'creationdate': '2025-05-06T08:08:24+00:00', 'title': 'NTS - Candidate (Portal)', 'moddate': '2025-05-06T08:08:24+00:00', 'source': 'data\\NTS - Candidate (Portal).pdf', 'total_pages': 1, 'page': 0, 'page_labe

In [26]:
retriever = vector_store.as_retriever(serch_type="mmr",search_kwargs={"k":3})

In [29]:
def clean(document):
    return "\n\n".join(doc.page_content for doc in document)

from IPython.display import display,Markdown
display(Markdown(clean(retriever.invoke("gat test date"))))

Post  Name :    GAT  - General
Paper  Type :    GAT - A
Test  Date :    Saturday  10th  May,  2025

Test  Date :    Saturday  10th  May,  2025
Reporting  Time :    2 : 00  PM

Graduate Assessment Test
(GAT General 2025-III)
(Roll Number Slip)

In [32]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
    template="""anser the question from the given text \n
    {context}

    {query}
    """,
    input_variables=["context","query"]

)
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [36]:
from langchain_core.runnables import RunnablePassthrough,RunnableLambda,RunnableParallel

chain = RunnableParallel({
    "context":retriever | RunnableLambda(clean),
    "query":RunnablePassthrough()
    
})

f = chain | prompt | model | parser
f.invoke("gat test date")

'The GAT test date is Saturday, 10th May, 2025.'

In [37]:
from langchain.chains.query_constructor.base import StructuredQueryOutputParser,get_query_constructor_prompt

In [None]:

p = get_query_constructor_prompt(
    
)

<function langchain.chains.query_constructor.base.get_query_constructor_prompt(document_contents: 'str', attribute_info: 'Sequence[Union[AttributeInfo, dict]]', *, examples: 'Optional[Sequence]' = None, allowed_comparators: 'Sequence[Comparator]' = (<Comparator.EQ: 'eq'>, <Comparator.NE: 'ne'>, <Comparator.GT: 'gt'>, <Comparator.GTE: 'gte'>, <Comparator.LT: 'lt'>, <Comparator.LTE: 'lte'>, <Comparator.CONTAIN: 'contain'>, <Comparator.LIKE: 'like'>, <Comparator.IN: 'in'>, <Comparator.NIN: 'nin'>), allowed_operators: 'Sequence[Operator]' = (<Operator.AND: 'and'>, <Operator.OR: 'or'>, <Operator.NOT: 'not'>), enable_limit: 'bool' = False, schema_prompt: 'Optional[BasePromptTemplate]' = None, **kwargs: 'Any') -> 'BasePromptTemplate'>

In [None]:
from langchain_core.documents import Document

docs = [
    Document(
        page_content="A group of scientists bring back dinosaurs, leading to unexpected chaos.",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"}
    ),
    Document(
        page_content="A man navigates dreams within dreams to plant an idea into a target's subconscious.",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2}
    ),
    Document(
        page_content="A psychologist delves into a series of dreams, blurring the lines between reality and imagination.",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6}
    ),
    Document(
        page_content="The lives of four sisters unfold in 19th-century America, highlighting their personal growth and challenges.",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3}
    ),
    Document(
        page_content="Toys come to life and embark on adventures when humans aren't around.",
        metadata={"year": 1995, "genre": "animated"}
    ),
    Document(
        page_content="Three men venture into a mysterious zone, facing existential questions and challenges.",
        metadata={"year": 1979, "director": "Andrei Tarkovsky", "genre": "thriller", "rating": 9.9}
    ),
]



from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(docs, embedding_model)




from langchain.chains.query_constructor.schema import AttributeInfo

metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie, e.g., science fiction, comedy, drama, thriller, romance, action, animated.",
        type="string"
    ),
    AttributeInfo(
        name="year",
        description="The year the movie was released.",
        type="integer"
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director.",
        type="string"
    ),
    AttributeInfo(
        name="rating",
        description="A rating for the movie on a scale from 1 to 10.",
        type="float"
    ),
]



from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0)

document_content_description = "Brief summary of a movie"

retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_content_description=document_content_description,
    metadata_field_info=metadata_field_info,
    enable_limit=True  # Allows specifying the number of documents to retrieve
)


results = retriever.invoke("I want to watch a movie rated higher than 8.5")