In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model shared by the rest of the notebook; it is handed to the
# vector store so that documents and queries are embedded the same way.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-l6-v2",
)


  from tqdm.autonotebook import tqdm, trange


In [2]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
import cassio

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()

# Astra DB connection settings, read from the process environment after
# load_dotenv() has run. Either value is None when its variable is not set.
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")


In [7]:
from langchain_astradb import AstraDBVectorStore
# Vector store backed by the "astra_vector_langchain" collection. It is built
# with the embedding model and the Astra DB credentials defined earlier.
vector_store = AstraDBVectorStore(
    embedding=embeddings,
    collection_name="astra_vector_langchain",
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
)

In [8]:
from uuid import uuid4

from langchain_core.documents import Document

def _sample_document(text, source):
    """Build one sample Document whose metadata records where the text came from."""
    return Document(page_content=text, metadata={"source": source})


# Ten small sample documents drawn from three kinds of source
# ("tweet", "news", "website"), used to exercise the vector store.
document_1 = _sample_document(
    "I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    "tweet",
)
document_2 = _sample_document(
    "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    "news",
)
document_3 = _sample_document(
    "Building an exciting new project with LangChain - come check it out!",
    "tweet",
)
document_4 = _sample_document(
    "Robbers broke into the city bank and stole $1 million in cash.",
    "news",
)
document_5 = _sample_document(
    "Wow! That was an amazing movie. I can't wait to see it again.",
    "tweet",
)
document_6 = _sample_document(
    "Is the new iPhone worth the price? Read this review to find out.",
    "website",
)
document_7 = _sample_document(
    "The top 10 soccer players in the world right now.",
    "website",
)
document_8 = _sample_document(
    "LangGraph is the best framework for building stateful, agentic applications!",
    "tweet",
)
document_9 = _sample_document(
    "The stock market is down 500 points today due to fears of a recession.",
    "news",
)
document_10 = _sample_document(
    "I have a bad feeling I am going to get deleted :(",
    "tweet",
)


In [9]:
# Collect the ten sample documents in insertion order.
documents = [
    document_1, document_2, document_3, document_4, document_5,
    document_6, document_7, document_8, document_9, document_10,
]

# One freshly generated random UUID string per document.
# NOTE(review): because the ids are random, re-running this cell presumably
# stores the same documents again under new ids - confirm before re-running.
uuids = [str(uuid4()) for _ in range(len(documents))]

# Kept as the cell's last expression so its return value is displayed.
vector_store.add_documents(documents, ids=uuids)

['88135d2c-46cf-46c9-ab5e-b5569e2deff2',
 'fbb6fa27-6c58-407f-ad5f-297a4d96886e',
 '1448880e-f72b-4e94-ad2c-7437fc01b2cd',
 '6480f765-ff5f-4073-a9d6-8eb07bc70d41',
 '7e1e6080-6af9-4e1d-8db6-30b16bc1610d',
 '64cc2da6-29d3-4402-b4d8-895c992090be',
 'ed172e8b-fded-4957-a41c-fd39c954ef1f',
 'ae68a51b-eb44-4b1d-8d45-8816a02ef936',
 '8aaabbfe-a2bc-4c49-8fb4-b1873e5440b0',
 '1c692843-6cad-476b-bdab-4776bdaf5dd0']

In [12]:
# Ask for the two closest matches, restricted to documents whose metadata
# has source == "tweet".
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)

# One line per hit: the text followed by the source it came from.
for res in results:
    print(f"*{res.page_content} - [Source: {res.metadata['source']}]")

*Building an exciting new project with LangChain - come check it out! - [Source: tweet]
*LangGraph is the best framework for building stateful, agentic applications! - [Source: tweet]


In [14]:
from PyPDF2 import PdfReader
# PDF to ingest; the relative path is resolved against the notebook's
# working directory.
PDF_PATH = "budget_speech.pdf"
pdf_reader = PdfReader(PDF_PATH)

In [22]:
# Concatenate the extracted text of every page, in page order, with no
# separator inserted between pages.
raw_text = "".join(pdf_page.extract_text() for pdf_page in pdf_reader.pages)
    

In [23]:
# Sanity check: total number of characters extracted from the PDF.
len(raw_text)

96110

In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chunking parameters: target chunk size and the overlap carried over between
# consecutive chunks. NOTE(review): presumably measured in characters (the
# splitter's default length function) - confirm.
splitter_settings = {"chunk_size": 4000, "chunk_overlap": 600}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Wrap each chunk of the extracted PDF text in a Document.
text_documents = text_splitter.create_documents(texts=[raw_text])

In [29]:
# One freshly generated random UUID string per PDF chunk.
uuids = [str(uuid4()) for _ in range(len(text_documents))]

# Kept as the cell's last expression so its return value is displayed.
vector_store.add_documents(text_documents, ids=uuids)

['fb819455-d9a0-4a59-bda7-2a9d0c2ee6ed',
 'a8b0b707-7bfa-447c-867b-c19129993c68',
 '5f7514a4-b546-4272-8e6d-4c4c6ab33f17',
 '638e64b4-6e99-4b44-819e-5e5934efb1a7',
 '4a574065-3911-4be7-bf6c-cd48f6d78697',
 '3ae37986-5d21-44c2-b19d-b71338271a6f',
 'dcc6203a-7094-4888-9943-39ec53c20e6d',
 '93411d23-8f89-4ba5-ab87-416357de1541',
 'bdd256e2-95c6-418e-9f8a-695750cbc68d',
 '0f9b6e0d-65cd-4486-8b74-8f3f60f53efd',
 '73cc9bd9-fc3a-4986-8e6e-29d8fab313a8',
 '4e56c700-cb91-4edc-af13-e775b944bb5a',
 '3543a4cc-1d96-4409-b583-3b24bd8ef270',
 '34e81d71-a294-49c3-b965-e66bbf6fbbf0',
 '499c42ed-2e1d-4842-a256-2fa060037aca',
 'a99a9992-1231-4adf-9795-e21b398309b6',
 'fd765e44-e5f7-47f1-a166-5c79e2bf913e',
 '684fa480-7e5a-4bc6-bec4-ad1a611e69f5',
 'e3283220-75f0-4904-8c2d-cdae3e835e3b',
 '7fe29bfa-aa9a-406d-a068-c807a8fb3049',
 '76e94397-ecd1-456a-a1d1-4dbc34a9150f',
 'a1d9ef23-d6d1-4f7b-a844-544f5c82821e',
 'd056df92-fcc5-4165-bb61-9ac35b641141',
 '2311fab4-ac54-435f-b818-4c6e09966f8b',
 '98237120-f96b-

In [33]:
# Question to answer from the ingested budget speech.
query = "How much budget is allocated to energy department?"

# Retrieve the best-matching chunks as (Document, score) pairs.
resp = vector_store.similarity_search_with_score(query)

# Display the raw result list.
resp

[(Document(page_content='70. Building on the success of PM SVANidhi Scheme in transforming the \nlives of street vendors, our Government envisions a scheme to support each \nyear, over the next five years, the development of 100 weekly ‘haats’ or street food hubs in select cities.  \nStamp Duty  \n71. We will encourage states which continue to charge high stamp duty to \nmoderate the rates for all, and also consider further lowering duties for properties purchased by women. This reform will be made an essential \ncomponent of urban deve lopment schemes.   14  \n \nPriority 6: Energy Security  \nEnergy Transition  \n72. In the interim budget, I had announced our strategy to sustain high \nand more resource -efficient economic growth, along with energy security in \nterms of availability, accessibility and affordab ility.  We will bring out a policy \ndocument on appropriate energy transition pathways that balances the \nimperatives of employment, growth and environmental sustainability.

In [39]:
# Print every retrieved chunk as "[score] text", with the score rounded to
# four decimals.
# There is deliberately no backslash before %s: "\%" is not a recognised
# escape sequence, so it would print a literal backslash in front of each
# chunk and trigger an invalid-escape warning on recent Python versions.
for doc, score in resp:
    print("[%0.4f] %s " % (score, doc.page_content))

[0.6774] \70. Building on the success of PM SVANidhi Scheme in transforming the 
lives of street vendors, our Government envisions a scheme to support each 
year, over the next five years, the development of 100 weekly ‘haats’ or street food hubs in select cities.  
Stamp Duty  
71. We will encourage states which continue to charge high stamp duty to 
moderate the rates for all, and also consider further lowering duties for properties purchased by women. This reform will be made an essential 
component of urban deve lopment schemes.   14  
 
Priority 6: Energy Security  
Energy Transition  
72. In the interim budget, I had announced our strategy to sustain high 
and more resource -efficient economic growth, along with energy security in 
terms of availability, accessibility and affordab ility.  We will bring out a policy 
document on appropriate energy transition pathways that balances the 
imperatives of employment, growth and environmental sustainability.   
PM Surya Ghar Muft Bijli 