In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sentence_transformers
from pinecone import Pinecone as pc
from langchain.vectorstores import Pinecone
from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore
import sentence_transformers
from langchain_groq import ChatGroq
import time
import os




In [2]:
import nltk

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADITYA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
urls = ['https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html',
        'https://cleartax.in/s/budget-2024-highlights',
        'https://www.hindustantimes.com/budget',
        'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=md']
loader = UnstructuredURLLoader(urls=urls)
data=loader.load()

In [4]:
type(data)

list

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs=text_splitter.split_documents(data)

In [6]:
type(docs)

list

In [7]:
docs[7]

Document(metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html'}, page_content='This initiative, alongside the comprehensive development plans for Rajgir and Nalanda and the support for Odisha’s diverse attractions, reflects a visionary approach to tourism. Promoting these regions will not only enhance India\'s cultural heritage but also drive substantial economic growth. The expected increase in tourism will create numerous jobs and stimulate growth in the hospitality, infrastructure, technology, and local craft sectors.\n\nThis holistic strategy positions India as a leading global destination, highlighting its rich cultural legacy while promoting sustainable economic development."\n\n24 Jul 2024, 09:45:48 PM IST\n\nBudget 2024 Key Highlights Live Updates: Rohan Bhargava, Co-Founder of CashKaro, said')

In [8]:
embeddings=HuggingFaceEmbeddings()
vector=embeddings.embed_query('Hello world')
vector[:5]

  embeddings=HuggingFaceEmbeddings()
  embeddings=HuggingFaceEmbeddings()


[0.026249654591083527,
 0.013395577669143677,
 -0.004533144645392895,
 -0.021791456267237663,
 0.05455189198255539]

In [9]:
len(vector)

768

In [11]:
PINECONE_INDEX_NAME='url'
os.environ['PINECONE_API_KEY']=('pcsk_6yjLVo_3pw6MjXiZx4VEcg6UycLcaHQHQod4BaHCb7pZnJrjxZdzUmNAiuWF6pqu9bJnA7')
docsearch=Pinecone.from_existing_index(index_name=PINECONE_INDEX_NAME,embedding=embeddings)
print("Index successfully created")

Index successfully created


In [12]:
vectorstore_from_docs=PineconeVectorStore.from_documents(docs,index_name=PINECONE_INDEX_NAME,embedding=embeddings)

In [13]:
docsearch=PineconeVectorStore.from_existing_index(PINECONE_INDEX_NAME,embeddings)

In [14]:
query= "what is budget"
docs=docsearch.similarity_search(query,k=3)
print(docs)

[]


In [15]:
retriever=docsearch.as_retriever(search_type='similarity',search_kwargs={'k':10})
retrieved_docs=retriever.invoke("What is union budget?")
retrieved_docs

[]

In [16]:
GROQ_API="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"
llm=ChatGroq(model_name="llama-3.2-1b-preview",api_key=GROQ_API)

In [17]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt=(
    "You are an assistant for question-answering tasks Use the following pieces of retrieved context to answer The question. If You dont know the answer ,say that you dont know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)
prompt=ChatPromptTemplate.from_messages([("system",system_prompt),("human","{input}"),])

In [18]:
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain=create_retrieval_chain(retriever,question_answer_chain)

In [19]:
response=rag_chain.invoke({"input":"Whats the budget for current year? "})
print(response["answer"])

I don't have access to real-time financial information or the most current data. However, I can suggest some reputable sources where you can find the latest budget information for your country or organization, such as the official government website or the website of the relevant financial institution.


In [20]:
response=rag_chain.invoke({"input":"Whats the budget for year 2050? "})
print(response["answer"])

There is no definitive budget for year 2050 as it's still 30 years away. However, the United Nations and various international organizations have set targets and projections for the future growth of global economies and budgets. For instance, the UN's Sustainable Development Goals, including Goal 16 on International Cooperation in Development, aim to increase global development co-operation by 2020 and enhance its effectiveness by 2030.


In [25]:
"""pine=pc(api_key=os.environ.get("PINECONE_API_KEY"))
index_name='url'
if index_name not in pine.list_indexes().names():
    pine.create_index(
        name=index_name,
        dimension=embeddings.dimension,
        metric="cosine",
    )
    # Wait for index to be ready
    while not pine.describe_index(index_name).status['ready']:
        time.sleep(1)

# See that it is empty
print("Index before upsert:")
print(pine.Index(index_name).describe_index_stats())
print("\n")"""

Index before upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 689}},
 'total_vector_count': 689}




In [28]:
import pinecone
# Initialize Pinecone with your API key
pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"))  # No environment specified

# Access the index by name
index = pinecone.index("url")  # Replace with your actual index name

# Example: Delete all entries in the index
index.delete(delete_all=True)

AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )

