In [5]:
!pip install langchain -q

In [6]:
!pip install sentence_transformers -q

In [7]:
!pip install openai -q

In [8]:
!pip install unstructured -q

In [9]:
!pip install -q unstructured[local-inference] -q

In [39]:
!pip install pinecone-client -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import os
# Show the kernel's working directory; the relative '../data' path used by the loader depends on it.
# NOTE(review): the saved output exposes an absolute local path - consider clearing it before sharing.
os.getcwd()

'/home/kousik/ABC_OTHERS/BERRRTTTTTT/chatbot/notebook'

In [16]:
from langchain.document_loaders import DirectoryLoader
# Read the contents of ../data into LangChain documents.
loader = DirectoryLoader(path="../data")
documents = loader.load()

In [17]:
# Number of documents returned by the directory loader.
len(documents)

1

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Break the loaded documents into chunks (chunk_size=500, chunk_overlap=50)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
docs = text_splitter.split_documents(documents)


In [19]:
# Number of chunks produced by the splitter.
len(docs)

243

### Getting Embeddings of documents
Model card: [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

In [47]:
from langchain.embeddings import HuggingFaceEmbeddings
# Embedding model handed to the Pinecone vector store below when the chunks are indexed.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Equivalent to SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [48]:
# Smoke test: embed one sample sentence with the query-embedding entry point.
text = "This is a test document."
query_result = embeddings.embed_query(text)
# The vector itself is not displayed; only its length is inspected in the next cell.

In [49]:
# Dimensionality of the query embedding (384 in the saved output).
len(query_result)

384

In [50]:
# Smoke test for the batch entry point: embed two sentences in one call.
doc_result = embeddings.embed_documents([text, "This is not a test document."])
# The vectors themselves are not displayed; only their sizes are inspected in the next cells.

In [51]:
# One embedding per input sentence is expected (2 in the saved output).
len(doc_result)

2

In [52]:
# Dimensionality of a single document embedding (384 in the saved output).
len(doc_result[0])

384

### Upserting the document chunks into Pinecone


In [54]:
import pinecone 
from langchain.vectorstores import Pinecone

# Credentials come from the environment so that no secret is stored in the notebook.
# Set PINECONE_API_KEY (and optionally PINECONE_ENVIRONMENT) before starting the kernel.
# NOTE(review): the key that was previously committed here should be revoked.
import os

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free"),
)

index_name = "indian-budget-chatbot"
# Embed the chunks in `docs` with `embeddings` and store them in the named index.
# NOTE(review): re-running this cell presumably inserts the same chunks again - confirm before re-executing.
indexes = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [56]:
# Sanity check: fetch the two stored chunks closest to a sample question.
query = "what is Indirect Taxes"
indexes.similarity_search(query,k=2)

[Document(page_content='24\n\n\n\nFiscal Management\n\nPART B\n\nIndirect Taxes\n\n27\n\nGreen Mobility \uf0b7 Electronics \uf0b7 Electrical \uf0b7 Chemicals and Petrochemicals \uf0b7 Marine products \uf0b7 \uf0b7 \uf0b7 Metals \uf0b7 Compounded Rubber \uf0b7 Cigarettes\n\nLab Grown Diamonds Precious Metals\n\nDirect Taxes\n\n30\n\nMSMEs and Professionals \uf0b7 Cooperation \uf0b7 Start-Ups \uf0b7 Appeals \uf0b7 Better targeting of tax concessions \uf0b7 Rationalisation \uf0b7 Others \uf0b7\n\nPersonal Income Tax\n\nAnnexures\n\n35\n\nAnnexure to Part B of the Budget Speech 2023-24', metadata={'source': '../data/budget_speech.pdf'}),
 Document(page_content='30\n\nDirect Taxes\n\n132. I now come to my direct tax proposals. These proposals aim to maintain continuity and stability of taxation, further simplify and rationalise various provisions to reduce the compliance burden, promote the entrepreneurial spirit and provide tax relief to citizens.', metadata={'source': '../data/budget_spee

In [57]:
# Helper around the vector store for fetching the chunks closest to a query.

def get_similiar_docs(query, k=5, score=False):
    """Return the `k` stored chunks most similar to `query`.

    Args:
        query: Text to search the vector store with.
        k: Number of results to request.
        score: When truthy, use `similarity_search_with_score` instead of
            `similarity_search` on the module-level `indexes` store.

    Returns:
        Whatever the selected search method of `indexes` returns.
    """
    search = indexes.similarity_search_with_score if score else indexes.similarity_search
    return search(query, k=k)

In [59]:
from langchain.chat_models import ChatOpenAI
# The OpenAI key is read from the OPENAI_API_KEY environment variable so that no secret
# is stored in the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
import os

llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

In [60]:
from langchain.chains.question_answering import load_qa_chain
# Question-answering chain built on `llm` with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

def get_answer(query):
    """Answer `query` with the QA chain, using retrieved chunks as input documents.

    The chunks come from `get_similiar_docs(query)` with its default arguments;
    the return value is whatever `chain.run` produces.
    """
    retrieved = get_similiar_docs(query)
    return chain.run(question=query, input_documents=retrieved)

# Try the retrieval + QA pipeline on a sample question.
query = "Indirect Taxes budget?"
get_answer(query)

'In the Budget for 2023-24, the Minister of Finance, Nirmala Sitharaman, proposed indirect tax measures that aim to promote exports, boost domestic manufacturing, enhance domestic value addition, and encourage green energy and mobility. However, specific details about the indirect tax budget are not provided in the given context.'

In [61]:
# Second sample question for the retrieval + QA pipeline.
query = "Income tax?"
get_answer(query)

'The context provided is about changes in the income tax regime in India. The government has proposed a new tax structure with reduced tax rates and increased tax exemption limits for individuals. The new regime has five income slabs and offers a standard deduction of `50,000 for salaried individuals. Additionally, the rebate limit has been increased to `7 lakh, meaning individuals with income up to `7 lakh will not have to pay any tax. This will provide relief to taxpayers, especially those in the middle-class.'

### Fetching matches from Pinecone

In [67]:
from sentence_transformers import SentenceTransformer
# Load the sentence-transformers model directly
# (presumably the same checkpoint as the LangChain `embeddings` object above).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

query = "What are the financial inclusion initiatives in India?"
# Encode the query and convert the result to a plain list in one step,
# ready to be passed to the Pinecone index.
embedding_vector = model.encode(query).tolist()

In [68]:
import pinecone      

# Credentials come from the environment so that no secret is stored in the notebook.
# Set PINECONE_API_KEY (and optionally PINECONE_ENVIRONMENT) before starting the kernel.
# NOTE(review): the key that was previously committed here should be revoked.
import os

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp-free'),
)
index = pinecone.Index('indian-budget-chatbot')

# Fetch the two nearest chunks together with their stored metadata
# (the saved output shows 'source' and 'text' keys).
# https://docs.pinecone.io/docs/python-client#indexquery
pinecone_resp = index.query(embedding_vector, top_k=2, include_metadata=True)
pinecone_resp

{'matches': [{'id': '9d586cb7-94d6-4c49-8719-fcc202f5efc0',
              'metadata': {'source': '../data/budget_speech.pdf',
                           'text': '3\n'
                                   '\n'
                                   '9. The economy has become a lot more '
                                   'formalised as reflected in the EPFO '
                                   'membership more than doubling to 27 crore, '
                                   'and 7,400 crore digital payments of ` 126 '
                                   'lakh crore through UPI in 2022.\n'
                                   '\n'
                                   '10. schemes, with universalisation of '
                                   'targeted benefits, has resulted in '
                                   'inclusive development. Some of the schemes '
                                   'are:\n'
                                   '\n'
                                   'The\n'
               

In [69]:
# Chunk text of the best match, as stored in the match metadata.
pinecone_resp['matches'][0]['metadata']['text']

'3\n\n9. The economy has become a lot more formalised as reflected in the EPFO membership more than doubling to 27 crore, and 7,400 crore digital payments of ` 126 lakh crore through UPI in 2022.\n\n10. schemes, with universalisation of targeted benefits, has resulted in inclusive development. Some of the schemes are:\n\nThe\n\nefficient\n\nimplementation\n\nof many\n\ni. 11.7 crore household toilets under Swachh Bharat Mission,\n\nii. 9.6 crore LPG connections under Ujjawala,'

In [70]:
# Join the two retrieved chunks with a newline so their text does not run together
# (the same separator find_match uses).
context = pinecone_resp['matches'][0]['metadata']['text'] + "\n" + pinecone_resp['matches'][1]['metadata']['text']

In [71]:
# Retrieval helper: embed a query and return the text of its closest chunks.

def find_match(input_query, top_k=2):
    """Return the text of the chunks closest to `input_query`, newline-separated.

    Args:
        input_query: Text to embed with the module-level `model`.
        top_k: Number of matches to request from the module-level `index`
            (defaults to 2, the value that used to be hard-coded).

    Returns:
        The 'text' metadata of every returned match joined with "\n".
        An empty string when the index returns no matches.
    """
    input_embed = model.encode(input_query).tolist()
    # `include_metadata` is the keyword the other index.query call in this notebook uses.
    result = index.query(input_embed, top_k=top_k, include_metadata=True)
    # Join whatever came back instead of indexing [0] and [1], which fails
    # when fewer than two matches are returned.
    return "\n".join(match['metadata']['text'] for match in result['matches'])

In [72]:
# Same sample question as in the direct Pinecone query above.
query = "What are the financial inclusion initiatives in India?"

In [74]:
# Retrieve the combined text of the closest chunks for the sample question and display it.
context = find_match(query)
context

'3\n\n9. The economy has become a lot more formalised as reflected in the EPFO membership more than doubling to 27 crore, and 7,400 crore digital payments of ` 126 lakh crore through UPI in 2022.\n\n10. schemes, with universalisation of targeted benefits, has resulted in inclusive development. Some of the schemes are:\n\nThe\n\nefficient\n\nimplementation\n\nof many\n\ni. 11.7 crore household toilets under Swachh Bharat Mission,\n\nii. 9.6 crore LPG connections under Ujjawala,\nGOVERNMENT OF INDIA\n\nBUDGET 2023-2024\n\nSPEECH\n\nOF NIRMALA SITHARAMAN MINISTER OF FINANCE\n\nFebruary 1, 2023\n\nCONTENTS\n\nPART-A\n\nPage No. 1 2 3 5\n\n\uf0b7 Achievements since 2014: Leaving no one behind \uf0b7 Vision for Amrit Kaal – an empowered and inclusive economy \uf0b7\n\nIntroduction\n\nPriorities of this Budget i. Inclusive Development ii. Reaching the Last Mile iii. iv. Unleashing the Potential v. Green Growth vi. Youth Power vii. Financial Sector\n\nInfrastructure and Investment\n\n24\n\n\n\

## Chat-interface with Conversation Memory

In [41]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder
)


# The OpenAI key is read from the OPENAI_API_KEY environment variable so that no secret
# is stored in the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
import os

llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

In [42]:
# System instruction: answer only from the supplied context, otherwise reply 'I don't know'.
system_msg_template = SystemMessagePromptTemplate.from_template(template="""Answer the question as truthfully as possible using the provided context, 
and if the answer is not contained within the text below, say 'I don't know'""")

# The user's turn is passed through unchanged via the {input} variable.
human_msg_template = HumanMessagePromptTemplate.from_template(template="{input}")

# Message order: system instruction, then the "history" placeholder, then the new user message.
prompt_template = ChatPromptTemplate.from_messages([system_msg_template, MessagesPlaceholder(variable_name="history"), human_msg_template])

In [43]:
# Alternative wiring kept for reference (presumably for a Streamlit app, where the memory
# object lives in the session state - confirm):
#conversation = ConversationChain(memory=st.session_state.buffer_memory, prompt=prompt_template, llm=llm, verbose=True)

# Windowed conversation memory with k=3; return_messages=True so the history can be
# injected through the MessagesPlaceholder of the prompt.
memory = ConversationBufferWindowMemory(k=3,return_messages=True)
# verbose=True makes the chain print the formatted prompt, as seen in the saved outputs.
conversation = ConversationChain(memory=memory, prompt=prompt_template, llm=llm, verbose=True)


### Chat-conversation without any context of our documents


In [73]:
# No retrieved document text is supplied here; only the bare question is sent.
conversation.predict(input="indian economy?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Answer the question as truthfully as possible using the provided context, 
and if the answer is not contained within the text below, say 'I don't know'
Human: Income tax?
AI: I don't know.
Human: Income tax regime?
AI: I don't know.
Human: TAX RELIEF UNDER NEW PERSONAL TAX REGIME?
AI: I don't have access to real-time information or specific details about income tax regimes or tax relief under a new personal tax regime. It would be best to consult with a tax professional or refer to the latest tax laws and regulations in your country for accurate and up-to-date information.
Human: indian economy?[0m

[1m> Finished chain.[0m


'The Indian economy is one of the largest in the world and has been experiencing both growth and challenges. Prior to the COVID-19 pandemic, India was among the fastest-growing major economies. However, the pandemic has had a significant impact on the Indian economy, resulting in a contraction in growth. The government has implemented various measures to revive the economy, including fiscal stimulus packages and structural reforms. The long-term growth potential of the Indian economy remains strong, driven by factors such as a young and growing workforce, a large consumer base, and ongoing economic reforms.'

### Chat-conversation with the context of our documents

```python
query = "What are the financial inclusion initiatives in India?"

context = find_match(query)

conversation.predict(input=f"Context:\n {context} \n\n Query:\n{query}")

```

In [45]:
query = "What are the financial inclusion initiatives in India?"
# Retrieve the closest chunks from Pinecone for this question.
context = find_match(query)

# Send the retrieved text and the question together as the human message.
conversation.predict(input=f"Context:\n {context} \n\n Query:\n{query}")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Answer the question as truthfully as possible using the provided context, 
and if the answer is not contained within the text below, say 'I don't know'
Human: Context:
 3

9. The economy has become a lot more formalised as reflected in the EPFO membership more than doubling to 27 crore, and 7,400 crore digital payments of ` 126 lakh crore through UPI in 2022.

10. schemes, with universalisation of targeted benefits, has resulted in inclusive development. Some of the schemes are:

The

efficient

implementation

of many

i. 11.7 crore household toilets under Swachh Bharat Mission,

ii. 9.6 crore LPG connections under Ujjawala,GOVERNMENT OF INDIA

BUDGET 2023-2024

SPEECH

OF NIRMALA SITHARAMAN MINISTER OF FINANCE

February 1, 2023

CONTENTS

PART-A

Page No. 1 2 3 5

 Achievements since 2014: Leaving no one behind  Vision for Amrit Kaal – an empowered and inclusive economy 

Introduction

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=60).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..



[1m> Finished chain.[0m


'Based on the provided context, the financial inclusion initiatives in India include the formalization of the economy reflected in the EPFO membership doubling to 27 crore and 7,400 crore digital payments of ₹126 lakh crore through UPI in 2022. Additionally, there is mention of schemes with universalization of targeted benefits, such as the provision of 11.7 crore household toilets under the Swachh Bharat Mission and 9.6 crore LPG connections under Ujjawala.'

### Create a query refiner

In [3]:
import openai
# The OpenAI key is read from the OPENAI_API_KEY environment variable so that no secret
# is stored in the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
import os

openai.api_key = os.environ["OPENAI_API_KEY"]

# Transcript of earlier chat turns, given to the model as context for the follow-up query.
conversation_log = """Human: Income tax?
AI: I don't know.
Human: Income tax regime?
AI: I don't know.
Human: TAX RELIEF UNDER NEW PERSONAL TAX REGIME?
AI: I don't have access to real-time information or specific details about income tax regimes or tax relief under a new personal tax regime. It would be best to consult with a tax professional or refer to the latest tax laws and regulations in your country for accurate and up-to-date information.
Human: indian economy?
"""

query = "financial inclusion?"

# Ask the completion model to rewrite the terse query as a question for the knowledge base.
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation_log}\n\nQuery: {query}\n\nRefined Query:",
    temperature=0.7,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)

In [9]:
# Text of the first completion choice, i.e. the refined query (note the leading space in the saved output).
response["choices"][0]['text']

' What are the financial inclusion initiatives in India?'

In [None]:
# Reusable version of the query-refinement experiment above.

def query_refiner(conversation_log, query):
    """Rewrite `query` as a knowledge-base question, using the chat log as context.

    Args:
        conversation_log: Text of the previous conversation turns.
        query: The user's latest (possibly terse) query.

    Returns:
        The text of the first completion choice with surrounding whitespace removed.
    """
    # NOTE(review): text-davinci-003 and openai.Completion are legacy - confirm they are
    # still available for the SDK version and account in use.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation_log}\n\nQuery: {query}\n\nRefined Query:",
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    # The completion continues right after "Refined Query:", so it starts with a space
    # (and may end with a newline); strip it so callers get a clean query string.
    return response['choices'][0]['text'].strip()