In [1]:
import cohere
import langchain
import pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader  
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_cohere.embeddings import CohereEmbeddings  
from langchain_community.vectorstores import Pinecone as PineconeVectorstore  
from langchain_cohere.llms import Cohere  
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv
load_dotenv()
import os

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\Nagendra\AppData\Local\sagemaker\sagemaker\config.yaml


  from tqdm.autonotebook import tqdm


In [2]:
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')
COHERE_API_KEY = os.getenv('COHERE_API_KEY')

In [3]:
cohere_client = cohere.Client(COHERE_API_KEY)

In [4]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

doc = read_doc('data/')

In [5]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return doc

documents = chunk_data(docs=doc)

In [6]:
embeddings = CohereEmbeddings(model="embed-english-v2.0")

In [7]:
index_name = "qa-bot-cohere"  

In [8]:
## Initialize Pinecone and check if the index exists
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

# If index doesn't exist, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=4096, 
        metric='cosine', 
        spec=ServerlessSpec(cloud='aws', region='us-east-1')  # Adjust cloud and region as necessary
    )

In [9]:
index = pc.Index(index_name)

pinecone_index = PineconeVectorstore.from_documents(documents, embeddings, index_name=index_name)

In [10]:
def retrieve_query(query):
    matching_results = pinecone_index.similarity_search(query)
    return matching_results

In [11]:
llm = Cohere(temperature=0.1)  
chain = load_qa_chain(llm, chain_type="stuff")

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff")


In [14]:
def retrieve_answers(query,k=2):
    doc_search = retrieve_query(query)
    print(doc_search)
    
    response = chain.run(input_documents=doc_search, question=query)
    return response

In [15]:
# Example query
our_query = "What are the responsibilities mentioned in the document?"
answer = retrieve_answers(our_query)
print(answer)

[Document(metadata={'page': 81.0, 'source': 'data\\NASDAQ_MSFT_2023.pdf'}, page_content='that (1)  pertain to the maintenance of records that, in reasonable detail, accurately and fairly reflect the transactions and \ndispositions o f the assets of the company; (2)  provide reasonable assurance that transactions are recorded as necessary \nto permit preparation of financial statements in accordance with generally accepted accounting principles, and that receipts \nand expenditures of the c ompany are being made only in accordance with authorizations of management and directors of \nthe company; and (3)  provide reasonable assurance regarding prevention or timely detection of unauthorized acquisition, \nuse, or disposition of the company’s assets that could have a material effect on the financial statements.'), Document(metadata={'page': 81.0, 'source': 'data\\NASDAQ_MSFT_2023.pdf'}, page_content='that (1)  pertain to the maintenance of records that, in reasonable detail, accurately and

  response = chain.run(input_documents=doc_search, question=query)


 The responsibilities in the document pertain to maintaining detailed and accurate records of transactions and assets, 
ensuring transactions are recorded to prepare financial statements, and that all expenditures are done so with the proper 
authorizations from management. 
These measures are meant to provide reasonable assurance that any unauthorized acquisition, use, or disposition of the 
company's assets that could affect the financial statements can be detected in a timely manner. 
These measures are set forth to provide accountability and transparency within the company. 
If there is anything more specific you would like me to answer regarding the above responsibilities, I'd be happy to. 
