**Installing the required packages and libraries**

In [None]:
!pip install langchain
!pip install Pinecone
!pip install cohere
!pip install langchain-community
!pip install langchain_pinecone
!pip install pypdf
!pip install TextLoader
!pip install csvloader
!pip install docx2txt

Collecting pypdf
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-5.0.1-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.0.1
Collecting TextLoader
  Downloading TextLoader-0.0.9-py3-none-any.whl.metadata (504 bytes)
Downloading TextLoader-0.0.9-py3-none-any.whl (12 kB)
Installing collected packages: TextLoader
Successfully installed TextLoader-0.0.9
[31mERROR: Could not find a version that satisfies the requirement Docx2txtLoader (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for Docx2txtLoader[0m[31m
[0m

**Importing the Packages**

In [None]:
import os
from dotenv import load_dotenv
from langchain.llms import Cohere
from langchain.document_loaders import PyPDFDirectoryLoader, Docx2txtLoader, CSVLoader, TextLoader
from langchain.embeddings import CohereEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec

**Loading the API keys into the working environment**

In [None]:
def resolve_secret(env_name, inline_value=''):
    """Return the secret for ``env_name`` and export it to ``os.environ``.

    A non-empty ``inline_value`` takes precedence. When it is empty, the value
    already present in the environment (if any) is kept, so a blank placeholder
    in the notebook never wipes out a key that was exported before the kernel
    started.

    Args:
        env_name: Name of the environment variable holding the secret.
        inline_value: Optional value typed directly in the notebook.

    Returns:
        The resolved secret, or an empty string when none is configured.
    """
    value = inline_value or os.environ.get(env_name, '')
    if not value:
        print(f"Warning: {env_name} is not set; calls that need it will fail.")
    os.environ[env_name] = value
    return value


# Prefer exporting the keys before starting the kernel; avoid committing real
# keys in the notebook itself.
cohere_api_key = resolve_secret("COHERE_API_KEY")
pinecone_api_key = resolve_secret("PINECONE_API_KEY")
pinecone_api_env = resolve_secret("PINECONE_API_ENV")

**Class for loading the documents**

In [None]:
class DocumentLoader:
    """Class to load documents from a specified directory."""

    def __init__(self, directory):
        """Remember the directory whose files will be loaded.

        Args:
            directory: Path of the directory to scan for documents.
        """
        self.directory = directory

    def load_documents(self):
        """Load every supported file found directly inside the directory.

        Files ending in ``.txt``, ``.csv`` and ``.docx`` are loaded one by one
        with their matching loader. PDFs are loaded through
        ``PyPDFDirectoryLoader``, which reads the whole directory by itself, so
        it is invoked only once no matter how many ``.pdf`` files are listed;
        calling it per file would add every PDF once for each PDF present.

        Returns:
            A list with the documents produced by the loaders, or an empty
            list when the directory does not exist.
        """
        # Ensure the directory exists
        if not os.path.isdir(self.directory):
            print(f"Error: Directory '{self.directory}' does not exist.")
            return []

        documents = []
        pdfs_loaded = False

        # Sorted so the order of the returned documents is reproducible.
        for file_name in sorted(os.listdir(self.directory)):
            file_path = os.path.join(self.directory, file_name)

            if file_name.endswith(".pdf"):
                # The directory loader already covers all PDFs; run it once.
                if not pdfs_loaded:
                    loader = PyPDFDirectoryLoader(self.directory)
                    documents.extend(loader.load())
                    pdfs_loaded = True
            elif file_name.endswith(".txt"):
                loader = TextLoader(file_path)
                documents.extend(loader.load())
            elif file_name.endswith(".csv"):
                loader = CSVLoader(file_path)
                documents.extend(loader.load())
            elif file_name.endswith(".docx"):
                loader = Docx2txtLoader(file_path)
                documents.extend(loader.load())
            else:
                print(f"Unsupported file type: {file_name} (skipped)")

        return documents

**Class for creating and storing embeddings in the vector database**

In [None]:
class EmbeddingManager:
    """Builds a Cohere embedding client and uses it to fill a Pinecone vector store."""

    def __init__(self, api_key):
        """Store the Cohere key and create the embedding client.

        Args:
            api_key: Cohere API key handed to ``CohereEmbeddings``.
        """
        self.api_key = api_key
        embedding_options = {
            "cohere_api_key": self.api_key,
            "model": 'embed-english-v3.0',
            "user_agent": 'langchain',
        }
        self.embeddings = CohereEmbeddings(**embedding_options)

    def embed_documents(self, splitted_text, pinecone_index_name, pinecone_api_key):
        """Pass the split documents to ``PineconeVectorStore.from_documents``.

        Args:
            splitted_text: Documents (already split into chunks) to store.
            pinecone_index_name: Name of the Pinecone index to use.
            pinecone_api_key: Pinecone API key forwarded to the vector store.

        Returns:
            Whatever ``PineconeVectorStore.from_documents`` returns.
        """
        store_options = dict(
            embedding=self.embeddings,
            pinecone_api_key=pinecone_api_key,
            index_name=pinecone_index_name,
        )
        vector_store = PineconeVectorStore.from_documents(splitted_text, **store_options)
        return vector_store

**Class for managing all the Pinecone-related tasks**

In [None]:
class PineconeManager:
    """Class to manage Pinecone operations."""

    def __init__(self, api_key, api_env):
        """Create the Pinecone client.

        Args:
            api_key: Pinecone API key. It is passed through the client's
                ``api_key`` parameter so the explicit value is actually used
                instead of relying on the PINECONE_API_KEY environment
                variable.
            api_env: Value later used as the ``region`` of the serverless
                spec when an index is created.
        """
        self.client = Pinecone(api_key=api_key)
        self.api_env = api_env

    def create_index(self, index_name, dimension=1024, metric='cosine'):
        """Create a serverless index unless one with that name already exists.

        Args:
            index_name: Name of the index to create.
            dimension: Vector dimension of the index.
            metric: Similarity metric of the index.
        """
        index_list = self.client.list_indexes().names()
        if index_name not in index_list:
            self.client.create_index(
                dimension=dimension,
                name=index_name,
                metric=metric,
                spec=ServerlessSpec(cloud='aws', region=self.api_env)
            )
        else:
            print(f"Index '{index_name}' already exists")

**Query Handling Class**

In [None]:
class QAHandler:
    """Class to manage multiple QA objects using the same vector store."""

    def __init__(self, vector_store, llm):
        """Keep the vector store and LLM shared by every query.

        Args:
            vector_store: Vector store used to build the retriever.
            llm: Language model handed to the RetrievalQA chain.
        """
        self.vector_store = vector_store
        self.llm = llm

    def create_qa(self, query_text, k=3):
        """Answer ``query_text`` with a RetrievalQA chain over the vector store.

        Args:
            query_text: Question to answer.
            k: Number of results requested from the retriever.

        Returns:
            The value returned by the chain's ``run`` call.
        """
        # The keyword must be spelled `search_kwargs`; with any other name the
        # requested `k` never reaches the retriever's search.
        retriever = self.vector_store.as_retriever(search_kwargs={"k": k})
        qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type='stuff', retriever=retriever)
        response = qa.run(query_text)
        return response

**Loading documents into Colab from the local machine**

In [None]:
!unzip "./pdfs.zip"

Archive:  ./pdfs.zip
   creating: pdfs/
  inflating: pdfs/7181-attention-is-all-you-need.pdf  
  inflating: pdfs/Resume.pdf         


**Main function**

In [None]:
def main(directory="/content/pdfs", pinecone_index_name='test', queries=None):
    """Index the documents of a directory in Pinecone and answer questions on them.

    Steps: read the API settings from the environment, make sure the Pinecone
    index exists, load and split the documents, embed and store them, then run
    every query through the QA handler and print the answers.

    Args:
        directory: Directory containing the documents to index.
        pinecone_index_name: Name of the Pinecone index to create/use.
        queries: Questions to ask. When None, the four example questions of
            this notebook are used.
    """
    if queries is None:
        queries = [
            "Who is Gourav Joshi?",
            "Is Gourav Joshi interested in football ?",
            "what is attention?",
            "what is the architecture of an encoder?",
        ]

    # Initialize clients
    api_key = os.environ.get('COHERE_API_KEY')
    pinecone_api_key = os.environ.get('PINECONE_API_KEY')
    pinecone_api_env = os.environ.get('PINECONE_API_ENV')

    cohere_llm = Cohere(cohere_api_key=api_key)
    pinecone_manager = PineconeManager(pinecone_api_key, pinecone_api_env)

    # Create Pinecone index
    pinecone_manager.create_index(pinecone_index_name)

    # Load documents
    document_loader = DocumentLoader(directory)
    documents = document_loader.load_documents()
    if not documents:
        # Nothing to embed or retrieve from; stop before calling the services.
        print(f"No documents were loaded from '{directory}'; nothing to index or query.")
        return

    # Split text
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    splitted_text = splitter.split_documents(documents)

    # Embed and store documents
    embedding_manager = EmbeddingManager(api_key)
    vector_store = embedding_manager.embed_documents(splitted_text, pinecone_index_name, pinecone_api_key)

    # Perform retrieval: answer all queries first, then print them in order
    qa_handler = QAHandler(vector_store, cohere_llm)
    responses = [qa_handler.create_qa(query_text=query) for query in queries]

    for number, response in enumerate(responses, start=1):
        print(f"Response to Query {number}: {response}")


if __name__ == "__main__":
    main()

Index 'test' already exists
Response to Query 1:  Unfortunately, the provided text does not contain enough information about Gourav Joshi to describe him sufficiently. The text provided is a repeat listing of the same information, and does not give anything about who Gourav is as a person. 

If you like, you may provide me with more information about Gourav Joshi, and I can answer to the best of my ability. 
Response to Query 2:  Yes, Gourav Joshi is an avid football fan. He participates in both playing and watching football games, and he also learns tactics and strategies from professional games. His profile indicates a keen interest in football, and he specifically mentions watching and playing football as an activity that helps him maintain his mental and physical wellbeing. 

I hope this information is helpful to you! Please let me know if there's anything else I can assist you with. 
Response to Query 3:  Attention is a function that maps a query and a set of key-value pairs to an