In [None]:
import os
import re
import numpy as np
!pip install faiss-cpu
from nltk.tokenize import word_tokenize
!pip install -U langchain_community
!pip install PyPDFLoader
!pip install pypdf

from langchain_community.chat_models import ChatHuggingFace

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.llms import HuggingFaceHub
# SECURITY: a literal Hugging Face token used to be assigned on this line.
# Secrets must never live in source or notebooks -- revoke the exposed token
# and provide a fresh one through the environment, or interactively below.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    # getpass keeps the token out of the notebook output and shell history.
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

class DocumentProcessor:
    """Load PDF/text files, split them into chunks and index them with FAISS.

    The instance holds two collaborators created in ``__init__``:
    ``text_splitter`` (a ``RecursiveCharacterTextSplitter``) and
    ``embedding_model`` (``HuggingFaceEmbeddings`` for
    ``sentence-transformers/all-MiniLM-L6-v2``).
    """

    def __init__(self, chunk_size=500, chunk_overlap=50):
        """Build the splitter and the embedding model.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def load_document(self, file_path):
        """Load document based on file extension.

        Args:
            file_path: Path to a ``.pdf`` or ``.txt`` file (case-insensitive).

        Returns:
            Whatever the matching loader's ``load()`` returns, or an empty
            list when the extension is unsupported or loading fails.
        """
        try:
            # Lower-case once; both extension checks are case-insensitive.
            lowered = file_path.lower()
            if lowered.endswith('.pdf'):
                return PyPDFLoader(file_path).load()
            if lowered.endswith('.txt'):
                return TextLoader(file_path).load()
            print(f"Unsupported file type: {file_path}")
            return []
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the
            # whole batch, so report it and carry on with an empty result.
            print(f"Error loading document {file_path}: {e}")
            return []

    def process_documents(self, file_paths):
        """Process multiple documents and create a vector store.

        Args:
            file_paths: Iterable of paths handed to ``load_document``.

        Returns:
            A FAISS vector store built from the split documents, or ``None``
            when nothing could be loaded or splitting produced no chunks.
        """
        all_docs = []
        for file_path in file_paths:
            docs = self.load_document(file_path)
            if docs:
                print(f"Loaded {len(docs)} pages/sections from {file_path}")
                all_docs.extend(docs)

        if not all_docs:
            print("No documents were successfully loaded.")
            return None

        chunks = self.text_splitter.split_documents(all_docs)
        if not chunks:
            # Documents without extractable text (e.g. image-only PDFs) split
            # into zero chunks, and FAISS cannot build an index from nothing.
            print("No text could be extracted from the loaded documents.")
            return None
        print(f"Split into {len(chunks)} chunks for indexing")

        return FAISS.from_documents(chunks, self.embedding_model)

#

# NOTE: a body-less `class RAGChatbot:` header (paste leftover) used to sit here
# and raised "IndentationError: expected an indented block after class
# definition"; the complete RAGChatbot class is defined further down this cell.
import os
import re
import numpy as np
!pip install faiss-cpu
from nltk.tokenize import word_tokenize
!pip install -U langchain_community
!pip install PyPDFLoader
!pip install pypdf

from langchain_community.chat_models import ChatHuggingFace

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.llms import HuggingFaceHub
# SECURITY: a literal Hugging Face token used to be assigned on this line.
# Secrets must never live in source or notebooks -- revoke the exposed token
# and provide a fresh one through the environment, or interactively below.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    # getpass keeps the token out of the notebook output and shell history.
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

class DocumentProcessor:
    """Load PDF/text files, split them into chunks and index them with FAISS.

    The instance holds two collaborators created in ``__init__``:
    ``text_splitter`` (a ``RecursiveCharacterTextSplitter``) and
    ``embedding_model`` (``HuggingFaceEmbeddings`` for
    ``sentence-transformers/all-MiniLM-L6-v2``).
    """

    def __init__(self, chunk_size=500, chunk_overlap=50):
        """Build the splitter and the embedding model.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def load_document(self, file_path):
        """Load document based on file extension.

        Args:
            file_path: Path to a ``.pdf`` or ``.txt`` file (case-insensitive).

        Returns:
            Whatever the matching loader's ``load()`` returns, or an empty
            list when the extension is unsupported or loading fails.
        """
        try:
            # Lower-case once; both extension checks are case-insensitive.
            lowered = file_path.lower()
            if lowered.endswith('.pdf'):
                return PyPDFLoader(file_path).load()
            if lowered.endswith('.txt'):
                return TextLoader(file_path).load()
            print(f"Unsupported file type: {file_path}")
            return []
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the
            # whole batch, so report it and carry on with an empty result.
            print(f"Error loading document {file_path}: {e}")
            return []

    def process_documents(self, file_paths):
        """Process multiple documents and create a vector store.

        Args:
            file_paths: Iterable of paths handed to ``load_document``.

        Returns:
            A FAISS vector store built from the split documents, or ``None``
            when nothing could be loaded or splitting produced no chunks.
        """
        all_docs = []
        for file_path in file_paths:
            docs = self.load_document(file_path)
            if docs:
                print(f"Loaded {len(docs)} pages/sections from {file_path}")
                all_docs.extend(docs)

        if not all_docs:
            print("No documents were successfully loaded.")
            return None

        chunks = self.text_splitter.split_documents(all_docs)
        if not chunks:
            # Documents without extractable text (e.g. image-only PDFs) split
            # into zero chunks, and FAISS cannot build an index from nothing.
            print("No text could be extracted from the loaded documents.")
            return None
        print(f"Split into {len(chunks)} chunks for indexing")

        return FAISS.from_documents(chunks, self.embedding_model)

#

class RAGChatbot:
    """Conversational question answering over a vector store.

    Wires a Hugging Face Hub LLM, a conversation buffer memory and the
    vector store's retriever into a ``ConversationalRetrievalChain``.
    """

    def __init__(self, vector_store, model_id="HuggingFaceH4/zephyr-7b-beta"):
        """Initialize the RAG chatbot.

        Args:
            vector_store: Store whose ``as_retriever`` supplies the context
                documents for each question.
            model_id: Hugging Face Hub repository id of the LLM to query.
        """
        self.vector_store = vector_store
        self.llm = HuggingFaceHub(
            repo_id=model_id,
            model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
        )
        # Built once (the original constructed an identical memory twice).
        # output_key names the chain output to record, since the chain also
        # returns source documents alongside the answer.
        self.memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        self.qa_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            # Retrieve the 3 best-matching chunks per question.
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            return_source_documents=True,
            # Hand the stored history to the chain unchanged.
            get_chat_history=lambda h: h,
        )

    def chat(self, query):
        """Process a user query and return a response.

        Args:
            query: The user's question; ``None``, empty or whitespace-only
                input is answered with a request to rephrase.

        Returns:
            The chain's ``"answer"`` string, or a fixed apology message when
            the chain raises.
        """
        # `not query` also covers None, which would otherwise crash on .strip().
        if not query or not query.strip():
            return "I didn't catch that. Could you please rephrase your question?"

        try:
            # .invoke replaces the deprecated direct call on the chain object.
            result = self.qa_chain.invoke({"question": query})
            return result["answer"]
        except Exception as e:
            # Top-level boundary of the chat loop: report and keep the session alive.
            print(f"Error during query processing: {e}")
            return "I encountered an error while processing your question. Please try again."
def main():
    """Main function to run the chatbot.

    Prompts for document paths until 'done' is entered, builds a vector
    store from the existing paths, then runs a console chat loop.
    'Good-Bye' ends the session and 'Hey Bot' clears the conversation
    memory; both are matched case-insensitively.
    """
    print("\n" + "="*50)
    print("Welcome to Document RAG Chatbot!")
    print("="*50)

    file_paths = []
    while True:
        # Strip like the chat prompt below does, so "done " or a pasted path
        # with stray surrounding whitespace is still recognised.
        file_path = input("\nEnter path to a document (or 'done' to finish adding): ").strip()
        if file_path.lower() == 'done':
            break
        if os.path.exists(file_path):
            file_paths.append(file_path)
        else:
            print(f"File not found: {file_path}")

    if not file_paths:
        print("No valid documents provided. Exiting...")
        return

    print("\nProcessing documents...")
    processor = DocumentProcessor()
    vector_store = processor.process_documents(file_paths)

    # process_documents signals failure with None rather than raising.
    if vector_store is None:
        print("Failed to create vector store. Exiting...")
        return

    print("\nInitializing chatbot...")
    chatbot = RAGChatbot(vector_store)

    print("\n" + "-"*50)
    print("Chatbot: Hi there! I'm your document assistant. Ask me anything about the documents you've uploaded.")
    print("Type 'Hey Bot' to restart or 'Good-Bye' to exit.")
    print("-"*50)

    while True:
        user_input = input("\nYou: ").strip()
        command = user_input.lower()

        if command == 'good-bye':
            print("\nChatbot: Thank you for chatting with me. Have a great day!")
            break
        if command == 'hey bot':
            print("\nChatbot: Hi there! I'm resetting our conversation. What would you like to know about your documents?")
            # Forget earlier turns so follow-up questions start fresh.
            chatbot.memory.clear()
            continue

        response = chatbot.chat(user_input)
        print(f"\nChatbot: {response}")

if __name__ == "__main__":
    # Start the interactive session only when this file/cell is run directly.
    main()


        # NOTE(review): orphaned fragment. These statements read like the body
        # of a RAGChatbot.__init__, but no enclosing `def`/`class` header
        # precedes them here, so the indentation below is a syntax error as
        # written. Presumably left over from pasting one copy of the program
        # into another -- confirm and remove.
        # NOTE(review): `model_kwargs` is assigned twice below but never
        # referenced in this fragment; the HuggingFaceHub call passes its own
        # literal dict (temperature 0.5) instead.
        model_kwargs={
    "temperature": 0.7,
    "max_new_tokens": 512,
    "repetition_penalty": 1.1
}
        self.vector_store = vector_store
        self.llm = HuggingFaceHub(repo_id=model_id, model_kwargs={"temperature":0.5, "max_new_tokens":512})
        self.memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer"
        )


        model_kwargs={
            "temperature": 0.7,
            "max_new_tokens": 512,
            "repetition_penalty": 1.1
        }


        # Chain combining the LLM, a top-3 retriever and the memory above;
        # the chat history is handed to the chain unchanged.
        self.qa_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            return_source_documents=True,
            get_chat_history=lambda h: h   )

    def chat(self, query):
        """Process a user query and return a response.

        Args:
            query: The user's question; ``None``, empty or whitespace-only
                input is answered with a request to rephrase.

        Returns:
            The chain's ``"answer"`` string, or a fixed apology message when
            the chain raises.
        """
        # `not query` also covers None, which would otherwise crash on .strip().
        if not query or not query.strip():
            return "I didn't catch that. Could you please rephrase your question?"

        try:
            # .invoke replaces the deprecated direct call on the chain object.
            result = self.qa_chain.invoke({"question": query})
            return result["answer"]
        except Exception as e:
            # Top-level boundary of the chat loop: report and keep the session alive.
            print(f"Error during query processing: {e}")
            return "I encountered an error while processing your question. Please try again."
def main():
    """Main function to run the chatbot.

    Prompts for document paths until 'done' is entered, builds a vector
    store from the existing paths, then runs a console chat loop.
    'Good-Bye' ends the session and 'Hey Bot' clears the conversation
    memory; both are matched case-insensitively.
    """
    print("\n" + "="*50)
    print("Welcome to Document RAG Chatbot!")
    print("="*50)

    file_paths = []
    while True:
        # Strip like the chat prompt below does, so "done " or a pasted path
        # with stray surrounding whitespace is still recognised.
        file_path = input("\nEnter path to a document (or 'done' to finish adding): ").strip()
        if file_path.lower() == 'done':
            break
        if os.path.exists(file_path):
            file_paths.append(file_path)
        else:
            print(f"File not found: {file_path}")

    if not file_paths:
        print("No valid documents provided. Exiting...")
        return

    print("\nProcessing documents...")
    processor = DocumentProcessor()
    vector_store = processor.process_documents(file_paths)

    # process_documents signals failure with None rather than raising.
    if vector_store is None:
        print("Failed to create vector store. Exiting...")
        return

    print("\nInitializing chatbot...")
    chatbot = RAGChatbot(vector_store)

    print("\n" + "-"*50)
    print("Chatbot: Hi there! I'm your document assistant. Ask me anything about the documents you've uploaded.")
    print("Type 'Hey Bot' to restart or 'Good-Bye' to exit.")
    print("-"*50)

    while True:
        user_input = input("\nYou: ").strip()
        command = user_input.lower()

        if command == 'good-bye':
            print("\nChatbot: Thank you for chatting with me. Have a great day!")
            break
        if command == 'hey bot':
            print("\nChatbot: Hi there! I'm resetting our conversation. What would you like to know about your documents?")
            # Forget earlier turns so follow-up questions start fresh.
            chatbot.memory.clear()
            continue

        response = chatbot.chat(user_input)
        print(f"\nChatbot: {response}")

# Start the interactive session only when this file/cell is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

IndentationError: expected an indented block after class definition on line 69 (<ipython-input-4-ab219d21cc3d>, line 70)

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
