In [1]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [8]:
# Read the OpenAI key from the environment (optionally populated from a local
# .env file) instead of hardcoding it in the notebook. load_dotenv() leaves
# variables that are already set untouched, so a key exported in the shell wins.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file."
    )

In [9]:
# Extract the text of every page of every PDF into one string
def get_text_from_pdf(pdf_files):
    """Return the concatenated text of all pages in ``pdf_files``.

    Args:
        pdf_files: Iterable of PDF sources; each item is passed unchanged to
            ``PdfReader``.

    Returns:
        str: The text of all pages, in input order, joined with newlines.
        Pages that yield no text are skipped. Returns ``""`` when nothing
        could be extracted.
    """
    page_texts = []
    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            page_text = page.extract_text()
            # Guard against None / empty results so one text-less page
            # cannot break the whole extraction.
            if page_text:
                page_texts.append(page_text)
    # Join with a newline so the last line of one page is not glued to the
    # first line of the next; a single join also avoids repeated string copies.
    return "\n".join(page_texts)

In [16]:
# Split the raw text into overlapping chunks for embedding
def chunk_text(raw_text, chunk_size=1500, chunk_overlap=200):
    """Split ``raw_text`` into chunks with ``CharacterTextSplitter``.

    The splitter is configured with a newline separator and measures chunk
    length with ``len``.

    Args:
        raw_text: The text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (default 1500, the previously hardcoded value).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (default 200, the previously hardcoded value).

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``raw_text``
        (presumably a list of strings -- confirm against the LangChain docs).
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)

In [17]:
# Embed the chunks and index them in a FAISS vector store
def get_vectorstore(chunks):
    """Build a FAISS vector store from ``chunks``.

    A default-configured ``OpenAIEmbeddings`` instance is created and handed,
    together with ``chunks``, to ``FAISS.from_texts``.

    Args:
        chunks: The texts to index.

    Returns:
        The object returned by ``FAISS.from_texts``.
    """
    embedder = OpenAIEmbeddings()
    return FAISS.from_texts(texts=chunks, embedding=embedder)

In [18]:
# Conversational retrieval chain: chat model + retriever + chat memory
def get_conversation_chain(vectorstore):
    """Create a ``ConversationalRetrievalChain`` backed by ``vectorstore``.

    The chain is assembled from a default ``ChatOpenAI`` model, the retriever
    obtained via ``vectorstore.as_retriever()``, and a
    ``ConversationBufferMemory`` stored under the ``"chat_history"`` key with
    ``return_messages=True``.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The chain returned by ``ConversationalRetrievalChain.from_llm``.
    """
    chat_model = ChatOpenAI()
    history = ConversationBufferMemory(
        return_messages=True,
        memory_key="chat_history",
    )
    chain_parts = {
        "llm": chat_model,
        "retriever": vectorstore.as_retriever(),
        "memory": history,
    }
    return ConversationalRetrievalChain.from_llm(**chain_parts)

In [19]:
# Send one user question through the conversation chain
def chat_user_input(user_query, convo_chain):
    """Run ``user_query`` through ``convo_chain`` and return its response.

    Uses ``convo_chain.invoke(...)`` rather than calling the chain object
    directly, because direct calls (``Chain.__call__``) are deprecated in the
    LangChain releases that provide ``langchain_openai``.

    Args:
        user_query: The question text; sent under the ``"question"`` key.
        convo_chain: Chain object exposing ``invoke``.

    Returns:
        The value returned by ``convo_chain.invoke``.
    """
    return convo_chain.invoke({"question": user_query})

In [28]:
# Default location of the PDFs to index. Pass ``pdf_dir`` to main() to use
# another directory instead of editing this path.
DEFAULT_PDF_DIR = "/Users/spartan/Documents/Sem 3 MSDA/Deep Learning/Assignments/Deep_Learning_CourseWork/HW_12/PDFs"


def _list_pdf_files(pdf_dir):
    """Return the sorted paths of the ``.pdf`` files directly inside ``pdf_dir``."""
    # Case-insensitive match so "REPORT.PDF" is found too; sorted because
    # os.listdir() returns entries in arbitrary order.
    return sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
    )


def _print_summary(conversation_summary):
    """Print every recorded question/answer pair, numbered from 1."""
    print("\nConversation Summary:")
    for i, entry in enumerate(conversation_summary, 1):
        print(f"{i}. You: {entry['user']}\n   ChatBot: {entry['bot']}\n")


# Main function
def main(pdf_dir=DEFAULT_PDF_DIR):
    """Index the PDFs in ``pdf_dir`` and run an interactive question loop.

    The function extracts and chunks the PDF text, builds the vector store and
    conversation chain, then reads questions from standard input until the
    user types ``quit`` (or input ends / is interrupted). A numbered summary
    of the conversation is printed before returning.

    Args:
        pdf_dir: Directory containing the PDF files. Defaults to
            ``DEFAULT_PDF_DIR``.

    Returns:
        None. Returns early, after printing a message, when the directory is
        missing, holds no PDFs, or yields no text chunks.
    """
    print("Initializing chatbot...")

    # Report a missing directory explicitly rather than failing in os.listdir().
    if not os.path.isdir(pdf_dir):
        print(f"PDF directory not found: {pdf_dir}")
        return

    pdf_files = _list_pdf_files(pdf_dir)
    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    # Processing the data
    print("Loading and processing PDF data...")
    pdf_content = get_text_from_pdf(pdf_files)
    chunks = chunk_text(pdf_content)
    if not chunks:
        # Nothing to embed; stop here instead of building an index from an
        # empty list of texts.
        print("No extractable text found in the PDF files.")
        return
    vectorstore = get_vectorstore(chunks)

    # Initialize conversation with bot
    convo_chain = get_conversation_chain(vectorstore)

    print("Chatbot is ready. Ask your questions about the uploaded data. Type 'quit' to exit.")
    conversation_summary = []

    while True:
        try:
            user_query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Treat end-of-input or an interrupt like "quit" so the summary
            # is still printed.
            print()
            break

        if not user_query:
            # Do not spend an LLM call on a blank line.
            continue
        if user_query.lower() == "quit":
            break

        response = chat_user_input(user_query, convo_chain)
        chatbot_reply = response["answer"]

        # Memory for summary
        conversation_summary.append({"user": user_query, "bot": chatbot_reply})

        print(f"ChatBot: {chatbot_reply}")

    # Conversation summary
    _print_summary(conversation_summary)


if __name__ == "__main__":
    main()

Initializing chatbot...
Loading and processing PDF data...
Chatbot is ready. Ask your questions about the uploaded data. Type 'quit' to exit.
You: What is NLP?
ChatBot: Natural Language Processing (NLP) is a sub-field of AI that uses computational techniques to analyze and represent natural language texts or speech for tasks like understanding, generation, and human-computer interaction.
You: What are RNNs used for?
ChatBot: RNNs are used for sequence data tasks such as language modeling, conversation modeling, image/video captioning, and time series prediction.
You: quit

Conversation Summary:
1. You: What is NLP?
ChatBot: Natural Language Processing (NLP) is a sub-field of AI that uses computational techniques to analyze and represent natural language texts or speech for tasks like understanding, generation, and human-computer interaction.
2. You: What are RNNs used for?
ChatBot: RNNs are used for sequence data tasks such as language modeling, conversation modeling, image/video captioning, and time series prediction.