# Chatbot for custom documents

In [None]:
from PyPDF2 import PdfReader
from tqdm import tqdm
from docx import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from dotenv import load_dotenv

### Loading text from the documents that need to be embedded

In [None]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, with consecutive pages
        separated by a single newline. An empty string if the PDF has no
        pages.
    """
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Extract while the file is still open: the reader was built on
        # this handle. ``or ""`` keeps the join safe if a page yields no
        # text at all.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join with a newline so the last word of one page is not fused with
    # the first word of the next page.
    return "\n".join(page_texts)

def read_docx(file_path):
    """Extract the paragraph text of a Word document.

    Args:
        file_path: Path of the document to open with ``docx.Document``.

    Returns:
        The text of all body paragraphs in document order, one paragraph
        per line. An empty string if the document has no paragraphs.
    """
    document = Document(file_path)
    # ``paragraph.text`` has no trailing line break, so joining with ""
    # would glue the end of one paragraph onto the start of the next and
    # leave the downstream splitters without any paragraph boundaries.
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def read_docs_from_dir(directory):
    """Read every PDF and Word file in a directory into one string.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` makes no ordering guarantee). Extensions are matched
    case-insensitively; entries with any other extension are skipped.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The extracted text of all supported files, separated by newlines.
        An empty string if no supported file is found.
    """
    collected = []
    for filename in tqdm(sorted(os.listdir(directory))):
        lowered = filename.lower()
        file_path = os.path.join(directory, filename)
        if lowered.endswith(".pdf"):
            collected.append(read_pdf(file_path))
        elif lowered.endswith((".doc", ".docx")):
            # NOTE(review): python-docx documents support for the .docx
            # format; a legacy binary .doc file will likely fail to open
            # here -- confirm whether .doc should be converted or skipped.
            collected.append(read_docx(file_path))
    # Separate files with a newline so text from adjacent files is not
    # fused into a single word or sentence.
    return "\n".join(collected)

In [None]:
def split(combinedtext):
    """Chunk raw text in two passes and wrap the chunks as documents.

    The first pass uses ``CharacterTextSplitter`` (no separator is passed,
    so the library default applies) with a 1000-character limit and no
    overlap. The second pass feeds those chunks to a
    ``RecursiveCharacterTextSplitter`` that tries the separators
    ``" "``, ``","`` and ``"\\n"`` with a 1000-character limit and a
    200-character overlap.

    Args:
        combinedtext: The full text to chunk.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.create_documents``
        returns for the first-pass chunks.
    """
    coarse_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    fine_splitter = RecursiveCharacterTextSplitter(
        separators=[" ", ",", "\n"],
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    coarse_chunks = coarse_splitter.split_text(combinedtext)
    return fine_splitter.create_documents(coarse_chunks)

In [None]:
# Load variables from a local .env file into the process environment.
# Presumably this supplies the OpenAI API key used by the cells below -- confirm.
load_dotenv()

In [None]:
# Read every supported file in the "documents" directory into one string.
combined_text = read_docs_from_dir("documents")

In [None]:
# Chunk the combined text into the documents that will be embedded.
texts = split(combined_text)

In [None]:
# Embed the chunks with OpenAI embeddings and build a FAISS vector index from them.
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_documents(texts, embeddings)

In [None]:
# Persist the index under the local path "index" so it can be reloaded later.
docsearch.save_local("index")

In [None]:
# Reload the saved index, passing the same embeddings object used to build it.
# NOTE(review): newer LangChain releases require
# allow_dangerous_deserialization=True for load_local -- confirm against the
# installed version.
docsearch = FAISS.load_local("index", embeddings)

In [None]:
from langchain.chains import  ConversationalRetrievalChain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.prompt import PromptTemplate
# Chat model used to answer questions: temperature 0, replies capped at 1000 tokens.
llm = ChatOpenAI(temperature=0, max_tokens=1000, model_name="gpt-3.5-turbo")

from langchain.memory import ConversationBufferMemory
# Conversation memory exposed to the chain under the "chat_history" key,
# returned as message objects rather than one formatted string.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

### Custom Prompt for AI customer service agent

In [None]:
# Persona prompt text with two placeholders: {chat_history} and {question}.
# NOTE(review): the text refers to the AI's "context", but there is no
# {context} placeholder for retrieved documents -- confirm how this template
# is meant to be used.
custom_template = """The following is a friendly conversation between a human and an AI. The AI agent acts as the customer service agent who will answer all the queries the human has about the company. Your name is tecbot. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Relevant pieces of previous conversation:
{chat_history}

(You do not need to use these pieces of information if not relevant)

Current conversation:
Human: {question}
AI:"""

In [None]:
# Wrap the template text in a PromptTemplate.
# NOTE(review): nothing else in this notebook references CUSTOM_QUESTION_PROMPT;
# confirm whether it should be passed to the chain.
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

In [None]:
# Build the conversational retrieval chain over the FAISS retriever. The
# memory object is attached here, and the calls below pass only "question".
# NOTE(review): CUSTOM_QUESTION_PROMPT is not passed in, so the chain
# presumably runs with the library's default prompts -- confirm intent.
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=docsearch.as_retriever(), memory=memory)

In [None]:
# Ask an initial question; the chain's result is a mapping whose "answer"
# entry holds the reply text.
question = "what are the products and services of the company"
result = qa({"question": question})
answer = result["answer"]
print(answer)

In [None]:
# Follow-up question; "the 7th point" presumably refers to the previous
# answer, so this exercises the chat history held in memory.
question = "can you elaborate the 7th point"
result = qa({"question": question})
answer = result["answer"]
print(answer)