In [1]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain import FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores import DeepLake
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import chromadb
from chromadb.config import Settings
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI
from langchain import HuggingFaceHub

from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.schema import SystemMessage, HumanMessage, AIMessage

from langchain.llms import GPT4All
# from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from tqdm import tqdm
from langchain.output_parsers import PydanticOutputParser
from pydantic import validator, field_validator
from pydantic import BaseModel, Field
from typing import List
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
import cohere
from langchain.embeddings import CohereEmbeddings
from langchain_community.llms import Cohere
from langchain_community.chat_models import ChatCohere
from langchain.chains import SimpleSequentialChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_experimental.tools import PythonAstREPLTool
from langchain_experimental.utilities import PythonREPL
from langchain import LLMMathChain
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
import pandas as pd



In [2]:
import warnings
# Suppress every warning for the rest of the kernel session. This keeps the
# recorded outputs short, but it also hides deprecation notices from the
# libraries imported above.
warnings.filterwarnings("ignore")

In [3]:
import torch

# Report the GPU environment. On a CUDA machine this prints the same three
# lines as before (availability, device count, current device index).
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
# current_device() initialises the CUDA runtime and raises when no usable
# CUDA device exists, so it is only queried when CUDA is available;
# otherwise None is printed and the notebook keeps running.
print(torch.cuda.current_device() if cuda_available else None)

True
1
0


In [None]:
import os

# Make sure every credential variable read later in the notebook exists,
# WITHOUT overwriting values that are already exported in the environment.
# Assigning "" unconditionally would wipe real keys on every run.
# Export the real secrets before starting the kernel; never commit them here.
for _credential_var in (
    "OPENAI_API_KEY",
    "ACTIVELOOP_TOKEN",
    "GOOGLE_API_KEY",
    "GOOGLE_CSE_ID",
    "HUGGINGFACEHUB_API_TOKEN",
    "COHERE_API_KEY",
    "WOLFRAM_ALPHA_APPID",
    "SERPAPI_API_KEY",
):
    os.environ.setdefault(_credential_var, "")

In [5]:
# Embedding model shared by every Chroma call in this notebook.
embed_ada = OpenAIEmbeddings(model="text-embedding-ada-002")
# Runtime identifier of the Chroma collection (spelling kept as-is so that
# every cell keeps addressing the same collection).
chromadb_collection_name = "oepnai_ada_hr_chatbot_collection"
# Directory holding the input files. Override with HR_CHATBOT_DATA_DIR to run
# the notebook on another machine; the default is the original location.
# `os` is imported in the credentials cell above.
data_dir = os.environ.get(
    "HR_CHATBOT_DATA_DIR",
    "/home/jupyter/self_learning/Langchain/code/hr_chatbot/data",
)
policy_doc_filepath = os.path.join(data_dir, "hr_policy.txt")
emp_data_filepath = os.path.join(data_dir, "employee_data.csv")
user = 'Alexander Verdad'  # employee on whose behalf the agent answers

In [6]:
def load_policy_docs(filename):
    """Load a UTF-8 text file and split it into documents.

    Loading is best-effort: any failure is reported on stdout and an empty
    list is returned instead of raising, so callers always receive a list.

    Args:
        filename: Path of the policy text file.

    Returns:
        list: The documents produced by ``TextLoader.load_and_split()``, or
        ``[]`` when the file could not be loaded.
    """
    try:
        loader = TextLoader(filename, encoding='utf-8')
        return list(loader.load_and_split())
    except Exception as exc:
        # Keep the empty-list fallback, but say why: a silent failure here
        # would otherwise only surface later as an empty collection.
        print(f"Could not load policy documents from {filename!r}: {exc!r}")
        return []

In [7]:
def split_policy_docs(docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks for embedding.

    Args:
        docs: Documents to split (as returned by ``load_policy_docs``).
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            500, the value previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 20,
            the value previously hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(docs)

In [8]:
def load_vectors_into_db(documents):
    """Embed ``documents`` into the notebook's Chroma collection.

    Uses the module-level ``embed_ada`` embedding function and
    ``chromadb_collection_name``, prints the resulting collection size and
    returns the Chroma vector store.
    """
    chroma_client = chromadb.Client()
    vector_store = Chroma.from_documents(
        documents,
        embed_ada,
        client=chroma_client,
        collection_name=chromadb_collection_name,
    )
    stored_count = vector_store._collection.count()
    print("There are", stored_count, "in the collection")
    return vector_store

In [17]:
# Start from a clean slate: drop the collection left over from a previous run.
# Deleting a collection that does not exist fails on a fresh kernel, so only
# delete when it is actually present.
client = chromadb.Client()
# list_collections() yields Collection objects or plain names depending on the
# chromadb release; getattr() handles both shapes.
_existing_collections = {getattr(c, "name", c) for c in client.list_collections()}
if chromadb_collection_name in _existing_collections:
    client.delete_collection(name=chromadb_collection_name)

# Define the main function
def main():
    """Build the policy vector store from the policy text file.

    Loads the file at ``policy_doc_filepath``, splits it into chunks and
    embeds the chunks into the Chroma collection.

    Returns:
        The Chroma vector store returned by ``load_vectors_into_db``. It was
        previously bound to an unused local and discarded.
    """
    # Load the content from the policy file
    docs = load_policy_docs(policy_doc_filepath)
    # Split the content into individual documents
    chunks = split_policy_docs(docs)
    # Create a chromadb database with the given embedding function
    return load_vectors_into_db(chunks)

# Call the main function if this script is being run as the main program
if __name__ == '__main__':
    main()

There are 74 in the collection


In [18]:
# client = chromadb.Client()
# collection = client.get_collection(name=chromadb_collection_name, embedding_function=embed_ada)
# print("There are", collection.count(), "in the collection")

In [19]:
# collection.get(include=['documents'])

In [20]:

def load_embeddings_and_database(collection_name):
    """Open an existing Chroma collection as a LangChain vector store.

    Args:
        collection_name: Name of the Chroma collection to attach to.

    Returns:
        A ``Chroma`` store bound to a ``chromadb.Client()`` and the
        module-level ``embed_ada`` embedding function. The collection size
        is printed as a side effect.
    """
    chroma_client = chromadb.Client()
    vector_store = Chroma(
        client=chroma_client,
        collection_name=collection_name,
        embedding_function=embed_ada,
    )
    stored_count = vector_store._collection.count()
    print("There are", stored_count, "in the collection")
    return vector_store

In [21]:
def create_agent(db):
    """Assemble the HR assistant agent with its three tools.

    The agent is a zero-shot ReAct agent with:
      * "Timekeeping Policies": retrieval QA over ``db.as_retriever()``
      * "Employee Data": a Python REPL with the employee CSV bound to ``df``
      * "Calculator": an ``LLMMathChain``

    Reads the module-level ``emp_data_filepath`` and ``user``.

    Args:
        db: Vector store exposing ``as_retriever()``.

    Returns:
        The agent returned by ``initialize_agent``.
    """
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.0)

    # Backend of the policy tool: "stuff" retrieval QA over the vector store.
    policy_qa = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=db.as_retriever(),
    )

    # Backend of the employee-data tool: the CSV exposed to the REPL as `df`.
    employee_df = pd.read_csv(emp_data_filepath)
    # NOTE(review): this tool runs Python written by the model against the
    # full dataframe (every employee's row, not just `user`) - confirm this
    # is acceptable before exposing the agent to real users.
    dataframe_repl = PythonAstREPLTool(locals={"df": employee_df})

    # Backend of the calculator tool.
    math_chain = LLMMathChain.from_llm(llm=chat_model, verbose=True)

    # Column names are interpolated into the employee-data tool description.
    df_columns = employee_df.columns.to_list()

    policy_tool = Tool(
        name="Timekeeping Policies",
        func=policy_qa.run,
        description="""
            Useful for when you need to answer questions about employee timekeeping policies.

            <user>: What is the policy on unused vacation leave?
            <assistant>: I need to check the timekeeping policies to answer this question.
            <assistant>: Action: Timekeeping Policies
            <assistant>: Action Input: Vacation Leave Policy - Unused Leave
            ...
            """,
    )
    employee_tool = Tool(
        name="Employee Data",
        func=dataframe_repl.run,
        description=f"""
            Useful for when you need to answer questions about employee data stored in pandas dataframe 'df'. 
            Run python pandas operations on 'df' to help you get the right answer.
            'df' has the following columns: {df_columns}

            <user>: How many Sick Leave do I have left?
            <assistant>: df[df['name'] == '{user}']['sick_leave']
            <assistant>: You have n sick leaves left.              
            """,
    )
    calculator_tool = Tool(
        name="Calculator",
        func=math_chain.run,
        description="""
            Useful when you need to do math operations or arithmetic.
            """,
    )
    tools = [policy_tool, employee_tool, calculator_tool]

    # A custom prefix replaces the default prompt header of the zero-shot agent.
    prompt_prefix = f'You are friendly HR assistant. You are tasked to assist the current user: {user} on questions related to HR. You have access to the following tools:'

    return initialize_agent(
        tools,
        chat_model,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
        agent_kwargs={'prefix': prompt_prefix},
    )


In [22]:
def get_response(user_input):
    """Answer one user question; entry point intended for the frontend.

    Re-opens the Chroma collection and builds a fresh agent on every call,
    then runs the agent on ``user_input`` and returns its answer.
    """
    vector_store = load_embeddings_and_database(chromadb_collection_name)
    hr_agent = create_agent(vector_store)
    return hr_agent.run(user_input)

In [23]:
# Smoke test: a question answerable from the employee dataframe alone.
# The answer is kept in `response` rather than displayed as the cell output.
user_input= "What is my name and employee id?"
response= get_response(user_input)

There are 74 in the collection


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check the employee data to find the name and employee id of Alexander Verdad.
Action: Employee Data
Action Input: df[df['name'] == 'Alexander Verdad'][['employee_id', 'name']][0m
Observation: [33;1m[1;3m   employee_id              name
2      1005676  Alexander Verdad[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: Your name is Alexander Verdad and your employee id is 1005676.[0m

[1m> Finished chain.[0m


In [24]:
# Policy-only question: the recorded run answers it with the
# "Timekeeping Policies" tool.
get_response('Are employees on probation allowed to have vacation leaves?')


There are 74 in the collection


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check the timekeeping policies to answer this question.
Action: Timekeeping Policies
Action Input: Vacation Leave Policy - Probationary Employees[0m
Observation: [36;1m[1;3mProbationary employees are not eligible for Vacation Leave according to the company's policy. They are only eligible for Sick Leave at a reduced rate and are not entitled to Vacation Leave or Service Incentive Leave during their probation period.[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: Probationary employees are not allowed to have vacation leaves.[0m

[1m> Finished chain.[0m


'Probationary employees are not allowed to have vacation leaves.'

In [25]:
# Two-part question: the recorded run uses "Employee Data" for the leave
# balance and then "Timekeeping Policies" for the unused-leave rule.
get_response("How many vacation leaves do I have left and what is the policy on unused VLs?")


There are 74 in the collection


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check the employee data for Alexander Verdad to see how many vacation leaves he has left and then refer to the timekeeping policies for the policy on unused vacation leaves.
Action: Employee Data
Action Input: df[df['name'] == 'Alexander Verdad']['vacation_leave'][0m
Observation: [33;1m[1;3m2    45
Name: vacation_leave, dtype: int64[0m
Thought:[32;1m[1;3mNow I need to check the timekeeping policies for the policy on unused vacation leaves.
Action: Timekeeping Policies
Action Input: Vacation Leave Policy - Unused Leave[0m
Observation: [36;1m[1;3mUnused Vacation Leave can be carried over to the next year, but the total accumulated leave should not exceed 30 days. Any excess leave will be forfeited. Additionally, unused Vacation Leave can be encashed at the end of the year at the basic salary rate. The encashment rate is calculated as the basic salary divided by 30 days, multipli

'Alexander Verdad has 45 vacation leaves left. Unused vacation leave can be carried over to the next year, but the total accumulated leave should not exceed 30 days. Any excess leave will be forfeited. Unused vacation leave can also be encashed at the end of the year at the basic salary rate.'

In [29]:
# Multi-step question: the recorded run uses "Employee Data" and "Calculator".
# NOTE(review): the recorded answer multiplies basic pay by the leave count
# (15000 * 45), while the policy text recorded in this notebook describes the
# encashment rate as basic salary divided by 30 days and caps accumulated
# leave at 30 days - the 675,000 PHP figure looks overstated; verify.
get_response("How much will I be paid if I encash all my unused vacation leaves?")


There are 74 in the collection


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to calculate the amount based on the employee's basic pay and the number of unused vacation leaves.
Action: Employee Data
Action Input: df[df['name'] == 'Alexander Verdad'][['basic_pay_in_php', 'vacation_leave']][0m
Observation: [33;1m[1;3m   basic_pay_in_php  vacation_leave
2             15000              45[0m
Thought:[32;1m[1;3mNow I need to calculate the total amount for unused vacation leaves.
Action: Calculator
Action Input: 15000 * 45[0m

[1m> Entering new LLMMathChain chain...[0m
15000 * 45[32;1m[1;3m```text
15000 * 45
```
...numexpr.evaluate("15000 * 45")...
[0m
Answer: [33;1m[1;3m675000[0m
[1m> Finished chain.[0m

Observation: [38;5;200m[1;3mAnswer: 675000[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: If you encash all your unused vacation leaves, you will be paid 675,000 PHP.[0m

[1m> Finished chain.[0m


'If you encash all your unused vacation leaves, you will be paid 675,000 PHP.'