# Employee Eval part 4:

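This notebook evaluates the Cohere re-ranker: the same evaluation questions are run through the baseline chatbot and through the re-ranked chatbot so that the two sets of answers can be compared.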

In [1]:
#pip installing:
%pip install langchain
%pip install langchain_community
%pip install langchain_huggingface
%pip install langchain_pinecone
%pip install pinecone
%pip install pinecone-client
%pip install dotenv
%pip install streamlit
%pip install pymupdf
%pip install -qU langchain_community wikipedia
%pip install --upgrade --quiet langchain-text-splitters tiktoken
%pip install difflib
%pip install cohere
%pip install cohere


import os
import langchain #its giving module not found error
import langchain_community
import langchain_huggingface
import langchain_pinecone
import pinecone
import dotenv
import streamlit as st

# Additional Imports (loading document):
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

#pinecone etc (storage of documents):
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from uuid import uuid4

#hugging face etc (for generation):
from langchain_huggingface import HuggingFaceEndpoint
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables import RunnableLambda

#memory imports
#I used these documentations: https://python.langchain.com/v0.1/docs/use_cases/chatbots/memory_management/ , https://python.langchain.com/v0.1/docs/modules/memory/types/buffer/ , https://python.langchain.com/v0.1/docs/modules/memory/
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain
from langchain.chains import create_history_aware_retriever #new
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

#caching imports:
from difflib import SequenceMatcher

from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import TokenTextSplitter
#for timing the retrievals
import time

#for parsing:
import re

#for cohere:
import cohere


Collecting langchain_community
  Downloading langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.12 (from langchain_community)
  Downloading langchain-0.3.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain_community)
  Downloading langchain_core-0.3.25-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [2]:
# Replace with the API keys you need
HUGGINGFACE_API_KEY = ""
PINECONE_API_KEY = ""
COHERE_API_KEY = ""

entered_keys = {
    "HUGGINGFACE_API_KEY": HUGGINGFACE_API_KEY,
    "PINECONE_API_KEY": PINECONE_API_KEY,
    "COHERE_API_KEY": COHERE_API_KEY,
}

# Start from whatever .env already holds so that a placeholder left blank above
# does not wipe a key that was saved on an earlier run.
env_values = {}
if os.path.exists(".env"):
    env_values.update(dotenv.dotenv_values(".env"))
env_values.update({name: value for name, value in entered_keys.items() if value})

# Blank entries are never written: loading them would set the variable to "".
env_values = {name: value for name, value in env_values.items() if value}

with open(".env", "w") as file:
    file.write("".join(f"{name}={value}\n" for name, value in env_values.items()))

print("Environment variables are saved to .env file.")

# override=True so that keys entered on a re-run of this cell replace the values
# loaded by a previous run (load_dotenv otherwise keeps already-set variables).
dotenv.load_dotenv(override=True)

Environment variables are saved to .env file.


True

# Baseline Answers

In [3]:
class EmployeeChatBot:
    """Baseline retrieval-augmented chatbot for employee policy questions.

    A question is first augmented with the names of the (up to two) documents
    the LLM considers most relevant, then classified as relevant/irrelevant,
    and, when relevant, answered from chunks retrieved from the Pinecone index.
    """

    # File names listed in Augment_Prompt_Template; used to recognise the names
    # in the LLM's free-text reply.
    KNOWN_DOCUMENTS = (
        "Code-of-conduct", "Compensation-Benefits-Guide", "Employee-appraisal-form",
        "Employee-Handbook", "Employee-Termination-Policy", "Health-and-Safety-Guidelines",
        "Onboarding-Manual", "Remote-Work-Policy", "Cybersecurity-for-Employees",
        "System-Access-Control-Policy", "Technology-Devices-Policy", "Expense-Report",
    )

    def __init__(self):
        """Connect to Pinecone and the HuggingFace endpoint and build the prompt chains.

        The Pinecone index named by ``self.index_name`` is opened here, so it
        must already exist when the bot is constructed.
        """
        #loading variables:
        self.combined_text = ""
        self.CHUNK_SIZE = 256
        self.CHUNK_OVERLAP = 0.50
        #storing variables:
        self.pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
        self.index_name = "employee-queries-db" #keep the name small
        self.embeddings = HuggingFaceEmbeddings()
        self.index = self.pc.Index(self.index_name) #Remember, i can do this because i have already once created this index, else create index first
        self.vector_store = PineconeVectorStore(index=self.index, embedding=self.embeddings)
        # generating variables
        self.retriever = self.vector_store.as_retriever( search_type="similarity_score_threshold", search_kwargs={"k": 3, "score_threshold": 0.5},) #tunable
        self.repo_id = "mistralai/Mistral-7B-Instruct-v0.3" #tunable
        self.llm = HuggingFaceEndpoint( repo_id=self.repo_id, temperature= 1, top_k= 50, huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_KEY') ) #tunable

        self.verbose = False #change this to see the explanations of how the LLM reached its conclusion

        #memory variables:
        self.memory_template = """You are a ambiguity clearer, your task is to examine the human question and check for any "he/she/it/they/them" ambiguities.
        return an updated human question fixing those ambiguities using the previous conversation context only.
        if there is not enought relevant context, RETURN HUMAN QUESTION AS IT IS
        YOUR ANSWER SHOULD BE A QUESTION WHICH ONLY CLARIFIES ANY AMBIGUITY IN human question by replacing it with their name
        RETURN IN FORMAT: New human question: (updated question)
        Previous conversation:
        {chat_history}

        human question: {question}
        New human question:
        """
        self.memory_prompt = PromptTemplate.from_template(self.memory_template)

        self.memory = ConversationBufferMemory(memory_key="chat_history")
        self.conversation = LLMChain(
            llm=self.llm,
            prompt=self.memory_prompt,
            verbose=False,
            memory=self.memory
        )
        #prompt variables
        self.Classifier_template = """
        You are a prompt classifier designed to classify questions from employees in an organization.
        classify the following question into "Relevant" or "Irrelevant", based on whether the query theme is of a question from an organization employee, the question could be about IT, HR, Finance or any other department
        Only answer from the specified classes and one word answers.

        Question: {question}
        Answer:
        """


        #Case 1:
        self.Employee_Template = """
          You are a highly knowledgeable and reflective chatbot designed to assist employees of an organization by answering their questions accurately and thoughtfully.
          Your goal is to provide well-reasoned and clear answers based on the provided context.

          Follow these steps to construct your response:
          1. **Understand the question**: Restate the question in simpler terms if necessary, ensuring you grasp the key aspects of what is being asked.
          2. **Analyze the context**: Examine the provided context and identify relevant information that applies to the question.
          3. **Evaluate implications**: Consider any potential rules, policies, or ethical considerations that could affect the answer.
          4. **Provide the answer**: Deliver a clear, concise, and actionable response based on your analysis.
          5. **Reflection**: Briefly explain your reasoning process to ensure transparency and to help the employee understand your conclusion.

          Examples:
          ---
          Context:
          "Employees are prohibited from accepting gifts valued over $50 from clients. If a gift exceeds this amount, it must be declined or reported to the ethics committee."

          Question:
          "One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets?"

          Answer:
          1. **Understand the question**: Can the employee accept free airline tickets won in a raffle at a client's event?
          2. **Analyze the context**: The policy prohibits accepting gifts over $50. Airline tickets are likely valued well over this limit and would need to be reported or declined.
          3. **Evaluate implications**: Accepting the tickets could violate the company's ethics policy, even if won in a raffle, as they are provided by a client.
          4. **Provide the answer**: No, you should not accept the tickets without first consulting the ethics committee to determine whether an exception applies.
          5. **Reflection**: I based my answer on the explicit policy regarding gift value limits and the need to maintain ethical boundaries with clients.
          ---
          Context:
          "Employees are allowed to attend client-sponsored events, such as dinners or conferences, provided the primary purpose is business-related and attendance has been pre-approved by their manager."

          Question:
          "A client has invited me to a dinner event to discuss our ongoing project. Do I need approval to attend?"

          Answer:
          1. **Understand the question**: Does the employee need prior approval to attend a client dinner for business purposes?
          2. **Analyze the context**: The policy states that attendance at client events requires pre-approval from the employee’s manager.
          3. **Evaluate implications**: While the event seems business-related, attending without prior approval could breach company protocol.
          4. **Provide the answer**: Yes, you need to get approval from your manager before attending the dinner.
          5. **Reflection**: My answer aligns with the policy, ensuring adherence to company guidelines while allowing participation in legitimate business activities.
          ---
          Context: {context}
          Question: {question}
          Answer:
      """

        self.Augment_Prompt_Template = """
            The following are the file names available in our database:
            HR:
            - Code-of-conduct
            - Compensation-Benefits-Guide
            - Employee-appraisal-form
            - Employee-Handbook
            - Employee-Termination-Policy
            - Health-and-Safety-Guidelines
            - Onboarding-Manual
            - Remote-Work-Policy

            IT:
            - Cybersecurity-for-Employees
            - System-Access-Control-Policy
            - Technology-Devices-Policy

            Finance:
            - Expense-Report

            Given the following query:
            {question}

            You are tasked with identifying and returning the names of the **two most relevant files**, separated by "and," that are most helpful for addressing the query.
            do NOT provide reasoning or add any other text, just the names of files
            """



        self.Classifier_prompt = PromptTemplate( template=self.Classifier_template, input_variables=["question"] )
        self.Employee_prompt = PromptTemplate(template=self.Employee_Template, input_variables=["context", "question"] )
        self.get_relevant_docs_prompt = PromptTemplate( template=self.Augment_Prompt_Template, input_variables=["question"] )

        #chain variables
        # Each of these three chains starts with {"question": RunnablePassthrough()},
        # so it must be invoked with the raw question string, not with a dict.
        self.classifier_chain = ({"question": RunnablePassthrough()} | self.Classifier_prompt | self.llm  | StrOutputParser() )
        self.get_relevant_docs_chain = ({"question": RunnablePassthrough()} | self.get_relevant_docs_prompt | self.llm  | StrOutputParser() )
        self.Employee_chain = ({"context": self.retriever | self.format_docs,  "question": RunnablePassthrough()} | self.Employee_prompt | self.llm | StrOutputParser() )
        # full_chain receives {"question": ...}; unwrap the string before it reaches
        # the classifier so the prompt is not filled with the dict's repr.
        self.full_chain = {
            "Relevancy": RunnableLambda(lambda x: x["question"]) | self.classifier_chain,
            "question": lambda x: x["question"],
        } | RunnableLambda(self.route)


    def _clean_page_text(self, text):
        """Strip page markers, line breaks and unsupported symbols from one page of text."""
        text = re.sub(r'\bPage\s+\d+\b', '', text, flags=re.IGNORECASE)
        # Line breaks become a single space; deleting them outright would glue the
        # last word of one line to the first word of the next.
        text = re.sub(r'\s*\n\s*', ' ', text).strip()
        return re.sub(r'[^\w\s.,?!:;\'\"()&-]', '', text)


    def AddFileToDB(self, docs_to_load):
        """Parse PDF files, chunk their text and upload the chunks to the Pinecone index.

        :param docs_to_load: a single PDF path, or an iterable of PDF paths
        """
        # A bare path would otherwise be iterated character by character.
        if isinstance(docs_to_load, (str, os.PathLike)):
            docs_to_load = [docs_to_load]

        page_texts = []
        for doc in docs_to_load:
            for page in PyMuPDFLoader(doc).load():
                text = page.page_content
                # Pages mentioning "contents" are skipped (table-of-contents heuristic).
                if "contents" in text.lower():
                    continue
                page_texts.append(self._clean_page_text(text) + " ")
        combined_text = "".join(page_texts).strip()

        text_splitter = TokenTextSplitter(chunk_size=self.CHUNK_SIZE, chunk_overlap=int(self.CHUNK_SIZE*self.CHUNK_OVERLAP))
        # create_documents() splits the text itself, so one pass is enough.
        docs = text_splitter.create_documents([combined_text])
        print(docs)
        if not docs:
            # Nothing to upload (no files given, or every page was skipped).
            return

        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(  #tunable
                name=self.index_name,
                dimension=768,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )
            )
        index = self.pc.Index(self.index_name)
        # Reuse the embedding model loaded in __init__ instead of loading it again.
        vector_store = PineconeVectorStore(index=index, embedding=self.embeddings)
        uuids = [str(uuid4()) for _ in range(len(docs))]
        vector_store.add_documents(documents=docs, ids=uuids)


    def generate(self, query):
        """Answer ``query`` through the classifier-and-retrieval chain.

        The query is suffixed with the document names suggested by
        ``get_relevant_docs``; when none are recognised it is sent unchanged.

        :param query: the employee's question
        :return: the chain's answer, or the fixed "not relevant" message from ``route``
        """
        relevant_docs = self.get_relevant_docs(query)
        if relevant_docs:
            query = query + " try to answer from " + relevant_docs
        print(f"Augmented Query is: {query}")
        query_response = self.full_chain.invoke({"question": query})
        return query_response

    #Implement as per paper 1 with self generated text and break down of question into subsequent parts
    def Augment_prompt(self, query):
        """Placeholder; not implemented yet."""
        pass


    def get_relevant_docs(self, query):
        """Ask the LLM which documents best match ``query``.

        Known document names are searched for in the reply case-insensitively and
        as substrings, so surrounding punctuation does not hide a name.

        :param query: the employee's question
        :return: up to two document names joined by ", ", in the order the LLM
                 mentioned them; "" when no known name appears in the reply
        """
        llm_reply = self.get_relevant_docs_chain.invoke(query)
        reply_lower = llm_reply.lower()
        found = []
        for name in self.KNOWN_DOCUMENTS:
            position = reply_lower.find(name.lower())
            if position != -1:
                found.append((position, name))
        found.sort()
        return ", ".join(name for _, name in found[:2])


    #Helper functions:
    def format_docs(self, docs):
        """Join the ``page_content`` of the retrieved documents with blank lines."""
        return "\n\n".join([d.page_content for d in docs])


    def route(self, info):
        """Dispatch on the classifier verdict.

        :param info: dict with "Relevancy" (classifier output) and "question"
        :return: the Employee_chain answer for relevant questions, otherwise a
                 fixed rejection message
        """
        verdict = info["Relevancy"].lower()
        # "irrelevant" contains "relevant", so it has to be ruled out explicitly.
        if "relevant" in verdict and "irrelevant" not in verdict:
            return self.Employee_chain.invoke(info["question"])
        else:
            return "Your question was not relevant to our organization"



In [4]:
# All eval questions:

#Init
bot = EmployeeChatBot()

# Questions 1-5, asked in order.
# Question 3 uses plain curly quotes: the former \“ and \” were invalid escape
# sequences that left literal backslashes in the text sent to the bot.
eval_questions = [
    "One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets?",
    "At City of Fond du Lac, what is the list of holidays that i can be compensated as working atleast 20 hours per week?",
    "What are the details i have to add in “Employee Information” section of the Employee Appraisal Form for the University of Texas, and what is the Rating Key they have provided?",
    "As per the Recruitment section of the employee handbook, what is my reward if i someone is recruited from my referral in a hard-to-fill role?",
    "What are my duties as a supervisor, before the start date, when onboarding new employees at the university of houston?",
]

for question in eval_questions:
    print("Question is: \n" + question)
    answer = bot.generate(question)
    print("\nAnswer is: \n" + answer)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  self.memory = ConversationBufferMemory(memory_key="chat_history")
  self.conversation = LLMChain(


Question is: 
One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets?
Augmented Query is: One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets? try to answer from Employee-Termination-Policy

Answer is: 
1. **Understand the question**: Is the employee allowed to accept free airline tickets from a client, despite it being part of a raffle?
      2. **Analyze the context**: The policy states that employees cannot accept gifts from clients, except under the Gift Prize Policy. Airline tickets are considered gifts and exceed the value threshold for acceptance without reporting.
      3. **Evaluate implications**: Accepting the tickets could violate the company's gift policy and raise ethical concerns.
      4. **Provide the answer**: No, the employee cannot accept the tickets without first reporting the potential gift to 

In [5]:
# Questions 6-10, asked in order against the bot created in the previous cell.
remaining_questions = (
    "What are the productivity measures if i want to work remotely and are there any meetings i have to attend if i am working remotely?",
    "While on the topic of cyber security, in what ways can i be exploited via Emails?",
    "what are the user access control guidelines for system access control policy of the company?",
    "What are the Unacceptable use scenarios of technology devices at workforce central?",
    "how can i create expense report procurement card for Concur Travel and Expense System?",
)

for question in remaining_questions:
    print("Question is: \n" + question)
    answer = bot.generate(question)
    print("\nAnswer is: \n" + answer)

Question is: 
What are the productivity measures if i want to work remotely and are there any meetings i have to attend if i am working remotely?
Augmented Query is: What are the productivity measures if i want to work remotely and are there any meetings i have to attend if i am working remotely? try to answer from Onboarding-Manual, Remote-Work-Policy

Answer is: 
1. **Understand the question**: The employee wants to know about the productivity measures and required meetings when working remotely, as well as if there are any specific policies regarding this in the Onboarding-Manual or Remote-Work-Policy documents.
      2. **Analyze the context**: From the provided context, we can see that remote working is allowed for office-based employees, but for a maximum of two consecutive weeks per year without prior approval. Remote employees must adhere to security guidelines, use a fast and secure internet connection, and choose a quiet workspace. They are required to check in with their tea

# ReRanked Answers

In [9]:
class EmployeeChatBot:
    """Retrieval-augmented chatbot that re-ranks retrieved chunks with Cohere.

    Chunks are fetched from the Pinecone index by similarity search, re-ordered
    by Cohere's rerank endpoint, and the best ones are passed to the LLM as the
    context for a chain-of-thought answer.
    """

    # File names listed in Augment_Prompt_Template; used to recognise the names
    # in the LLM's free-text reply.
    KNOWN_DOCUMENTS = (
        "Code-of-conduct", "Compensation-Benefits-Guide", "Employee-appraisal-form",
        "Employee-Handbook", "Employee-Termination-Policy", "Health-and-Safety-Guidelines",
        "Onboarding-Manual", "Remote-Work-Policy", "Cybersecurity-for-Employees",
        "System-Access-Control-Policy", "Technology-Devices-Policy", "Expense-Report",
    )

    def __init__(self):
        """Connect to Pinecone, HuggingFace and Cohere and build the prompt chains.

        The Pinecone index named by ``self.index_name`` is opened here, so it
        must already exist when the bot is constructed.
        """
        #loading variables:
        self.combined_text = ""
        self.CHUNK_SIZE = 256
        self.CHUNK_OVERLAP = 0.50
        #storing variables:
        self.pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
        self.index_name = "employee-queries-db" #keep the name small
        self.embeddings = HuggingFaceEmbeddings()
        self.index = self.pc.Index(self.index_name) #Remember, i can do this because i have already once created this index, else create index first
        self.vector_store = PineconeVectorStore(index=self.index, embedding=self.embeddings)
        # generating variables
        self.repo_id = "mistralai/Mistral-7B-Instruct-v0.3" #tunable
        self.llm = HuggingFaceEndpoint( repo_id=self.repo_id, temperature= 1, top_k= 50, huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_KEY') ) #tunable
        self.verbose = True #change this to see the explanations of how the LLM reached its conclusion

        self.Cohere_client = cohere.Client(api_key=os.environ.get("COHERE_API_KEY"))

        #Case 4: (chain of thought with few shot examples)
        self.Employee_Template = """
            You are a highly knowledgeable and reflective chatbot designed to assist employees of an organization by answering their questions accurately and thoughtfully.
            Your goal is to provide well-reasoned and clear answers based on the provided context.

            Follow these steps to construct your response:
            1. **Understand the question**: Restate the question in simpler terms if necessary, ensuring you grasp the key aspects of what is being asked.
            2. **Analyze the context**: Examine the provided context and identify relevant information that applies to the question.
            3. **Evaluate implications**: Consider any potential rules, policies, or ethical considerations that could affect the answer.
            4. **Provide the answer**: Deliver a clear, concise, and actionable response based on your analysis.
            5. **Reflection**: Briefly explain your reasoning process to ensure transparency and to help the employee understand your conclusion.

            Examples:
            ---
            Context:
            "Employees are prohibited from accepting gifts valued over $50 from clients. If a gift exceeds this amount, it must be declined or reported to the ethics committee."

            Question:
            "One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets?"

            Answer:
            1. **Understand the question**: Can the employee accept free airline tickets won in a raffle at a client's event?
            2. **Analyze the context**: The policy prohibits accepting gifts over $50. Airline tickets are likely valued well over this limit and would need to be reported or declined.
            3. **Evaluate implications**: Accepting the tickets could violate the company's ethics policy, even if won in a raffle, as they are provided by a client.
            4. **Provide the answer**: No, you should not accept the tickets without first consulting the ethics committee to determine whether an exception applies.
            5. **Reflection**: I based my answer on the explicit policy regarding gift value limits and the need to maintain ethical boundaries with clients.
            ---
            Context:
            "Employees are allowed to attend client-sponsored events, such as dinners or conferences, provided the primary purpose is business-related and attendance has been pre-approved by their manager."

            Question:
            "A client has invited me to a dinner event to discuss our ongoing project. Do I need approval to attend?"

            Answer:
            1. **Understand the question**: Does the employee need prior approval to attend a client dinner for business purposes?
            2. **Analyze the context**: The policy states that attendance at client events requires pre-approval from the employee’s manager.
            3. **Evaluate implications**: While the event seems business-related, attending without prior approval could breach company protocol.
            4. **Provide the answer**: Yes, you need to get approval from your manager before attending the dinner.
            5. **Reflection**: My answer aligns with the policy, ensuring adherence to company guidelines while allowing participation in legitimate business activities.
            ---
            {question}
            Answer:
        """

        self.Augment_Prompt_Template = """
            The following are the file names available in our database:
            HR:
            - Code-of-conduct
            - Compensation-Benefits-Guide
            - Employee-appraisal-form
            - Employee-Handbook
            - Employee-Termination-Policy
            - Health-and-Safety-Guidelines
            - Onboarding-Manual
            - Remote-Work-Policy

            IT:
            - Cybersecurity-for-Employees
            - System-Access-Control-Policy
            - Technology-Devices-Policy

            Finance:
            - Expense-Report

            Given the following query:
            {question}

            You are tasked with identifying and returning the names of the **two most relevant files**, separated by "and," that are most helpful for addressing the query.
            do NOT provide reasoning or add any other text, just the names of files
            """

        # This template has a single placeholder: generate() folds the context
        # into the text substituted for {question}.
        self.Employee_prompt = PromptTemplate(template=self.Employee_Template, input_variables=["question"] )
        self.get_relevant_docs_prompt = PromptTemplate( template=self.Augment_Prompt_Template, input_variables=["question"] )

        #chain variables
        # Both chains start with {"question": RunnablePassthrough()}, so they must
        # be invoked with the raw text, not with a dict.
        self.get_relevant_docs_chain = ({"question": RunnablePassthrough()} | self.get_relevant_docs_prompt | self.llm  | StrOutputParser() )
        self.Employee_chain = ({"question": RunnablePassthrough()} | self.Employee_prompt | self.llm | StrOutputParser() )


    def _clean_page_text(self, text):
        """Strip page markers, line breaks and unsupported symbols from one page of text."""
        text = re.sub(r'\bPage\s+\d+\b', '', text, flags=re.IGNORECASE)
        # Line breaks become a single space; deleting them outright would glue the
        # last word of one line to the first word of the next.
        text = re.sub(r'\s*\n\s*', ' ', text).strip()
        return re.sub(r'[^\w\s.,?!:;\'\"()&-]', '', text)


    def AddFileToDB(self, docs_to_load):
        """Parse PDF files, chunk their text and upload the chunks to the Pinecone index.

        :param docs_to_load: a single PDF path, or an iterable of PDF paths
        """
        # A bare path would otherwise be iterated character by character.
        if isinstance(docs_to_load, (str, os.PathLike)):
            docs_to_load = [docs_to_load]

        page_texts = []
        for doc in docs_to_load:
            for page in PyMuPDFLoader(doc).load():
                text = page.page_content
                # Pages mentioning "contents" are skipped (table-of-contents heuristic).
                if "contents" in text.lower():
                    continue
                page_texts.append(self._clean_page_text(text) + " ")
        combined_text = "".join(page_texts).strip()

        text_splitter = TokenTextSplitter(chunk_size=self.CHUNK_SIZE, chunk_overlap=int(self.CHUNK_SIZE*self.CHUNK_OVERLAP))
        # create_documents() splits the text itself, so one pass is enough.
        docs = text_splitter.create_documents([combined_text])
        print(docs)
        if not docs:
            # Nothing to upload (no files given, or every page was skipped).
            return

        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(  #tunable
                name=self.index_name,
                dimension=768,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )
            )
        index = self.pc.Index(self.index_name)
        # Reuse the embedding model loaded in __init__ instead of loading it again.
        vector_store = PineconeVectorStore(index=index, embedding=self.embeddings)
        uuids = [str(uuid4()) for _ in range(len(docs))]
        vector_store.add_documents(documents=docs, ids=uuids)


    def generate(self, query):
        """Answer ``query`` from re-ranked context.

        Steps: suggest document names, retrieve chunks by similarity search,
        re-rank them against the original query, then ask the LLM with the best
        chunks as context.

        :param query: the employee's question
        :return: the full LLM response when ``self.verbose`` is true; otherwise
                 only the text of the "Provide the answer" step, falling back to
                 the full response when that step cannot be found
        """
        relevant_docs = self.get_relevant_docs(query)
        search_query = query
        if relevant_docs:
            # Only add the hint when at least one document name was recognised.
            search_query = query + " try to answer from " + relevant_docs

        retrieved_docs = self.format_docs_rerank(self.vector_store.similarity_search(search_query))
        reranked_docs = self.rerank(query, retrieved_docs)

        context = self.reformat_docs(reranked_docs)

        print("\n\nAnd now the re-ranked final context is: ", context)

        contextualised_query = "Context: \n" + context + "\n Question: \n" + query

        # Pass the text itself; wrapping it in a dict would put the dict's repr
        # (with escaped newlines) into the prompt.
        query_response = self.Employee_chain.invoke(contextualised_query)

        if self.verbose:
            return query_response
        match = re.search(r"\*\*Provide the answer\*\*: (.*?)(?:\n|$)", query_response)
        return match.group(1) if match else query_response

    #Implement as per paper 1 with self generated text and break down of question into subsequent parts
    def Augment_prompt(self, query):
        """Placeholder; not implemented yet."""
        pass


    def rerank(self, query, chunks, top_n=2):
        """
        Reranks chunks based on relevance to the query using Cohere's re-rank endpoint.

        :param query: The query string
        :param chunks: A list of chunk strings to rerank
        :param top_n: How many of the best-ranked chunks to return
        :return: A list of at most ``top_n`` chunk strings, in the order given by
                 the rerank response; an empty list when ``chunks`` is empty
        """
        print("The raw chunks received are: ")
        for chunk in chunks:
            print(chunk)
        if not chunks:
            # Nothing was retrieved, so there is nothing to send to Cohere.
            return []
        responses = self.Cohere_client.rerank(
            query=query,
            documents=chunks,
            top_n=len(chunks)  # Return scores for all chunks
        )

        # Each result carries the index of the chunk it refers to.
        ranked_chunks = [chunks[item.index] for item in responses.results]
        return ranked_chunks[:top_n]


    def get_relevant_docs(self, query):
        """Ask the LLM which documents best match ``query``.

        Known document names are searched for in the reply case-insensitively and
        as substrings, so surrounding punctuation does not hide a name.

        :param query: the employee's question
        :return: up to two document names joined by ", ", in the order the LLM
                 mentioned them; "" when no known name appears in the reply
        """
        llm_reply = self.get_relevant_docs_chain.invoke(query)
        reply_lower = llm_reply.lower()
        found = []
        for name in self.KNOWN_DOCUMENTS:
            position = reply_lower.find(name.lower())
            if position != -1:
                found.append((position, name))
        found.sort()
        return ", ".join(name for _, name in found[:2])



    #Helper functions:
    def format_docs(self, docs):
        """Join the ``page_content`` of the documents with blank lines."""
        return "\n\n".join([d.page_content for d in docs])

    def format_docs_rerank(self, docs):
        """Return the ``page_content`` of each document as a list of strings."""
        return [d.page_content for d in docs]

    def reformat_docs(self, docs):
        """Join a list of chunk strings with blank lines."""
        return "\n\n".join([d for d in docs])



In [10]:
# All eval questions:

#Init
bot = EmployeeChatBot()

# Questions 1-5, asked in order.
# Question 3 uses plain curly quotes: the former \“ and \” were invalid escape
# sequences that left literal backslashes in the text sent to the bot.
eval_questions = [
    "One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets?",
    "At City of Fond du Lac, what is the list of holidays that i can be compensated as working atleast 20 hours per week?",
    "What are the details i have to add in “Employee Information” section of the Employee Appraisal Form for the University of Texas, and what is the Rating Key they have provided?",
    "As per the Recruitment section of the employee handbook, what is my reward if i someone is recruited from my referral in a hard-to-fill role?",
    "What are my duties as a supervisor, before the start date, when onboarding new employees at the university of houston?",
]

for question in eval_questions:
    print("Question is: \n" + question)
    answer = bot.generate(question)
    print("\nAnswer is: \n" + answer)



Question is: 
One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If I win, can I accept the tickets?
The raw chunks received are: 
arios:  Q. One of Comerica's clients is hosting an open house that includes a raffle for some free airline tickets. If you win, can you accept the tickets?  A. No.  You must not accept giftsprizes from any person or entity that does business with Comerica, except as permitted by Comericas GiftPrize Policy.  Giftsprizes include, but are not limited to:   Favors, gratuities, or services  Discount or price concessions  Inheritances or loans made on preferential terms  Fees, compensation, securities, real property, or anything else of value, whether or not a skill was involved in winning the prize (e.g., low golf score)  If you do receive unsolicited giftsprizes of this nature that are impermissible under the GiftPrize Policy, you must either inform the party that, per Comerica's policy, you are unable to ac

In [11]:
# Eval questions 6-10. Uses the `bot` instance created in the earlier eval cell.
eval_questions_6_to_10 = [
    #Question6
    "What are the productivity measures if i want to work remotely and are there any meetings i have to attend if i am working remotely?",
    #Question7
    "While on the topic of cyber security, in what ways can i be exploited via Emails?",
    #Question8
    "what are the user access control guidelines for system access control policy of the company?",
    #Question9
    "What are the Unacceptable use scenarios of technology devices at workforce central?",
    #Question10
    "how can i create expense report procurement card for Concur Travel and Expense System?",
]

# Same print / generate / print sequence for every question.
for question in eval_questions_6_to_10:
    print("Question is: \n" + question)
    answer = bot.generate(question)
    print("\nAnswer is: \n" + answer)

Question is: 
What are the productivity measures if i want to work remotely and are there any meetings i have to attend if i am working remotely?
The raw chunks received are: 
            course corrections where needed.     10 Vice President Meetings Vice Presidents (VPs) are permitted to attend all meetings in person or virtually. VPs are               exempt from attending meetings when on approved PTO.  Weekly CEO Meeting Comp Time may not be taken during the weekly scheduled CEO meeting. Weekly All Staff Meeting Comp Time may not be taken during the weekly scheduled All Staff meeting.  Project Report Each week, all remote employees are required to submit a Project Report to their              supervisor. The report should outline: what youre working on, project updates, questions            andor approvals needed from the supervisor, and just in general things, the supervisor             needs to know. The report should be sent to the supervisor prior to your weekly meeting.      