# Get questions

In [1]:
import ast

# Read the evaluation questions. The file is expected to contain a Python
# list literal of strings (see the displayed output of this cell).
# The context manager closes the handle even if reading fails.
with open('questions.txt', 'r') as file:
    file_contents = file.read()

# literal_eval only accepts Python literals, so a modified questions.txt
# cannot execute arbitrary code the way eval() would.
questions = ast.literal_eval(file_contents)
questions

['How can I create a CDNAPSM Study case for the D-OTS?',
 'What is SP7?',
 'Who won the ellection?',
 'what is baps',
 'which quality codes can a value have?',
 'What are the corresponding numbers for each quality code?',
 'What are the numeric representations for input in the value?',
 'What information can you provide about the vau utility?',
 'What information can you provide about the "vau value utility"?',
 'How can I change an analog value using the "vau value utility"?',
 'How can I set a measured value to invalid using the "vau value utility"?',
 'How can I change the mark for a measured value using the "vau value utility"?',
 'What is Condense mode for a unit?',
 "What is the mode 'MRN' for units in OTS?",
 "What is the mode 'Must Run' for generators in TNA?",
 'in basidi, where can i find minimum and maximum values',
 'Which button should I click to access the minimum and maximum values in BASIDI?',
 'how to run updawipe ',
 'What is jROS?',
 'What are the key components of j

# Connect to Chroma 

In [2]:
import os
import chromadb
from chromadb.config import Settings
from tqdm import tqdm
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import find_dotenv, load_dotenv
import pandas as pd
from langchain.chat_models import ChatOpenAI
from tqdm import tqdm
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import AzureChatOpenAI


# Load environment variables
_ = load_dotenv(
    find_dotenv(raise_error_if_not_found=False)
)

# Target environment variable -> name under which the value is stored in the
# environment / .env file. Presumably the OPENAI_* names are what the
# langchain OpenAI wrappers read -- confirm against the installed version.
_ENV_ALIASES = {
    "OPENAI_API_TYPE": "api_type",
    "OPENAI_API_BASE": "api_base",
    "OPENAI_API_VERSION": "api_version",
    "OPENAI_API_KEY": "OPENAI_API_KEY",
}

# Fail early with a readable message: assigning None to os.environ would
# otherwise raise "TypeError: str expected, not NoneType".
_missing_env = [source for source in _ENV_ALIASES.values() if os.getenv(source) is None]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

for _target, _source in _ENV_ALIASES.items():
    os.environ[_target] = os.environ[_source]

# Embedding client used to embed queries for the retriever.
azure_embeddings = OpenAIEmbeddings(
    deployment= "text-embedding-ada-002",
    chunk_size= 1, 
    openai_api_type='azure'
)

CHROMADB_IP = "localhost"
CHROMADB_PORT = "8000"
url_base = f"http://{CHROMADB_IP}:{CHROMADB_PORT}"


# HTTP client for the Chroma server at CHROMADB_IP:CHROMADB_PORT.
client = chromadb.HttpClient(
    host=CHROMADB_IP,
    port=CHROMADB_PORT,
    settings=Settings(allow_reset=True, anonymized_telemetry=False),
)

# Listing the collections doubles as a connectivity check.
print("List all Collections: ")
print("--------------------")
for coll in client.list_collections():
    print(coll.name)


List all Collections: 
--------------------
questoins
20231010_3b807404-4554-5ea0-ba55-d0e35d46a3de
full_data
questions
confluence_SWRD
confluence_USGCSCD
main_SP7v2.30Q3
confluence_GDRDSP7
confluence_SIGCUI
Ordering_full_data
PI_questions
hypo_questions
AS_full_data
confluence_MTKA
confluence_GGU
PI_full_data
Scripts
PL_full_data
sharepoint_SP7
20231109_7c188e59-e156-5640-a731-b00a1523f3a3
confluence_SP7_all.30Q3
confluence_GCENGRGSP7
test
confluence_GCCOMM
TR_full_data
confluence_SP7MKTAOPS
AS_questions


In [3]:
# Chat model backed by the Azure deployment named "gpt-35-turbo".
model = AzureChatOpenAI(deployment_name="gpt-35-turbo")

# Prompt text with two placeholders: the retrieved context and the question.
template = (
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [7]:
from typing import Any, Dict, List, Optional
from langchain_core.callbacks.manager import Callbacks
from langchain_core.retrievers import BaseRetriever
from langchain.vectorstores import Chroma
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.callbacks.manager import AsyncCallbackManagerForRetrieverRun
import multiprocessing
from langchain_core.documents import Document

from langchain_community.vectorstores.utils import maximal_marginal_relevance

import numpy as np


class CustomChroma(Chroma):
    """Chroma vector store whose search also returns the stored embeddings.

    The embeddings are needed to run maximal marginal relevance (MMR) on the
    query results and to copy the selected records into another collection.
    """

    def set_collection(self, chroma_dictionary):
        """Point this store at a fresh collection named "temp" holding the given records.

        Args:
            chroma_dictionary: mapping with the keys "ids", "documents",
                "metadatas" and "embeddings"; the values are passed unchanged
                to the collection's ``add`` call.
        """
        try:
            self._client.delete_collection(name="temp")
        except Exception:
            # Best effort: presumably the collection simply does not exist yet.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            pass
        self._collection = self._client.create_collection("temp")
        self._collection.add(
            ids=chroma_dictionary["ids"],
            documents=chroma_dictionary["documents"],
            metadatas=chroma_dictionary["metadatas"],
            embeddings=chroma_dictionary["embeddings"],
        )

    def to_langchain_document(self, chroma_doc_list):
        """Convert the first result row of a Chroma-style result into Documents.

        Args:
            chroma_doc_list: mapping whose "documents" and "metadatas" entries
                are nested lists; only index 0 of each is read.

        Returns:
            A list of ``Document`` objects, one per text in
            ``chroma_doc_list["documents"][0]``.
        """
        texts = chroma_doc_list["documents"][0]
        metadatas = chroma_doc_list["metadatas"][0]
        return [
            # A record stored without metadata may come back as None; fall back
            # to an empty dict so Document construction does not fail on it.
            Document(page_content=text, metadata=metadata or {})
            for text, metadata in zip(texts, metadatas)
        ]

    def search_by_vector(self, query_vector, k_filter=20, k=5, where=None, where_document=None, include=None):
        """Query the collection by vector and narrow the hits with MMR.

        Args:
            query_vector: embedding of the query.
            k_filter: number of results requested from the collection.
            k: number of results kept by maximal marginal relevance.
            where: metadata filter forwarded to the collection query.
            where_document: document filter forwarded to the collection query.
            include: result fields to request. Defaults to
                ["documents", "metadatas", "embeddings"], all of which are
                required to build the return value.

        Returns:
            A dict with the keys "ids", "embeddings", "documents" and
            "metadatas", each a one-element list holding the selected rows.

        Raises:
            ValueError: if ``include`` omits one of the required fields.
        """
        required = ("documents", "metadatas", "embeddings")
        if include is None:
            # Build the default per call instead of sharing one mutable list.
            include = list(required)
        missing = [field for field in required if field not in include]
        if missing:
            raise ValueError(
                "search_by_vector needs these fields in `include`: " + ", ".join(missing)
            )

        # `__query_collection` is a private method of Chroma; from inside this
        # subclass it is only reachable through its name-mangled form.
        results = self._Chroma__query_collection(
            query_embeddings=query_vector,
            n_results=k_filter,
            where=where,
            where_document=where_document,
            include=include,
        )
        candidate_embeddings = results["embeddings"][0]
        mmr_selected = maximal_marginal_relevance(
            np.array(query_vector, dtype=np.float32),
            candidate_embeddings,
            k=k,
            lambda_mult=0.5,
        )

        def pick(field):
            # Plain list indexing: no need to push strings/dicts through numpy.
            rows = results[field][0]
            return [rows[index] for index in mmr_selected]

        return {
            "ids": [pick("ids")],
            # Normalise every selected embedding to a plain list of floats.
            "embeddings": [[np.asarray(candidate_embeddings[index]).tolist() for index in mmr_selected]],
            "documents": [pick("documents")],
            "metadatas": [pick("metadatas")],
        }



class GridChatRetriver(BaseRetriever):
    """Retriever that searches several vector stores and re-ranks the merged hits.

    Every store in ``vectorstores`` is queried with the same query embedding,
    the per-store results are merged into a temporary collection, and a final
    ``search_by_vector`` call on that collection picks the returned documents.
    """

    # Stores to query; each must provide ``search_by_vector``.
    vectorstores: list
    # Embedding model; only ``embed_query`` is called on it.
    embedding : Any
    # Number of documents kept by the final search over the merged candidates.
    k1 : int
    # Number of documents kept per store (passed as ``k``).
    k2 : int
    # Number of results requested from each store before MMR (passed as ``k_filter``).
    k_filter : int

    def background_task(self, vectorstore, query_vector, k_filter, k2):
        """Run one store's search; executed inside a worker process."""
        return vectorstore.search_by_vector(query_vector, k_filter=k_filter, k=k2)


    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Return the documents selected for ``query`` across all vector stores.

        Args:
            query: the user question.
            run_manager: callback manager supplied by the caller (unused here).

        Returns:
            The documents produced by the final search, or an empty list when
            no store returned any candidate.
        """
        # Embed before creating the pool so a failing embedding call cannot
        # leave worker processes behind.
        query_vector = self.embedding.embed_query(query)

        tasks = [(vec, query_vector, self.k_filter, self.k2) for vec in self.vectorstores]

        pool = multiprocessing.Pool()
        try:
            results = pool.starmap(self.background_task, tasks)
        finally:
            # Always release the workers, also when a search raises.
            pool.close()
            pool.join()

        merged = {"ids": [], "metadatas": [], "documents": [], "embeddings": []}
        for result in results:
            for field, values in merged.items():
                values.extend(result[field][0])

        if not merged["ids"]:
            # Nothing was retrieved, so there is nothing to add or re-rank.
            return []

        # NOTE(review): ids from different collections are merged as-is;
        # confirm they cannot collide.
        temp_vectorstore = CustomChroma(
            client = chromadb.Client()
        )
        temp_vectorstore.set_collection(merged)

        # The final search relies on search_by_vector's default k_filter.
        return temp_vectorstore.to_langchain_document(
            temp_vectorstore.search_by_vector(query_vector=query_vector, k=self.k1)
        )

    async def _aget_relevant_documents(
        self,
        query: str,
        *,
        run_manager: AsyncCallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        """Asynchronous retrieval is not supported."""
        raise NotImplementedError()
    

## Using the GridChat retriever

In [8]:
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import RetrievalQA

# Chroma collections that the retriever searches.
collection_names = [
    "confluence_SWRD", "confluence_USGCSCD", "confluence_GDRDSP7",
    "confluence_SIGCUI", "confluence_MTKA", "confluence_GGU",
    "confluence_GCENGRGSP7",
    "PI_full_data", "AS_full_data", "PL_full_data", "TR_full_data",
    "confluence_GCCOMM", "confluence_SP7MKTAOPS",
]

# One CustomChroma wrapper per collection, all sharing the same client.
vectorstores = [
    CustomChroma(client=client, collection_name=collection_name)
    for collection_name in collection_names
]

gridchat_retriver = GridChatRetriver(
    embedding=azure_embeddings,
    vectorstores=vectorstores,
    k_filter=20,  # results requested from each collection before MMR
    k2=2,         # documents kept per collection after MMR
    k1=5,         # documents returned by the final search over the merged candidates
)



# testing retriever

In [9]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

def create_context(prompt):
    """Build the context string for a question.

    Fetches the documents for ``prompt`` from ``gridchat_retriver`` and
    concatenates their page contents, each followed by three newlines.
    """
    retrieved = gridchat_retriver.get_relevant_documents(prompt)
    return "".join(doc.page_content + "\n\n\n" for doc in retrieved)

def get_df():
    """Answer every entry of ``questions`` and collect the results in a DataFrame.

    For each question the context is built with ``create_context`` and the
    chain made from ``model`` and ``prompt`` is applied to it.

    Returns:
        A DataFrame with the columns "question", "contexts" (the context
        string used for the question) and "answer" (the chain's "text" output).
    """
    # The chain is the same for every question, so build it once.
    llm = LLMChain(llm=model, prompt=prompt)

    answers = []
    contexts = []
    # Wrap the list itself rather than enumerate(...): tqdm can then read the
    # length and show real progress instead of a bare iteration counter.
    for question in tqdm(questions):
        context = create_context(question)
        output = llm.apply(input_list=[{
            "context" : context,
            "question" : question
        }])
        contexts.append(context)
        answers.append(output[0]["text"])

    data = {
        "question" : questions,
        "contexts" : contexts,
        "answer": answers
    }
    return pd.DataFrame(data)

# Run the whole question set through retriever + model, then save the
# resulting table as an Excel workbook.
output_df = get_df()
output_df.to_excel(excel_writer="default.xlsx")

53it [04:03,  4.59s/it]


## evaluating the Ragas metrics

In [None]:
from ragas.metrics import (
    context_relevancy,
    answer_relevancy,
    faithfulness,
)
from ragas.metrics.critique import harmfulness
from ragas import evaluate
from datasets import Dataset

metrics = [faithfulness, answer_relevancy, context_relevancy, harmfulness]

# Ragas expects `contexts` to hold a list of strings per row, whereas
# output_df stores one concatenated string per question. Wrap each string in
# a single-element list; values that are already sequences are kept as lists.
ragas_columns = {
    "question": output_df["question"].tolist(),
    "contexts": [
        [context] if isinstance(context, str) else list(context)
        for context in output_df["contexts"]
    ],
    "answer": output_df["answer"].tolist(),
}

# from_dict takes a plain mapping of column name -> list of values.
gridchat_data = Dataset.from_dict(ragas_columns)
result = evaluate(
    gridchat_data,
    metrics=metrics,
    column_map = { "question": "question","contexts": "contexts","answer": "answer"}
)