In [None]:
# %pip install typing-inspect==0.8.0
# %pip install typing-extensions==4.5.0
%pip install chromadb==0.4.15
%pip install langchain==0.0.332
%pip install sentence_transformers

In [1]:
import os
from typing import List

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, CohereEmbeddings
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.vectorstores import Chroma
from openai import OpenAI
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import logging

# Configure the root logger with its default handler so records are emitted.
logging.basicConfig()

# Raise the multi-query retriever's logger to INFO; its "Generated queries"
# records are logged at that level.
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [3]:
import json
def generate_document(api_doc_path: str, api_example_path: str, api_name: str):
    """Render the documentation entry of one API as a text document.

    Args:
        api_doc_path: Path to a JSON file with a top-level ``"ToolList"`` list.
            Each entry is read through the keys ``"API Name"``,
            ``"API Description"`` and ``"API arguments"``; each argument through
            ``"Argument Name"``, ``"Argument Description"`` and
            ``"Argument Type"``.
        api_example_path: Currently unused; kept so existing callers keep
            working.
        api_name: Value of ``"API Name"`` to look up.

    Returns:
        The rendered text for every entry whose name equals ``api_name``
        (concatenated in file order), or ``''`` when no entry matches.
    """
    # JSON is read as UTF-8 explicitly; relying on the platform default
    # encoding can mis-decode non-ASCII characters in the descriptions.
    with open(api_doc_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NOTE(review): the labels "API Argumet" (typo) and "Return Type" (filled
    # from 'Argument Type') look unintended. They are left untouched because
    # the rendered text is what gets embedded and stored; confirm before
    # changing them.
    parts = []
    for tool in data['ToolList']:
        if tool['API Name'] != api_name:
            continue
        parts.append(f"##API Name: {tool['API Name']} \n")
        parts.append(f"###Description: {tool['API Description']}\n\n")
        parts.append("###Arguments: \n\n")
        for argument in tool['API arguments']:
            parts.append(f"API Argumet: {argument['Argument Name']}\n")
            parts.append(f"Argument Description: {argument['Argument Description']}\n")
            parts.append(f"Return Type: {argument['Argument Type']}\n")

    return ''.join(parts)

In [4]:
# Collect the name of every tool declared in the API documentation file.
# The file is decoded as UTF-8 explicitly instead of with the platform default
# encoding, since the descriptions contain non-ASCII characters.
with open('../api_documentation.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
api_list = [tool['API Name'] for tool in data['ToolList']]

In [5]:
# Display the collected API names.
api_list

['works_list',
 'summarize_objects',
 'prioritize_objects',
 'add_work_items_to_sprint',
 'get_sprint_id',
 'get_similar_work_items',
 'search_object_by_name',
 'create_actionable_tasks_from_text',
 'who_am_i']

In [6]:
# Parallel lists handed to the vector stores: document text, metadata and ID
# for each API, in the order of api_list.
# NOTE(review): `id` shadows the builtin; the name is kept because other cells
# reference it.
doc, meta, id = [], [], []
for position, api_name in enumerate(api_list):
    rendered = generate_document('../api_documentation.json', '../examples.json', api_name)
    doc.append(rendered)
    meta.append({"API": api_name})
    id.append(f"ID{position}")

print(doc)
print(meta)
print(id)

['##API Name: works_list \n###Description: Returns a list of work items matching the request\n\n###Arguments: \n\nAPI Argumet: applies_to_part\nArgument Description: Filters for work belonging to any of the provided parts\nReturn Type: array of strings\nAPI Argumet: created_by\nArgument Description: Filters for work created by any of these users\nReturn Type: array of strings\nAPI Argumet: issue.priority\nArgument Description: Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2, p3\nReturn Type: array of strings\nAPI Argumet: issue.rev_orgs\nArgument Description: Filters for issues with any of the provided Rev organizations\nReturn Type: array of strings\nAPI Argumet: limit\nArgument Description: The maximum number of works to return. The default is 50\nReturn Type: integer(int32)\nAPI Argumet: owned_by\nArgument Description: Filters for work owned by any of these users\nReturn Type: array of strings\nAPI Argumet: stage.name\nArgument Description: Filters

In [7]:
# Display the generated documents (the same list the previous cell printed).
doc

['##API Name: works_list \n###Description: Returns a list of work items matching the request\n\n###Arguments: \n\nAPI Argumet: applies_to_part\nArgument Description: Filters for work belonging to any of the provided parts\nReturn Type: array of strings\nAPI Argumet: created_by\nArgument Description: Filters for work created by any of these users\nReturn Type: array of strings\nAPI Argumet: issue.priority\nArgument Description: Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2, p3\nReturn Type: array of strings\nAPI Argumet: issue.rev_orgs\nArgument Description: Filters for issues with any of the provided Rev organizations\nReturn Type: array of strings\nAPI Argumet: limit\nArgument Description: The maximum number of works to return. The default is 50\nReturn Type: integer(int32)\nAPI Argumet: owned_by\nArgument Description: Filters for work owned by any of these users\nReturn Type: array of strings\nAPI Argumet: stage.name\nArgument Description: Filters

In [20]:
# The OpenAI key is read from the environment so that no secret is stored in
# the notebook; a missing variable fails here with a KeyError instead of a
# NameError in a later cell.
# NOTE(review): a key literal was previously committed on this line; treat it
# as compromised and rotate it.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [9]:
# Pydantic model with a single field, `lines`, holding a list of strings.
# Documented with `#` comments rather than a class docstring.
# NOTE(review): presumably a docstring would surface in the model schema; confirm before converting.
class LineList(BaseModel):
    lines: List[str] = Field(description="Lines of text")

class LineListOutputParser(PydanticOutputParser):
    """Output parser that turns an LLM reply into a ``LineList``, one entry per line."""

    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        """Split ``text`` into its non-blank lines.

        Args:
            text: Raw text returned by the language model.

        Returns:
            A ``LineList`` whose ``lines`` are the lines of ``text`` with
            surrounding whitespace removed. Blank and whitespace-only lines
            are dropped, so the list is empty for blank input.
        """
        # Models often separate items with empty lines; without filtering,
        # each empty string would be passed on as a query of its own.
        # splitlines() also handles "\r\n" line endings.
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return LineList(lines=lines)

# Parser that converts the raw model reply into a LineList.
output_parser = LineListOutputParser()

# Prompt asking the model to break the user's utterance into sub-queries;
# {question} is the only template variable.
QUERY_PROMPT = PromptTemplate(
    template="""You are a instructor your job is to break a query into smaller parts and provide it to worker. Given a conversation utterance by a user, ignore all the non-query part and try to break the main query into smaller steps. Don't include multiple steps, just whatever the query is trying to address. Output only the sub queries step by step and nothing else.
    Original question: {question}""",
    input_variables=["question"],
)

# Chat model used to generate the sub-queries.
llm = ChatOpenAI(temperature=0)

# Chain wiring: prompt -> chat model -> line parser.
llm_chain = LLMChain(
    llm=llm,
    prompt=QUERY_PROMPT,
    output_parser=output_parser,
)

#### HuggingFace

In [10]:
# Embedding function backed by the sentence-transformers model
# "BAAI/bge-base-en-v1.5".
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="BAAI/bge-base-en-v1.5",
)

# Persistent Chroma client rooted at ./hf_db (relative to the working directory).
client_hf = chromadb.PersistentClient(path="./hf_db")

# Reuse the collection if it already exists, otherwise create it.
collection_hf = client_hf.get_or_create_collection(
    name="hf_check_1",
    embedding_function=sentence_transformer_ef,
)

In [11]:
# upsert (rather than add) keeps this cell idempotent: re-running it against
# the persistent store refreshes the documents stored under these IDs instead
# of leaving the previously stored versions in place.
collection_hf.upsert(
    documents=doc,
    metadatas=meta,
    ids=id,
)



In [None]:
# Print the number of records in the collection, then display the result of peek().
# NOTE(review): peek() presumably includes raw embedding vectors, which makes
# the cell output very long — confirm and consider trimming before sharing.
print(collection_hf.count())
collection_hf.peek()

In [12]:
# LangChain embedding wrapper; uses the same model name as the embedding
# function the "hf_check_1" collection was created with.
embeddings_hf = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# LangChain view over the collection persisted in ./hf_db.
vectorstore_hf = Chroma(
    persist_directory="./hf_db",
    collection_name="hf_check_1",
    embedding_function=embeddings_hf,
)

In [13]:
# parser_key="lines" matches the name of the LineList field that holds the
# generated sub-queries.
retriever_from_llm = MultiQueryRetriever(
    retriever=vectorstore_hf.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines",
)

# Retrieve API documents for a sample multi-step request.
unique_docs_hf = retriever_from_llm.get_relevant_documents(
    query='Summarize the work items owned by user "Bob" with low severity, then search for similar work items and prioritize them.'
)

print(len(unique_docs_hf))
unique_docs_hf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
INFO:langchain.retrievers.multi_query:Generated queries: ['Sub query 1: Summarize the work items owned by user "Bob" with low severity.', 'Sub query 2: Search for similar work items.', 'Sub query 3: Prioritize the similar work items.']


6


[Document(page_content='##API Name: works_list \n###Description: Returns a list of work items matching the request\n\n###Arguments: \n\nAPI Argumet: applies_to_part\nArgument Description: Filters for work belonging to any of the provided parts\nReturn Type: array of strings\nAPI Argumet: created_by\nArgument Description: Filters for work created by any of these users\nReturn Type: array of strings\nAPI Argumet: issue.priority\nArgument Description: Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2, p3\nReturn Type: array of strings\nAPI Argumet: issue.rev_orgs\nArgument Description: Filters for issues with any of the provided Rev organizations\nReturn Type: array of strings\nAPI Argumet: limit\nArgument Description: The maximum number of works to return. The default is 50\nReturn Type: integer(int32)\nAPI Argumet: owned_by\nArgument Description: Filters for work owned by any of these users\nReturn Type: array of strings\nAPI Argumet: stage.name\nArgumen

In [21]:
# Print the first retrieved document's text so its embedded newlines render as line breaks.
print(unique_docs_hf[0].page_content)

##API Name: works_list 
###Description: Returns a list of work items matching the request

###Arguments: 

API Argumet: applies_to_part
Argument Description: Filters for work belonging to any of the provided parts
Return Type: array of strings
API Argumet: created_by
Argument Description: Filters for work created by any of these users
Return Type: array of strings
API Argumet: issue.priority
Argument Description: Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2, p3
Return Type: array of strings
API Argumet: issue.rev_orgs
Argument Description: Filters for issues with any of the provided Rev organizations
Return Type: array of strings
API Argumet: limit
Argument Description: The maximum number of works to return. The default is 50
Return Type: integer(int32)
API Argumet: owned_by
Argument Description: Filters for work owned by any of these users
Return Type: array of strings
API Argumet: stage.name
Argument Description: Filters for records in the provid

#### OpenAI

In [16]:
# NOTE(review): absolute, machine-specific path. It must stay identical to the
# persist_directory given to the Chroma vector store for this collection.
client_openai = chromadb.PersistentClient(
    path="/home/navcore/Downloads/InterIIT/openai_db"
)

# Embedding function that calls OpenAI's "text-embedding-ada-002" model.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-ada-002",
)

# Reuse the collection if it already exists, otherwise create it.
collection_openai = client_openai.get_or_create_collection(
    name="openai_check_1",
    embedding_function=openai_ef,
)

In [17]:
# upsert (rather than add) keeps this cell idempotent: re-running it against
# the persistent store refreshes the documents stored under these IDs instead
# of leaving the previously stored versions in place.
collection_openai.upsert(
    documents=doc,
    metadatas=meta,
    ids=id,
)

In [None]:
# Print the number of records in the collection, then display the result of peek().
# NOTE(review): peek() presumably includes raw embedding vectors, which makes
# the cell output very long — confirm and consider trimming before sharing.
print(collection_openai.count())
collection_openai.peek()

In [19]:
# LangChain embedding wrapper; uses the same model name as the embedding
# function the "openai_check_1" collection was created with.
openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
    disallowed_special=(),
)

# LangChain view over the persisted collection.
# NOTE(review): absolute, machine-specific path; it must match the path the
# PersistentClient for this collection was opened with.
vectorstore_openai = Chroma(
    persist_directory="/home/navcore/Downloads/InterIIT/openai_db",
    collection_name="openai_check_1",
    embedding_function=openai_embeddings,
)

In [20]:
# parser_key="lines" matches the name of the LineList field that holds the
# generated sub-queries.
retriever_from_llm = MultiQueryRetriever(
    retriever=vectorstore_openai.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines",
)

# Retrieve API documents for a sample multi-step request.
unique_docs_openai = retriever_from_llm.get_relevant_documents(
    query='Summarize the work items owned by user "Bob" with low severity, then search for similar work items and prioritize them.'
)

print(len(unique_docs_openai))
unique_docs_openai

INFO:langchain.retrievers.multi_query:Generated queries: ['Sub query 1: Summarize the work items owned by user "Bob" with low severity.', 'Sub query 2: Search for similar work items.', 'Sub query 3: Prioritize the similar work items.']


6


[Document(page_content='##API Name: summarize_objects \n###Description: Summarizes a list of objects. The logic of how to summarize a particular object type is an internal implementation detail.\n\n###Arguments: \n\nAPI Argumet: objects\nArgument Description: List of objects to summarize\nReturn Type: array of objects\n', metadata={'API': 'summarize_objects'}),
 Document(page_content='##API Name: get_similar_work_items \n###Description: Returns a list of work items that are similar to the given work item\n\n###Arguments: \n\nAPI Argumet: work_id\nArgument Description: The ID of the work item for which you want to ﬁnd similar items\nReturn Type: string\n', metadata={'API': 'get_similar_work_items'}),
 Document(page_content='##API Name: works_list \n###Description: Returns a list of work items matching the request\n\n###Arguments: \n\nAPI Argumet: applies_to_part\nArgument Description: Filters for work belonging to any of the provided parts\nReturn Type: array of strings\nAPI Argumet: cr

#### Cohere

In [21]:
# NOTE(review): absolute, machine-specific path. It must stay identical to the
# persist_directory given to the Chroma vector store for this collection.
client_cohere = chromadb.PersistentClient(path="/home/navcore/Downloads/InterIIT/cohere_db")

# The Cohere key is read from the environment so that no secret is stored in
# the notebook.
# NOTE(review): a key literal was previously committed here; treat it as
# compromised and rotate it.
cohere_ef = embedding_functions.CohereEmbeddingFunction(
    api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-english-light-v3.0",
)

# Reuse the collection if it already exists, otherwise create it.
collection_cohere = client_cohere.get_or_create_collection(
    name="cohere_check_1",
    embedding_function=cohere_ef,
)

In [22]:
# upsert (rather than add) keeps this cell idempotent: re-running it against
# the persistent store refreshes the documents stored under these IDs instead
# of leaving the previously stored versions in place.
collection_cohere.upsert(
    documents=doc,
    metadatas=meta,
    ids=id,
)

In [23]:
# Print the number of records in the collection, then display the result of peek().
# NOTE(review): the peek() result includes the raw embedding vectors, which
# makes the cell output very long; consider trimming it before sharing.
print(collection_cohere.count())
collection_cohere.peek()

9


{'ids': ['ID0', 'ID1', 'ID2', 'ID3', 'ID4', 'ID5', 'ID6', 'ID7', 'ID8'],
 'embeddings': [[0.005859375,
   -0.038269043,
   -0.08068848,
   -0.04574585,
   0.008850098,
   -0.0040245056,
   -0.07348633,
   -0.029815674,
   -0.04562378,
   -0.0044898987,
   0.035583496,
   -0.0413208,
   -0.0077552795,
   -0.037475586,
   0.05706787,
   0.0077552795,
   0.0010480881,
   0.057861328,
   0.0056915283,
   -0.1026001,
   0.13391113,
   0.037719727,
   0.04458618,
   0.021453857,
   -0.07946777,
   -0.06149292,
   -0.04055786,
   0.054138184,
   0.032806396,
   0.046142578,
   0.014129639,
   0.041900635,
   0.1171875,
   0.076538086,
   0.05984497,
   0.0064888,
   -0.109313965,
   0.05050659,
   -0.060272217,
   0.055145264,
   0.10205078,
   -0.036956787,
   -0.07287598,
   -0.008239746,
   -0.06951904,
   0.03363037,
   -0.08129883,
   -0.04864502,
   0.020004272,
   0.0680542,
   0.016815186,
   0.002609253,
   -0.055633545,
   0.07720947,
   -0.014160156,
   -0.032470703,
   0.046295166

In [24]:
# LangChain embedding wrapper; uses the same model name as the embedding
# function the "cohere_check_1" collection was created with. The key is read
# from the environment so that no secret is stored in the notebook.
# NOTE(review): a key literal was previously committed here; treat it as
# compromised and rotate it.
cohere_embeddings = CohereEmbeddings(
    model="embed-english-light-v3.0",
    cohere_api_key=os.environ["COHERE_API_KEY"],
)

# LangChain view over the persisted collection.
# NOTE(review): absolute, machine-specific path; it must match the path the
# PersistentClient for this collection was opened with.
vectorstore_cohere = Chroma(
    collection_name="cohere_check_1",
    embedding_function=cohere_embeddings,
    persist_directory="/home/navcore/Downloads/InterIIT/cohere_db",
)

In [25]:
# parser_key="lines" matches the name of the LineList field that holds the
# generated sub-queries.
retriever_from_llm = MultiQueryRetriever(
    retriever=vectorstore_cohere.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines",
)

# Retrieve API documents for a sample multi-step request.
unique_docs_cohere = retriever_from_llm.get_relevant_documents(
    query='Summarize the work items owned by user "Bob" with low severity, then search for similar work items and prioritize them.'
)

print(len(unique_docs_cohere))
unique_docs_cohere

INFO:langchain.retrievers.multi_query:Generated queries: ['Sub query 1: Summarize the work items owned by user "Bob" with low severity.', 'Sub query 2: Search for similar work items.', 'Sub query 3: Prioritize the similar work items.']


6


[Document(page_content='##API Name: works_list \n###Description: Returns a list of work items matching the request\n\n###Arguments: \n\nAPI Argumet: applies_to_part\nArgument Description: Filters for work belonging to any of the provided parts\nReturn Type: array of strings\nAPI Argumet: created_by\nArgument Description: Filters for work created by any of these users\nReturn Type: array of strings\nAPI Argumet: issue.priority\nArgument Description: Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2, p3\nReturn Type: array of strings\nAPI Argumet: issue.rev_orgs\nArgument Description: Filters for issues with any of the provided Rev organizations\nReturn Type: array of strings\nAPI Argumet: limit\nArgument Description: The maximum number of works to return. The default is 50\nReturn Type: integer(int32)\nAPI Argumet: owned_by\nArgument Description: Filters for work owned by any of these users\nReturn Type: array of strings\nAPI Argumet: stage.name\nArgumen