In [1]:
from pypdf import PdfReader 
  
# creating a pdf reader object 
reader = PdfReader('data/Walnut Softech Lionel Paul.pdf') 
  
# printing number of pages in pdf file 
print(len(reader.pages)) 
  
# getting a specific page from the pdf file 
page = reader.pages[0] 
  
# extracting text from page 
text = page.extract_text() 


5



In [3]:
from splitter import chunk
from document_loader import loader
from embedding import embedding
import pysqlite3
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from langchain_chroma import Chroma

def create_vector_store(texts, embedding_type):
    """
    Create a vector store from documents based on the specified embedding type.

    Args:
        texts (list): List of texts to create vectors for.
        embedding_type (str): Type of embedding to use. 'open_ai' for OpenAI embeddings,
                              'gemini' for Google Generative AI embeddings.

    Returns:
        Chroma: Vector store created using the specified embedding type.
    
    Raises:
        ValueError: If an unsupported embedding type is provided.
    """
    if embedding_type == 'open_ai':
        openai_embedding = embedding.load_embedding('open_ai')
        db = Chroma.from_documents(texts, openai_embedding, persist_directory= "vector_store")
        return db
    
    elif embedding_type == 'gemini':
        gemini_embedding = embedding.load_embedding('gemini')
        db = Chroma.from_documents(texts, gemini_embedding, persist_directory= "./data")
    
    
    else:
        raise ValueError(f"Unsupported embedding type: {embedding_type}")
    

if __name__ == "__main__":

    # Load the document pdf/text/folder of docs
    docs = loader.load_docs('data/delio', type='folder')

    # Split the loaded document into chunks based on character/tokens
    split_texts = chunk.split(docs, 'character')

    # Create an embedded vector store from the split texts using the openai/gemini embedding type
    embedded_vector_store = create_vector_store(split_texts, embedding_type='gemini')

    create_vector_store(split_texts, embedding_type='gemini')


In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from splitter import chunk
from document_loader import loader
from embedding import embedding
from langchain_text_splitters import CharacterTextSplitter

gemini_embedding = embedding.load_embedding('gemini')

directory_path = 'data/poc_files'
pdf_files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.pdf')]

docs = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(directory_path, pdf_file)
    loader = PyPDFLoader(pdf_file)
    pages = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1552, chunk_overlap=0)
    split_pages = text_splitter.split_documents(pages)
    docs.extend(split_pages)
db = FAISS.from_documents(docs, gemini_embedding)
print(db.index.ntotal)
db.save_local("data/jll_poc_vs")


131


In [7]:
embedded_vector_store

In [4]:
embedded_vector_store.as_retriever().invoke('ROLES & SECURITY PROPERTIES')

AttributeError: 'NoneType' object has no attribute 'as_retriever'

In [22]:
# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
gemini_embedding = embedding.load_embedding('gemini')

loader = DirectoryLoader('/workspaces/rag-virtusa/data/delio', glob="**/*.pdf")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1552, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
db = FAISS.from_documents(docs, gemini_embedding)
print(db.index.ntotal)
db.save_local("data/faiss")

208


In [None]:
from langchain_community.vectorstores import FAISS
db = FAISS.from_documents(docs, gemini_embedding)

In [18]:
db.as_retriever().invoke('What is Delio')

[Document(metadata={'source': '/workspaces/rag-virtusa/data/delio/Dealio User Guide.pdf'}, page_content='110\n\n119\n\n3\n\nDealio User Guide\n\nINTRODUCTION\n\nDealio is a SOX compliant deal management system that tracks and reports on pipelines for several business units, including Capital Markets, Brokerage, Canada Brokerage, Retail and Corporate Solutions. Specific capabilities include:\n\n• • • • Revenue recording • Budget tracking • Deal tracking and prospecting • Report Generation • Document storage\n\nPipeline management Commission calculation Invoice creation Collection management\n\nCHAPTER 1: DEALIO ACCESS\n\nJLL currently uses Okta’s Single Sign-On (SSO) authentication process which allow users to gain access to multiple applications with one set of credentials – your network username and password.\n\nIf you have a JLL issued laptop that is connected to the corporate network, your login to Dealio will be automatic. If you are connected to Dealio remotely, you will be prompt

In [7]:
import cv2.typing

ModuleNotFoundError: No module named 'cv2.typing'; 'cv2' is not a package

In [1]:
from typing import Union
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, DirectoryLoader

def load_docs(path: str, doc_type: str) -> Union[str, list]:
    """
    Load documents from specified path based on document type.

    Args:
        path (str): Path to the document(s) or folder containing documents.
        doc_type (str): Type of document(s) to load. Supported types are 'text', 'pdf', or 'folder'.

    Returns:
        Union[str, list]: Loaded document(s) as a string for single documents ('text', 'pdf'),
                          or a list of strings for multiple text files ('folder').
    
    Raises:
        ValueError: If an unsupported document type is provided.
    """
    if doc_type == 'text':
        loader = TextLoader(path)
    elif doc_type == 'pdf':
        loader = PyMuPDFLoader(path)
    elif doc_type == 'folder':
        loader = DirectoryLoader(path, glob="**/*.txt", loader_cls=TextLoader)
    else:
        raise ValueError(f"Unsupported document type: {doc_type}")

    loaded = loader.load()
    
    return loaded


In [2]:
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, DirectoryLoader
loader = PyMuPDFLoader('data/sample_doc.pdf')

In [3]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [4]:
loader.load()[0]

Document(metadata={'source': 'data/sample_doc.pdf', 'file_path': 'data/sample_doc.pdf', 'page': 0, 'total_pages': 8, 'format': 'PDF 1.7', 'title': '', 'author': 'Jaccarino, Marcus', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20231228181808-05'00'", 'modDate': "D:20240104133558+05'30'", 'trapped': ''}, page_content=' \n \n \nBSC – RhythmCare POC SOW \nSTATEMENT OF WORK  \nStatement of Work No. 01 \nRhythmCare POC \nBY AND BETWEEN \nBOSTON SCIENTIFIC CORPORATION \nAND \nVIRTUSA CORPORATION \n \nThis Statement of Work No 01 (“Statement of Work” or “SOW”), is entered into as of December 26, 2023 (“SOW \nEffective Date”) pursuant to the Master Services Agreement between Boston Scientific Corporation (the “Client” \nor “BSC”) and Virtusa Corporation (the “Vendor” or Service Provider”), with an effective date of October 01, \n2022 (the “Agreement”). For the purpose of this SOW, Client Shall

In [2]:
# Importing necessary modules and functions from different components
from agents import agents
from document_loader import loader
from embedding import embedding
from models import load_model
from retriever import retriever
from vector_store import create_vector_store
from splitter import chunk
from tools import tools
from prompt_templates import prompts

# Load the document pdf/text/folder of docs
docs = loader.load_docs('data/sample_doc.pdf', type='pdf')

# Split the loaded document into chunks based on character/tokens
split_texts = chunk.split(docs, 'character')

# Create an embedded vector store from the split texts using the openai/gemini embedding type
embedded_vector_store = create_vector_store.create_vector_store(split_texts, embedding_type='gemini')

# Define the input query for retrieval
query = 'what is the scope of this SOW'

# Retrieve relevant context from the embedded vector store based on the query
retrieve_context = retriever.retrive(embedded_vector_store, query, retrieval_type='mmr')

# Define the prompt for the language model
prompt = prompts._generate_prompt(query, retrieve_context)

# Load the language model of type 'gemini'/openai
llm = load_model.load_llm(type='gemini')

# # Invoke the language model with the combined prompt and query to get the output
# output = llm.invoke(prompt + query)


  from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1722861925.220695    1921 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


AttributeError: 'NoneType' object has no attribute 'as_retriever'

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x78f731401c90>)

In [15]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage

contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, embedded_vector_store.as_retriever(), contextualize_q_prompt
)


qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


chat_history = []

question = "What is the scope of the project"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg_1["answer"]])

In [17]:
print(ai_msg_1['answer'])

The scope of this SOW is to: 
* Create a Proof-Of-Concept (POC) to demonstrate a virtual assistant that can assist the BSC’s field representatives to respond to questions based on current/latest version of IFU (Information for Use) and DFU (Design for Use) documents. 
* Develop a Virtual assistant to provide interactive multi-lingual text-based chat experience to the end-user, with an initial scope to support English & Spanish languages (depending upon multilingual support by zammo.ai). 
* Integrate a Chatbot experience based on Field Hospital work profiles, limited to FCS (Field Clinical Specialist). Create a system experience to be enhanced to create additional personas.  
* Integration of the Chatbot and Generative AI models will include manuals & guidelines to be used from publicly available Boston Scientific Consulting web content. 
* Development of the Virtual assistant that is limited to a “responsive web application only” that can be used from mobile and desktop, to ascertain t

In [4]:
from embedding import embedding
import pysqlite3
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from langchain_chroma import Chroma

def create_vector_store(texts, embedding_type):
    """
    Create a vector store from documents based on the specified embedding type.

    Args:
        texts (list): List of texts to create vectors for.
        embedding_type (str): Type of embedding to use. 'open_ai' for OpenAI embeddings,
                              'gemini' for Google Generative AI embeddings.

    Returns:
        Chroma: Vector store created using the specified embedding type.
    
    Raises:
        ValueError: If an unsupported embedding type is provided.
    """
    if embedding_type == 'open_ai':
        openai_embedding = embedding.load_embedding('open_ai')
        db = Chroma.from_documents(texts, openai_embedding, persist_directory= "vector_store")
        return db
    
    elif embedding_type == 'gemini':
        gemini_embedding = embedding.load_embedding('gemini')
        db = Chroma.from_documents(texts, gemini_embedding, persist_directory= "vector_store/test.db")
    
        return db
    
    else:
        raise ValueError(f"Unsupported embedding type: {embedding_type}")


In [5]:
from embedding import embedding
gemini_embedding = embedding.load_embedding('gemini')
vectordb = Chroma(persist_directory='./data', embedding_function=gemini_embedding)

In [6]:
vectordb.as_retriever().invoke('What is delio')

[Document(metadata={'author': 'Jaccarino, Marcus', 'creationDate': "D:20231228181808-05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': 'data/sample_doc.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240104133558+05'30'", 'page': 1, 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'data/sample_doc.pdf', 'subject': '', 'title': '', 'total_pages': 8, 'trapped': ''}, page_content='BSC – RhythmCare POC SOW \n• \nDevelopment of a standalone Generative AI application leveraging zammo.ai platform and LLMs \navailable through the zammo.ai platform, limited to document context provided by BSC. \n• \nDevelopment will consist of using Open AI, foundation LLMs and other appropriate cloud services \navailable through zammo.ai platform. \n• \nTune the chat responses by working closely with the identified set of POC users and enhance the \nprompts, guardrails to achieve the outcome based on the feedback by the POC users for a period of two \nweeks \n• \nReport

In [33]:
from splitter import chunk
from document_loader import loader
from vector_store import create_vector_store
# Load the document pdf/text/folder of docs
docs = loader.load_docs('data/sample_doc.pdf', type='pdf')

# Split the loaded document into chunks based on character/tokens
split_texts = chunk.split(docs, 'character')

# Create an embedded vector store from the split texts using the openai/gemini embedding type
embedded_vector_store = create_vector_store.create_vector_store(split_texts, embedding_type='gemini')


In [1]:
gemini_embedding = embedding.load_embedding('gemini')
db = Chroma.from_documents(split_texts, gemini_embedding, persist_directory= "./vector_store")

NameError: name 'embedding' is not defined

In [28]:
embedded_vector_store._persist_directory

'./chroma'

In [18]:
second_question = "can you elaborate on the second point"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

The second point in the scope outlines the development of a **multi-lingual text-based virtual assistant**, focusing on these key aspects:

* **Interactive Chat:** The virtual assistant will primarily interact with users through a text-based chat interface, similar to popular chatbot experiences.
* **Multi-lingual Support:**  It will be capable of understanding and responding in both English and Spanish. However, this is dependent on the capabilities of the "zammo.ai" platform they are using. If "zammo.ai" doesn't support Spanish, the multi-lingual aspect might be limited.
* **Target Users:** While not explicitly stated, the context suggests this virtual assistant is aimed at supporting Boston Scientific's field representatives, helping them access information quickly and efficiently. 

Essentially, this point highlights the core functionality of the project: a chatbot-like tool that can understand and respond to user queries in multiple languages. 



In [43]:
ai_msg_2['context'][0].metadata

{'author': 'Jaccarino, Marcus',
 'creationDate': "D:20231228181808-05'00'",
 'creator': 'Microsoft® Word for Microsoft 365',
 'file_path': 'data/sample_doc.pdf',
 'format': 'PDF 1.7',
 'keywords': '',
 'modDate': "D:20240104133558+05'30'",
 'page': 0,
 'producer': 'Microsoft® Word for Microsoft 365',
 'source': 'data/sample_doc.pdf',
 'subject': '',
 'title': '',
 'total_pages': 8,
 'trapped': ''}

In [None]:
from langchain.memory import ChatMessageHistory

In [19]:
import os
from typing import List, Iterable, Any

from dotenv import load_dotenv
from langchain.memory import ChatMessageHistory
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables.history import RunnableWithMessageHistory

# from basic_chain import get_model
# from rag_chain import make_rag_chain


def create_memory_chain(llm, chat_memory):
    contextualize_q_system_prompt = """Given a chat history and the latest user question \
        which might reference context in the chat history, formulate a standalone question \
        which can be understood without the chat history. Do NOT answer the question, \
        just reformulate it if needed and otherwise return it as is."""

    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{question}"),
        ]
    )

    runnable = contextualize_q_prompt | llm 

    def get_session_history(session_id: str) -> BaseChatMessageHistory:
        return chat_memory

    with_message_history = RunnableWithMessageHistory(
        runnable,
        get_session_history,
        input_messages_key="question",
        history_messages_key="chat_history",
    )
    return with_message_history


class SimpleTextRetriever(BaseRetriever):
    docs: List[Document]
    """Documents."""

    @classmethod
    def from_texts(
            cls,
            texts: Iterable[str],
            **kwargs: Any,
    ):
        docs = [Document(page_content=t) for t in texts]
        return cls(docs=docs, **kwargs)

    def _get_relevant_documents(
            self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        return self.docs

In [20]:
create_memory_chain(llm, chat_memory={})

RunnableWithMessageHistory(bound=RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  chat_history: RunnableBinding(bound=RunnableLambda(_enter_history), config={'run_name': 'load_history'})
}), config={'run_name': 'insert_history'})
| RunnableBranch(branches=[(RunnableBinding(bound=RunnableLambda(_is_not_async), config={'run_name': 'RunnableWithMessageHistoryInAsyncMode'}), RunnableBinding(bound=ChatPromptTemplate(input_variables=['chat_history', 'question'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question         which might reference context in the chat history, 

In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.memory.chat_message_histories.in_memory import ChatMessageHistory
from langchain.schema import messages_from_dict, messages_to_dict
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain, ConversationChain
import json

In [5]:
original_chain = ConversationChain(
    llm=llm,
    verbose=True,
    memory=ConversationBufferMemory()
)

  warn_deprecated(


In [None]:
Chroma(persist_directory='data',embedding_function=embedding)

In [17]:
output = original_chain.run(prompt)



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: You are a helpful bot that will answer the user's query
                <query>
                what is the scope of this SOW
                </query>

                based on the given context 
                <context>
                [Document(metadata={'author': 'Jaccarino, Marcus', 'creationDate': "D:20231228181808-05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': 'data/sample_doc.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240104133558+05'30'", 'page': 0, 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'data/sample_doc.pdf', 'subject': '', 'title': '', 'total_pages': 8, 'trapped': ''}, pag

In [18]:
output

'The scope of the SOW is to: \n\n* Create a Proof-Of-Concept (POC) to demonstrate a virtual assistant that can assist the BSC’s field representatives to respond to questions based on current/latest version of IFU (Information for Use) and DFU (Design for Use) documents. \n* Develop a Virtual assistant to provide interactive multi-lingual text-based chat experience to the end-user, with an initial scope to support English & Spanish languages (depending upon multilingual support by zammo.ai). \n* Integrate a Chatbot experience based on Field Hospital work profiles, limited to FCS (Field Clinical Specialist). Create a system experience to be enhanced to create additional personas.  \n* Integration of the Chatbot and Generative AI models will include manuals & guidelines to be used from publicly available Boston Scientific Consulting web content. \n* Development of the Virtual assistant that is limited to a “responsive web application only” that can be used from mobile and desktop, to asce

In [9]:
original_chain.run(prompt)
extracted_messages = original_chain.memory.chat_memory.messages
ingest_to_db = messages_to_dict(extracted_messages)



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: You are a helpful bot that will answer the user's query
                <query>
                what is the scope of this SOW
                </query>

                based on the given context 
                <context>
                [Document(metadata={'author': 'Jaccarino, Marcus', 'creationDate': "D:20231228181808-05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': 'data/sample_doc.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240104133558+05'30'", 'page': 0, 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'data/sample_doc.pdf', 'subject': '', 'title': '', 'total_pages': 8, 'trapped': ''}, pag

In [10]:
ingest_to_db

[{'type': 'human',
  'data': {'content': 'You are a helpful bot that will answer the user\'s query\n                <query>\n                what is the scope of this SOW\n                </query>\n\n                based on the given context \n                <context>\n                [Document(metadata={\'author\': \'Jaccarino, Marcus\', \'creationDate\': "D:20231228181808-05\'00\'", \'creator\': \'Microsoft® Word for Microsoft 365\', \'file_path\': \'data/sample_doc.pdf\', \'format\': \'PDF 1.7\', \'keywords\': \'\', \'modDate\': "D:20240104133558+05\'30\'", \'page\': 0, \'producer\': \'Microsoft® Word for Microsoft 365\', \'source\': \'data/sample_doc.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 8, \'trapped\': \'\'}, page_content=\'BSC – RhythmCare POC SOW \\nSTATEMENT OF WORK  \\nStatement of Work No. 01 \\nRhythmCare POC \\nBY AND BETWEEN \\nBOSTON SCIENTIFIC CORPORATION \\nAND \\nVIRTUSA CORPORATION \\n \\nThis Statement of Work No 01 (“Statement of Work” or “SO

In [15]:
message = messages_from_dict(ingest_to_db)

In [16]:
ChatMessageHistory(messages = message)

InMemoryChatMessageHistory(messages=[HumanMessage(content='You are a helpful bot that will answer the user\'s query\n                <query>\n                what is the scope of this SOW\n                </query>\n\n                based on the given context \n                <context>\n                [Document(metadata={\'author\': \'Jaccarino, Marcus\', \'creationDate\': "D:20231228181808-05\'00\'", \'creator\': \'Microsoft® Word for Microsoft 365\', \'file_path\': \'data/sample_doc.pdf\', \'format\': \'PDF 1.7\', \'keywords\': \'\', \'modDate\': "D:20240104133558+05\'30\'", \'page\': 0, \'producer\': \'Microsoft® Word for Microsoft 365\', \'source\': \'data/sample_doc.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 8, \'trapped\': \'\'}, page_content=\'BSC – RhythmCare POC SOW \\nSTATEMENT OF WORK  \\nStatement of Work No. 01 \\nRhythmCare POC \\nBY AND BETWEEN \\nBOSTON SCIENTIFIC CORPORATION \\nAND \\nVIRTUSA CORPORATION \\n \\nThis Statement of Work No 01 (“Statemen

In [8]:
original_chain.run(prompt)
extracted_messages = original_chain.memory.chat_memory.messages
ingest_to_db = messages_to_dict(extracted_messages)
retrieve_from_db = json.loads(json.dumps(ingest_to_db))
retrieved_messages = messages_from_dict(retrieve_from_db)
retrieved_chat_history = ChatMessageHistory(messages=retrieved_messages)
retrieved_memory = ConversationBufferMemory(chat_memory=retrieved_chat_history)

reloaded_chain = ConversationChain(
    llm=llm,
    verbose=True,
    memory=retrieved_memory
)
reloaded_chain.run('what about Javascript and Ruby')

  warn_deprecated(




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: You are a helpful bot that will answer the user's query
                <query>
                what is the scope of this SOW
                </query>

                based on the given context 
                <context>
                [Document(metadata={'author': 'Jaccarino, Marcus', 'creationDate': "D:20231228181808-05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': 'data/sample_doc.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240104133558+05'30'", 'page': 0, 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'data/sample_doc.pdf', 'subject': '', 'title': '', 'total_pages': 8, 'trapped': ''}, pa

"I'm sorry, but the provided text does not mention anything about Javascript or Ruby. Therefore, I cannot answer your question based on the given context. \n"

In [8]:
original_chain(prompt)

  warn_deprecated(




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: You are a helpful bot that will answer the user's query
                <query>
                what is the scope of this SOW
                </query>

                based on the given context 
                <context>
                [Document(metadata={'author': 'Jaccarino, Marcus', 'creationDate': "D:20231228181808-05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': 'data/sample_doc.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240104133558+05'30'", 'page': 0, 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'data/sample_doc.pdf', 'subject': '', 'title': '', 'total_pages': 8, 'trapped': ''}, pa

{'input': 'You are a helpful bot that will answer the user\'s query\n                <query>\n                what is the scope of this SOW\n                </query>\n\n                based on the given context \n                <context>\n                [Document(metadata={\'author\': \'Jaccarino, Marcus\', \'creationDate\': "D:20231228181808-05\'00\'", \'creator\': \'Microsoft® Word for Microsoft 365\', \'file_path\': \'data/sample_doc.pdf\', \'format\': \'PDF 1.7\', \'keywords\': \'\', \'modDate\': "D:20240104133558+05\'30\'", \'page\': 0, \'producer\': \'Microsoft® Word for Microsoft 365\', \'source\': \'data/sample_doc.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 8, \'trapped\': \'\'}, page_content=\'BSC – RhythmCare POC SOW \\nSTATEMENT OF WORK  \\nStatement of Work No. 01 \\nRhythmCare POC \\nBY AND BETWEEN \\nBOSTON SCIENTIFIC CORPORATION \\nAND \\nVIRTUSA CORPORATION \\n \\nThis Statement of Work No 01 (“Statement of Work” or “SOW”), is entered into as of Dece

[HumanMessage(content='You are a helpful bot that will answer the user\'s query\n                <query>\n                what is the scope of this SOW\n                </query>\n\n                based on the given context \n                <context>\n                [Document(metadata={\'author\': \'Jaccarino, Marcus\', \'creationDate\': "D:20231228181808-05\'00\'", \'creator\': \'Microsoft® Word for Microsoft 365\', \'file_path\': \'data/sample_doc.pdf\', \'format\': \'PDF 1.7\', \'keywords\': \'\', \'modDate\': "D:20240104133558+05\'30\'", \'page\': 0, \'producer\': \'Microsoft® Word for Microsoft 365\', \'source\': \'data/sample_doc.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 8, \'trapped\': \'\'}, page_content=\'BSC – RhythmCare POC SOW \\nSTATEMENT OF WORK  \\nStatement of Work No. 01 \\nRhythmCare POC \\nBY AND BETWEEN \\nBOSTON SCIENTIFIC CORPORATION \\nAND \\nVIRTUSA CORPORATION \\n \\nThis Statement of Work No 01 (“Statement of Work” or “SOW”), is entered int

[{'type': 'human',
  'data': {'content': 'You are a helpful bot that will answer the user\'s query\n                <query>\n                what is the scope of this SOW\n                </query>\n\n                based on the given context \n                <context>\n                [Document(metadata={\'author\': \'Jaccarino, Marcus\', \'creationDate\': "D:20231228181808-05\'00\'", \'creator\': \'Microsoft® Word for Microsoft 365\', \'file_path\': \'data/sample_doc.pdf\', \'format\': \'PDF 1.7\', \'keywords\': \'\', \'modDate\': "D:20240104133558+05\'30\'", \'page\': 0, \'producer\': \'Microsoft® Word for Microsoft 365\', \'source\': \'data/sample_doc.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 8, \'trapped\': \'\'}, page_content=\'BSC – RhythmCare POC SOW \\nSTATEMENT OF WORK  \\nStatement of Work No. 01 \\nRhythmCare POC \\nBY AND BETWEEN \\nBOSTON SCIENTIFIC CORPORATION \\nAND \\nVIRTUSA CORPORATION \\n \\nThis Statement of Work No 01 (“Statement of Work” or “SO

[{'type': 'human',
  'data': {'content': 'You are a helpful bot that will answer the user\'s query\n                <query>\n                what is the scope of this SOW\n                </query>\n\n                based on the given context \n                <context>\n                [Document(metadata={\'author\': \'Jaccarino, Marcus\', \'creationDate\': "D:20231228181808-05\'00\'", \'creator\': \'Microsoft® Word for Microsoft 365\', \'file_path\': \'data/sample_doc.pdf\', \'format\': \'PDF 1.7\', \'keywords\': \'\', \'modDate\': "D:20240104133558+05\'30\'", \'page\': 0, \'producer\': \'Microsoft® Word for Microsoft 365\', \'source\': \'data/sample_doc.pdf\', \'subject\': \'\', \'title\': \'\', \'total_pages\': 8, \'trapped\': \'\'}, page_content=\'BSC – RhythmCare POC SOW \\nSTATEMENT OF WORK  \\nStatement of Work No. 01 \\nRhythmCare POC \\nBY AND BETWEEN \\nBOSTON SCIENTIFIC CORPORATION \\nAND \\nVIRTUSA CORPORATION \\n \\nThis Statement of Work No 01 (“Statement of Work” or “SO

In [5]:
llm.invoke(prompt)

AIMessage(content='The scope of the SOW is to:\n\n* Create a Proof-Of-Concept (POC) to demonstrate a virtual assistant that can assist BSC’s field representatives to respond to questions based on current/latest version of IFU (Information for Use) and DFU (Design for Use) documents. \n* Develop a Virtual assistant to provide interactive multi-lingual text-based chat experience to the end-user, with an initial scope to support English & Spanish languages (depending upon multilingual support by zammo.ai). \n* Integrate a Chatbot experience based on Field Hospital work profiles, limited to FCS (Field Clinical Specialist). Create a system experience to be enhanced to create additional personas.  \n* Integration of the Chatbot and Generative AI models will include manuals & guidelines to be used from publicly available Boston Scientific Consulting web content. \n* Development of the Virtual assistant that is limited to a “responsive web application only” that can be used from mobile and des

In [1]:
import numpy as np

In [19]:
x = np.array([[1,2,3],[1,5,6]])

In [30]:
y=np.array([2,2,2])

In [38]:
y.reshape(1,-1)

array([[2, 2, 2]])

In [14]:
np.matmul(x,y)

array([12, 30])

In [20]:
x_t = x.transpose()

In [15]:
x_t

array([[1, 4],
       [2, 5],
       [3, 6]])

In [21]:
product = np.matmul(x_t, x)

In [23]:
product

array([[ 2,  7,  9],
       [ 7, 29, 36],
       [ 9, 36, 45]])

In [22]:
np.linalg.inv(product)

array([[-2.81474977e+14, -2.81474977e+14,  2.81474977e+14],
       [-2.81474977e+14, -2.81474977e+14,  2.81474977e+14],
       [ 2.81474977e+14,  2.81474977e+14, -2.81474977e+14]])

In [45]:
import numpy as np
def linear_regression_normal_equation(X: list[list[float]], y: list[float]) -> list[float]:
    X = np.array(X)
    y = np.array(y)
    X_t = X.transpose()
    left = np.linalg.inv(np.matmul(X_t,X))
    # print(X.shape, y.shape)

    temp = np.matmul(left,X_t)
    right = np.matmul(temp,y)
    
    theta = np.dot(left,right)
    return theta

In [46]:
print(linear_regression_normal_equation([[1, 1], [1, 2], [1, 3]], [1, 2, 3]))

(3, 2) (3,)
[-1.77635684e-15  1.00000000e+00]
