The purpose of this notebook is to explore the use of open source models such as "facebook/opt-125m" and "neuralmagic/Llama-2-7b-chat-quantized.w8a8", these models are relatively small in size and can be used from my g4dn.2xlarge instance.
In addition I compare the output from the open source model to openai.
The opensource model is loaded with vllm serving

In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"]='True'

In [2]:
!pip uninstall datasets -y 

Found existing installation: datasets 2.21.0
Uninstalling datasets-2.21.0:
  Successfully uninstalled datasets-2.21.0


In [3]:
!pip install transformers torch -q
!pip install langchain -q
!pip install -U langchain-community -q
!pip install python-dotenv openai -q
!pip3 install pysqlite3-binary -q
!pip install -U sentence-transformers -q
!pip install "datasets==2.21.0" -q


In [4]:
import boto3
import os
def download_file_from_s3(bucket_name, s3_file_key):
    # download files to local environment
    # Create an S3 client
    s3 = boto3.client('s3')
    local_file_path = s3_file_key.split('/')[-1]
    # Download the file from S3
    s3.download_file(bucket_name, s3_file_key, local_file_path)
    print(f"File {s3_file_key} downloaded from {bucket_name} to {local_file_path}")

def delete_file(file_path):
    os.remove(file_path)

In [5]:
import re
import string
import numpy as np
import pandas as pd
import torch

_RE_COMBINE_WHITESPACE = re.compile(r"[ ]+", re.ASCII)
_RE_SHORT_LINES = re.compile("^.{1,3}\n", re.MULTILINE)
_RE_MULTILINE_BREAKS = re.compile("\n+", re.MULTILINE)
_RE_PAGE_CHAR = "\x0c"
_RE_LATIN_WHITESPACE_CHAR = re.compile("\xa0", re.ASCII)


# @markdown  - **clean_text** - clean text spaces,non-printable and line breaks
def clean_text(text):
    """Clean text from several white-space and line-breaks"""
    # remove several line breaks
    text = _RE_LATIN_WHITESPACE_CHAR.sub(" ", text)
    # remove several white spaces
    text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
    # remove very short lines
    text = _RE_SHORT_LINES.sub("\n", text)
    # remove several line breaks
    text = _RE_MULTILINE_BREAKS.sub("\n", text)
    # remove unknown characters or non printable
    text = "".join([x for x in text if x in string.printable])

    return text.strip()

In [6]:
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List

class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        # print(documents)
        return [self.model.encode(d).tolist() for d in documents]

    def embed_query(self, query: str) -> List[float]:
        return self.model.encode([query])[0].tolist()

  from tqdm.autonotebook import tqdm, trange


## Load the dataset

In [7]:
#for the test data I will use some (parsed) files from here s3://contract-intelligence-data/client-data/AAA/NY State Insurance/06-FRM-AR1/ 
# these are files of good quality

download_file_from_s3("contract-intelligence-data", "client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2624_2024_163320/FRM-AR117-21-1230-2624_2024_163320.json")
download_file_from_s3("contract-intelligence-data", "client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2638_2024_162334/FRM-AR117-21-1230-2638_2024_162334.json")
download_file_from_s3("contract-intelligence-data", "client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-22-1252-6330_2024_16400/FRM-AR117-22-1252-6330_2024_16400.json")

download_file_from_s3("contract-intelligence-data","client-data/AAA/NY State Insurance/04-RPT-INIT/17-22-1250-8464/17-22-1250-8464.json")
download_file_from_s3("contract-intelligence-data","client-data/dragados/ol-elevated-guideway-and-stations-dmca-redacted-version.json")

File client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2624_2024_163320/FRM-AR117-21-1230-2624_2024_163320.json downloaded from contract-intelligence-data to FRM-AR117-21-1230-2624_2024_163320.json
File client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2638_2024_162334/FRM-AR117-21-1230-2638_2024_162334.json downloaded from contract-intelligence-data to FRM-AR117-21-1230-2638_2024_162334.json
File client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-22-1252-6330_2024_16400/FRM-AR117-22-1252-6330_2024_16400.json downloaded from contract-intelligence-data to FRM-AR117-22-1252-6330_2024_16400.json
File client-data/AAA/NY State Insurance/04-RPT-INIT/17-22-1250-8464/17-22-1250-8464.json downloaded from contract-intelligence-data to 17-22-1250-8464.json
File client-data/dragados/ol-elevated-guideway-and-stations-dmca-redacted-version.json downloaded from contract-intelligence-data to ol-elevated-guideway-and-stations-dmca-redacted-version.json


In [8]:
import json
import glob
from tqdm import tqdm

def read_files(docs_dir: str):
    files = glob.glob(os.path.join(docs_dir,"*.json"), recursive=True)
    print(f"Total number of docs: {len(files)}")
    return files

def compose_dataset(docs_dir: str):
    files = read_files(docs_dir)
    print(files)
    # Read & Load the Dataset
    dataset = []
    for file in tqdm(files):
        # data in json format after ocr
        with open(file) as f:
            pdoc = json.load(f)
        dataset.append(pdoc)

    return dataset

In [9]:
dataset = compose_dataset(".")  

Total number of docs: 6
['./ol-elevated-guideway-and-stations-dmca-redacted-version.json', './FRM-AR117-21-1230-2624_2024_163320.json', './17-22-1250-8464.json', './test.json', './FRM-AR117-21-1230-2638_2024_162334.json', './FRM-AR117-22-1252-6330_2024_16400.json']


100%|██████████| 6/6 [00:00<00:00, 109.77it/s]


In [10]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import CohereEmbeddings, OpenAIEmbeddings
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI

In [11]:
# The rag part, based on the one in LINT api

DEFAULT_CHUNK_SIZE = 3500  #1400 (had to reduce to fit into the facebook/opt-125m model)
DEFAULT_CHUNK_OVERLAP = 500
EMBEDDING_MODEL = "text-embedding-ada-002"#I will still use openai for embeddings
# next step can also try and replace the embeddings for opensource ones
LLM_MODEL_OPENAI = "gpt-3.5-turbo"
vector_db_path = './chroma_db'

SENTENCE_TRANSFORMER_MODEL = "multi-qa-mpnet-base-cos-v1"

In [12]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
import openai
path_to_keys = 'keys.env'
temp = dotenv_values(path_to_keys)
openai_api_key = temp["OPENAI_API_KEY"]

### lets put the data to chroma db

In [13]:
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [14]:
# !pip install hnswlib==0.7.0 -q
# !pip install chroma-hnswlib==0.7.3 -q
# !pip uninstall hnswlib chroma-hnswlib -y

In [15]:
%pip install chromadb==0.5 tiktoken -q

Note: you may need to restart the kernel to use updated packages.


In [16]:
from langchain.vectorstores import Chroma

def put_in_Chroma(doc_pages, doc_name, embedding_type="openai"):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
    doc = [
                Document(page_content=clean_text(page), metadata={"page": i, "doc_name": doc_name})
                for i, page in enumerate(doc_pages)
            ]
    chunks = text_splitter.split_documents(doc)

    print('chunks: ', len(chunks))
    # Retrieve embedding function from code env resources
    
    if embedding_type == "openai":
        print("Using OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    else:
        print("Using Sentence Transformer embeddings")
        embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)

    # Index the vector database by embedding then inserting document chunks
    db = Chroma.from_documents(chunks,
                            embedding=embeddings,
                            ids=[str(i) for i in range(len(chunks))],
                            persist_directory=vector_db_path)

    # Save vector database as persistent files in the output folder
    return db

In [19]:
# file_name = 'FRM-AR117-21-1230-2624_2024_163320'
# file_name = 'FRM-AR117-21-1230-2638_2024_162334'
file_name = 'FRM-AR117-22-1252-6330_2024_16400'
file_name = '17-22-1250-8464'
# file_name = 'ol-elevated-guideway-and-stations-dmca-redacted-version'

In [20]:
%%time
for i in dataset:
    if i['name'] == file_name:
        doc_pages = i['text']
        break
print('pages: ', len(doc_pages))
db = put_in_Chroma(doc_pages, doc_name=file_name, embedding_type='transformer')

pages:  8
chunks:  8
Using Sentence Transformer embeddings


CPU times: user 693 ms, sys: 40 ms, total: 733 ms
Wall time: 1.65 s


In [68]:
def get_gpt_llm():
    chat_params = {
        "model": "gpt-3.5-turbo", # Bigger context window
        "openai_api_key": openai_api_key,
        "temperature": 0.000001, 
    }
    llm = ChatOpenAI(**chat_params)
    return llm

def qa_retriever_openai(query, vector_db_path, file_id, k=4, embeddings_type="openai"):
    if embeddings_type == "openai":
        print("Using OpenAI Embeddings")
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    else:
        print("Using Sentence Transformer Embeddings")
        embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)
    vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings, collection_name='my_collection')

    retriever = vectordb.as_retriever(search_kwargs={"k": k, "filter": {"doc_name": file_id}})

    qa = RetrievalQA.from_chain_type(llm=get_gpt_llm(), chain_type="stuff", 
                                    retriever=retriever, return_source_documents=True)
    res = qa({"query": query, "k": k})
    return res, retriever

In [89]:
embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)
vectordb = Chroma(persist_directory='./chroma_db', embedding_function=embeddings, collection_name='my_collection')
# file_name = 'DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement'
file_name = 'PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT '
# file_name = 'CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT'
# file_name = 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement'
retriever = vectordb.as_retriever(search_kwargs={"k": 3, "filter": {"doc_name": file_name}})

In [90]:
qa_chain = RetrievalQA.from_chain_type(
    llm=get_gpt_llm(),  # Specify your language model here
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True  # Include source documents in the response
)

# Step 5: Ask a question on the filtered documents
query = "What are the main topics covered in this document?"
response = qa_chain({"query": query})
response

{'query': 'What are the main topics covered in this document?',
 'result': 'The main topics covered in this document include definitions of terms used in the agreement such as "Product(s)", "Promotional Materials", "Quarter", "Regulatory Authority", "Royalty Cap", "Supply Agreement", "Term", "Territory", "Third Party", "Trademarks", "Transition Services and Inventory Agreement", "Vial", "Year", "Control", "Copyrights", "Current Base Price", "DEA", "Distribution Rights", "Domain Name", "EKR Improvement", "Endo/PPI Unit Sales", "Endo Product", "Effective Date", "FDA", "Field", "Force Majeure", "Improvements", "Joint Improvements", and "Known In-Channel Product Units." The document also covers sections related to the submission of promotional materials, pre-launch and post-launch activities, marketing activities, sub-distributors, and more.',
 'source_documents': [Document(metadata={'doc_name': 'PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT '

In [64]:
question = "Who are the parties?"
question = "When is this agreement entered into?"
# question = "When is this agreement entered into can you give me a quote for evidence?"

# question = "What type of form is that?"

answer, retriever = \
    qa_retriever_openai(question, vector_db_path="./chroma_db", \
    file_id=file_name, k=4, embeddings_type="transformer")

Using Sentence Transformer Embeddings


In [28]:
answer

{'query': 'When is this agreement entered into?',
 'k': 4,
 'result': 'The agreement mentioned in the text was not explicitly stated to have a specific date of entry.',
 'source_documents': [Document(metadata={'doc_name': '17-22-1250-8464', 'page': 0}, page_content='10 04 21\nDate: Gh j poh INITIAL, EXAMINATION REPORT Ds Out: ee) al\n- ASA Resta. OF THE IMPACT THE PATIENT SUSTAINED INIURIES TO;\net DB tbow + Ofoot {/RT GIT\n; cic Spine Wrist} RT HLT. , Tbbdomen G RT YLT\nFAR ROCKAWAY MEDICAL PC\n4014A Beston Rd, Bronx, er vivo\nLY OF P! r ONIONS\nOr. OMs: JA 2 year old Op, vito was\ninvolved in an accifient; the details of the accident sa discussed with the patient. _\nit stale of\nAccording to the information presented by Oshe was in a reg\ngood and was capable of living ef an eal basis with others of @his Cher age,\nbefore a, involved in a Efhotor vehicle Cslip and fall Owork. ident,\nwh is Cher symptoms The patient was the Cdrivr enger of the\n-Cbacke st ofthe vehicle Gefith Llwitho

In [24]:
print('Openai answer: ', answer['result'])

Openai answer:  This agreement is effective as from the date first stated above, which is the DMCA Effective Date. Unfortunately, there is no specific date provided in the context for the DMCA Effective Date.


ran in terminal: `vllm serve neuralmagic/Llama-2-7b-chat-quantized.w8a8 --chat-template templates/template_chatml.jinja`

In [None]:
inference_server_url = "http://localhost:8000/v1"

# MODEL = "facebook/opt-125m"
# MODEL = "neuralmagic/Llama-2-7b-chat-quantized.w8a8"
MODEL = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    
llm = ChatOpenAI(
    model=MODEL,
    openai_api_key="EMPTY",
    openai_api_base=inference_server_url,
    max_tokens=100,
    temperature=0,
)

In [None]:
def qa_retriever_llama(query, vector_db_path, file_id, k=4):
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings, collection_name='my_collection')

    retriever = vectordb.as_retriever(search_kwargs={"k": k, "filter": {"doc_name": file_id}})

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", 
                                    retriever=retriever, return_source_documents=True)
    res = qa({"query": query})
    return res, retriever

In [None]:
%%time
question = "Who are the parties?"
question = "When is this agreement entered into?"

answer_llama, retriever = qa_retriever_llama(question, vector_db_path="/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db", file_id=file_name, k=4)

In [None]:
print(answer_llama)
print(answer_llama['result'])

In [None]:
text = '\n--\n'.join([i.page_content for i in answer['source_documents']])
print(text)

In [None]:
from langchain_core.prompts.prompt import PromptTemplate

In [None]:
llm = ChatOpenAI(
    model=MODEL,
    openai_api_key="EMPTY",
    openai_api_base=inference_server_url,
    max_tokens=200,
    temperature=0,
)

In [None]:
prompt = """You are an AI assistant, use the following text to provide answer if you don't know, say you don't know
        Context: {context}
        Question: {question}
        Be concise and short in your response.
"""

# context = text
question = "Who are the parties?"
# question = "Where did the accident occur?"
# question = "What is the date of the accident?"
# question = "Was the denial of claim based on late notice to the carrier?"
# question = "Who is the insurer?"
# question = "What type of form is that?"

# file_name = 'FRM-AR117-22-1252-6330_2024_16400'

vector_db_path = "/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db"
my_prompt = PromptTemplate(template=prompt, input_variables=["context", "question"])
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 4, "filter": {"doc_name": file_name}})

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, 
                                chain_type="stuff", 
                                retriever=retriever, 
                                return_source_documents=True,
                                chain_type_kwargs={"prompt": my_prompt})

In [None]:
%%time
llama_answer = qa.invoke(question)

In [None]:
print(llama_answer)
print(llama_answer['result'])

In [None]:
my_prompt = PromptTemplate(template=prompt, input_variables=["context", "question"])
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings, collection_name='my_collection')
retriever = vectordb.as_retriever(search_kwargs={"k": 4, "filter": {"doc_name": file_name}})
qa = RetrievalQA.from_chain_type(llm=get_gpt_llm(), 
                                chain_type="stuff", 
                                retriever=retriever, 
                                return_source_documents=True,
                                chain_type_kwargs={"prompt": my_prompt})

In [None]:
%%time
openai_answer = qa.invoke(question)

In [None]:
print(openai_answer)
print(openai_answer['result'])

In [None]:
db._collection.get(include=["metadatas","documents"])

In [18]:
import torch
torch.cuda.empty_cache()

--------

In [20]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
number_gpus = 1
max_model_len = 8192

sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

tokenizer = AutoTokenizer.from_pretrained(model_id)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len, dtype=torch.float16)

INFO 10-07 08:06:25 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantize

OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 40.81 MiB is free. Including non-PyTorch memory, this process has 14.71 GiB memory in use. Of the allocated memory 14.09 GiB is allocated by PyTorch, and 16.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [None]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "write a poem about waterlilies"},
]

prompts = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
outputs = llm.generate(prompts, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)

-------

Metrics will be calculated with CUAD data

In [18]:
import chromadb
def check_collection_exists(collection_name):
    client = chromadb.Client()
    existing_collections = client.list_collections()
    collection_exists = any(collection.name == collection_name for collection in existing_collections)
    return collection_exists

In [59]:
from langchain.vectorstores import Chroma

def put_in_Chroma(doc_pages, doc_name, embedding_type="openai"):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
    doc = [
                Document(page_content=clean_text(page), metadata={"page": i, "doc_name": doc_name})
                for i, page in enumerate(doc_pages)
            ]
    chunks = text_splitter.split_documents(doc)

    print('chunks: ', len(chunks))
    # Retrieve embedding function from code env resources
    
    if embedding_type == "openai":
        print("Using OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    else:
        print("Using Sentence Transformer embeddings")
        embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)

    db = Chroma(
            collection_name='my_collection',
            embedding_function=embeddings,
            persist_directory=vector_db_path
        )
    
    if not check_collection_exists('my_collection'):
        db.create_collection(collection_name='my_collection', embedding_function=embeddings)

    db.add_documents(documents=chunks, ids=[str(i)+'_'+doc_name for i in range(len(chunks))], persists_directory=vector_db_path)
    db.persist()
    return db

In [20]:
# # from langchain.vectorstores import Chroma

# def put_in_Chroma_cuad(doc_text: str, doc_name, embedding_type="openai", start_chunks=0, db=None, vector_db_path="./chroma_db"):
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
#     doc = [
#             Document(page_content=page, metadata={"page": i, "doc_name": doc_name})
#                 for i, page in enumerate([doc_text])
#             ]
#     chunks = text_splitter.split_documents(doc)

#     #lets add start and end index to every chunk
#     for ch in chunks:
#         ch.metadata['start_index'] = doc_text.find(ch.page_content)
#         ch.metadata['end_index'] = ch.metadata['start_index']+len(ch.page_content)

#     print('chunks: ', len(chunks))
#     # Retrieve embedding function from code env resources
#     if embedding_type == "openai":
#         print("Using OpenAI embeddings")
#         embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
#     else:
#         print("Using Sentence Transformer embeddings")
#         embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)

#     # Index the vector database by embedding then inserting document chunks
#     # if not db:
#     db = Chroma.add_documents(chunks,
#                                     embedding=embeddings,
#                                     ids=[str(i)+'_'+doc_name for i in range(len(chunks))],
#                                     persist_directory=vector_db_path, collection_name='my_collection')
#     # else:
#     #     db.add_documents(chunks, embedding=embeddings, ids=[str(i)+'_'+doc_name for i in range(len(chunks))], 
#     #                         persist_directory=vector_db_path, collection_name='my_collection')
#     return db, start_chunks+len(chunks)

In [21]:
download_file_from_s3('contract-intelligence-data','yulia_data/grant/data/cuad_data/test.json')

File yulia_data/grant/data/cuad_data/test.json downloaded from contract-intelligence-data to test.json


In [22]:
import json
with open('test.json') as f:
    test_data = json.load(f)

In [23]:
# db._client.delete_collection(name='my_collection')
        # if self.db_client:
        #     result = self.db_client.reset()
        #     self.db_client.clear_system_cache() # very important
        #     self.db_client = None
        #     print(f"remove and reset db_client success: {result}")


In [24]:
len(db._collection.get(include=["metadatas","documents"]))

NameError: name 'db' is not defined

In [25]:
# start_ind = 0
# db = None

In [100]:
#put files into chroma
ind = 1
dd = test_data['data'][ind]
text = dd['paragraphs'][0]['context']
file_name = dd['title']
print(f"File name: {file_name}")
# db, start_ind = \
#     put_in_Chroma_cuad(text, doc_name=file_name, embedding_type='transformer', start_chunks=start_ind, db=db, vector_db_path='./chroma_db_cuad')
put_in_Chroma([text], doc_name=file_name, embedding_type='transformer')

File name: CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT
chunks:  5
Using Sentence Transformer embeddings


<langchain_community.vectorstores.chroma.Chroma at 0x7fd9cdf73490>

In [55]:
df = pd.DataFrame(db._collection.get(include=["metadatas","documents"]))
df

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0_CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-...,,{'doc_name': 'CENTRACKINTERNATIONALINC_10_29_1...,1 Exhibit 10.3\nI-on. (LOGO) www.i-on.com 561....,,
1,0_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2...,,{'doc_name': 'DovaPharmaceuticalsInc_20181108_...,Exhibit 10.2\n________________________________...,,
2,0_LohaCompanyltd_20191209_F-1_EX-10.16_1191787...,,{'doc_name': 'LohaCompanyltd_20191209_F-1_EX-1...,Exhibit 10.16 SUPPLY CONTRACT Contract No: Dat...,,
3,"0_PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC...",,"{'doc_name': 'PACIRA PHARMACEUTICALS, INC. - A...",Exhibit 10.13\nConfidential Materials omitted ...,,
4,10_DovaPharmaceuticalsInc_20181108_10-Q_EX-10....,,{'doc_name': 'DovaPharmaceuticalsInc_20181108_...,2.2.1 responsibility for promoting the Product...,,
...,...,...,...,...,...,...
119,"7_PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC...",,"{'doc_name': 'PACIRA PHARMACEUTICALS, INC. - A...",costs of PPI employees related to this assista...,,
120,8_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2...,,{'doc_name': 'DovaPharmaceuticalsInc_20181108_...,"1.58 ""Specialty Pharmacy Net Sales"" shall mean...",,
121,"8_PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC...",,"{'doc_name': 'PACIRA PHARMACEUTICALS, INC. - A...",-17-\n3.8 Customer Orders. PPI shall at its ow...,,
122,9_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2...,,{'doc_name': 'DovaPharmaceuticalsInc_20181108_...,"1.67 ""Valeant Property"" shall have the meaning...",,


In [45]:
df.metadatas.value_counts()

metadatas
{'doc_name': 'DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement', 'page': 0}                63
{'doc_name': 'PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT ', 'page': 0}    51
{'doc_name': 'CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT', 'page': 0}                            5
{'doc_name': 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement', 'page': 0}                           5
Name: count, dtype: int64

In [76]:
# pd.DataFrame(db._collection.get(include=["metadatas"])['metadatas']).sort_values(by=['doc_name', 'start_index'])

In [46]:
#cuad formulations are not the best for openai instructions, I will start with a subset of the data
map_questions = {"""Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract's initial term expire?""":
                    "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise",
                """Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract""":
                    "What is the Document Name? give evidence and be concise",
                """Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract""":
                    "Who are the Parties who signed the contract? be concise",
                """Highlight the parts (if any) of this contract related to "Governing Law" that should be reviewed by a lawyer. Details: Which state/country's law governs the interpretation of the contract?""":
                    "What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise"}

In [50]:
filtered_questions = []
for dd in test_data['data'][:4]:
    questions = {}
    questions['file_name'] = dd['title']
    for qq in dd['paragraphs'][0]['qas']:
        if qq['question'] in map_questions.keys():
            question = map_questions[qq['question']]
            if 'answers' in qq.keys() and len(qq['answers'])>0:
                questions[question] = qq['answers']
    filtered_questions.append(questions)      

In [51]:
data_evidence = pd.DataFrame(filtered_questions)
data_evidence

Unnamed: 0,file_name,What is the Document Name? give evidence and be concise,Who are the Parties who signed the contract? be concise,What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise,What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise
0,LohaCompanyltd_20191209_F-1_EX-10.16_11917878_...,"[{'text': 'SUPPLY CONTRACT', 'answer_start': 14}]","[{'text': 'The seller:', 'answer_start': 143},...","[{'text': 'The Contract is valid for 5 years, ...",[{'text': 'It will be governed by the law of t...
1,CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WE...,"[{'text': 'WEB SITE HOSTING AGREEMENT', 'answe...","[{'text': 'Centrack International', 'answer_st...",[{'text': 'The term of this Agreement for the ...,[{'text': 'This Agreement was entered into in ...
2,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_1...,"[{'text': 'CO-PROMOTION AGREEMENT', 'answer_st...","[{'text': 'Dova', 'answer_start': 857}, {'text...",[{'text': 'This Agreement shall become effecti...,[{'text': 'This Agreement and any and all matt...
3,"PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC L...",[{'text': 'AMENDED AND RESTATED STRATEGIC LICE...,"[{'text': 'PACIRA PHARMACEUTICALS, INC.', 'ans...",[{'text': 'This Agreement shall commence on th...,[{'text': 'This Agreement and the relationship...


In [49]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
data_answers = []
for file in data_evidence.file_name:
    print(file)
    ans = {}
    for question in data_evidence.columns[1:]:
        print(question)
        answer, retriever = \
            qa_retriever_openai(question, vector_db_path="/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db", \
            file_id=file, k=4, embeddings_type="transformer")
        ans[question] = answer
        ans['file_name'] = file
    data_answers.append(ans)

LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement
What is the Document Name? give evidence and be concise
Using Sentence Transformer Embeddings
Who are the Parties who signed the contract? be concise
Using Sentence Transformer Embeddings
What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise
Using Sentence Transformer Embeddings
What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise
Using Sentence Transformer Embeddings
CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT
What is the Document Name? give evidence and be concise
Using Sentence Transformer Embeddings
Who are the Parties who signed the contract? be concise
Using Sentence Transformer Embeddings
What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise
Using Sentence Transformer Embeddings
What is the Gover

In [53]:
data_answers

[{'What is the Document Name? give evidence and be concise': {'query': 'What is the Document Name? give evidence and be concise',
   'k': 4,
   'result': "I don't have enough information to provide a specific answer to your question about the Document Name.",
   'source_documents': []},
  'file_name': 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement',
  'Who are the Parties who signed the contract? be concise': {'query': 'Who are the Parties who signed the contract? be concise',
   'k': 4,
   'result': "I don't have enough information to provide a specific answer.",
   'source_documents': []},
  "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise": {'query': "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise",
   'k': 4,
   'result': "I don't have enough information to provide a specific expiration date for the contract's initial term.",
   

In [74]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Step 1: Initialize the embeddings model (e.g., OpenAI's embeddings)
embedding_function = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)

# Step 2: Load the ChromaDB collection using LangChain
persist_directory = "./chroma_db"
collection_name = "my_collection"
vectorstore = Chroma(
    collection_name=collection_name,
    persist_directory=persist_directory,
    embedding_function=embedding_function
)

# Step 3: Define your query text
query_text = "What are the latest advancements in artificial intelligence?"
query_filter = {"doc_name": "PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT"}
# Step 4: Retrieve similar documents
similar_docs = vectorstore.similarity_search(query_text, k=5, filter=query_filter)  # Set k to specify the number of results

# Step 5: Print the similar documents and their metadata
for i, doc in enumerate(similar_docs):
    print(f"Document {i + 1}:")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)


In [102]:
answer, retriever = \
            qa_retriever_openai('What is the Document Name? give evidence and be concise', vector_db_path="./chroma_db", \
            file_id='CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT', k=4, embeddings_type="transformer")
answer

Using Sentence Transformer Embeddings


{'query': 'What is the Document Name? give evidence and be concise',
 'k': 4,
 'result': 'The document name is "WEB SITE HOSTING AGREEMENT." This is evident from the heading of the document that states "WEB SITE HOSTING AGREEMENT" and the introductory paragraph that mentions "This WEB SITE HOSTING AGREEMENT (\'this Agreement\') is entered into..."',
 'source_documents': [Document(metadata={'doc_name': 'CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT', 'page': 0}, page_content="MISCELLANEOUS\nThis Agreement constitutes the entire understanding and agreement between the parties hereto and supersedes any and all prior or contemporaneous representations, understandings, and agreements between the Customer and i-on with respect to the subject matter hereof, all of which are merged herein. The parties understand that work i-on does in the development and maintenance of Web content and applications for Centrack International is governed by separate agreement(s).\nNothin

In [37]:
data_answers

[{'What is the Document Name? give evidence and be concise': {'query': 'What is the Document Name? give evidence and be concise',
   'k': 4,
   'result': "I don't have enough information to provide a specific answer to your question about the Document Name.",
   'source_documents': []},
  'file_name': 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement',
  'Who are the Parties who signed the contract? be concise': {'query': 'Who are the Parties who signed the contract? be concise',
   'k': 4,
   'result': "I don't have enough information to provide a specific answer.",
   'source_documents': []},
  "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise": {'query': "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise",
   'k': 4,
   'result': "I don't have enough information to provide a specific expiration date for the contract's initial term.",
   

In [32]:
res = pd.DataFrame(data_answers)[['file_name','What is the Document Name? give evidence and be concise',	'Who are the Parties who signed the contract? be concise',	"What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise",	"What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise"]]

In [36]:
res

Unnamed: 0,file_name,What is the Document Name? give evidence and be concise,Who are the Parties who signed the contract? be concise,What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise,What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise
0,LohaCompanyltd_20191209_F-1_EX-10.16_11917878_...,{'query': 'What is the Document Name? give evi...,{'query': 'Who are the Parties who signed the ...,{'query': 'What is the Expiration Date? On wha...,{'query': 'What is the Governing Law? Which st...
1,CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WE...,{'query': 'What is the Document Name? give evi...,{'query': 'Who are the Parties who signed the ...,{'query': 'What is the Expiration Date? On wha...,{'query': 'What is the Governing Law? Which st...
2,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_1...,{'query': 'What is the Document Name? give evi...,{'query': 'Who are the Parties who signed the ...,{'query': 'What is the Expiration Date? On wha...,{'query': 'What is the Governing Law? Which st...
3,"PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC L...",{'query': 'What is the Document Name? give evi...,{'query': 'Who are the Parties who signed the ...,{'query': 'What is the Expiration Date? On wha...,{'query': 'What is the Governing Law? Which st...
4,ThriventVariableInsuranceAccountB_20190701_N-6...,{'query': 'What is the Document Name? give evi...,{'query': 'Who are the Parties who signed the ...,{'query': 'What is the Expiration Date? On wha...,{'query': 'What is the Governing Law? Which st...
5,"HERTZGLOBALHOLDINGS,INC_07_07_2016-EX-10.4-INT...",{'query': 'What is the Document Name? give evi...,{'query': 'Who are the Parties who signed the ...,{'query': 'What is the Expiration Date? On wha...,{'query': 'What is the Governing Law? Which st...


In [33]:
copy_res = pd.DataFrame([])

In [34]:
copy_res['file_name'] = res['file_name']
copy_res['q1_answer'] = res['What is the Document Name? give evidence and be concise'].apply(lambda x: x['result'])
copy_res['q2_answer'] = res['Who are the Parties who signed the contract? be concise'].apply(lambda x: x['result'])
copy_res["q3_answer"] = res["What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise"].apply(lambda x: x['result'])
copy_res["q4_answer"] = res["What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise"].apply(lambda x: x['result'])


copy_res['q1_evidence'] = res['What is the Document Name? give evidence and be concise'].apply(lambda x: [i.metadata for i in x['source_documents']])
copy_res['q2_evidence'] = res['Who are the Parties who signed the contract? be concise'].apply(lambda x: [i.metadata for i in x['source_documents']])
copy_res["q3_evidence"] = res["What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise"].apply(lambda x: [i.metadata for i in x['source_documents']])
copy_res["q4_evidence"] = res["What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise"].apply(lambda x: [i.metadata for i in x['source_documents']])

In [35]:
copy_res.merge(data_evidence, on='file_name')

Unnamed: 0,file_name,q1_answer,q2_answer,q3_answer,q4_answer,q1_evidence,q2_evidence,q3_evidence,q4_evidence,What is the Document Name? give evidence and be concise,Who are the Parties who signed the contract? be concise,What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise,What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise
0,LohaCompanyltd_20191209_F-1_EX-10.16_11917878_...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,[],[],[],[],"[{'text': 'SUPPLY CONTRACT', 'answer_start': 14}]","[{'text': 'The seller:', 'answer_start': 143},...","[{'text': 'The Contract is valid for 5 years, ...",[{'text': 'It will be governed by the law of t...
1,CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WE...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,[],[],[],[],"[{'text': 'WEB SITE HOSTING AGREEMENT', 'answe...","[{'text': 'Centrack International', 'answer_st...",[{'text': 'The term of this Agreement for the ...,[{'text': 'This Agreement was entered into in ...
2,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_1...,"I'm sorry, but I don't have enough information...",I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,[],[],[],[],"[{'text': 'CO-PROMOTION AGREEMENT', 'answer_st...","[{'text': 'Dova', 'answer_start': 857}, {'text...",[{'text': 'This Agreement shall become effecti...,[{'text': 'This Agreement and any and all matt...
3,"PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC L...","I'm sorry, but you haven't provided any specif...",I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,[],[],[],[],[{'text': 'AMENDED AND RESTATED STRATEGIC LICE...,"[{'text': 'PACIRA PHARMACEUTICALS, INC.', 'ans...",[{'text': 'This Agreement shall commence on th...,[{'text': 'This Agreement and the relationship...
4,ThriventVariableInsuranceAccountB_20190701_N-6...,"I'm sorry, but I don't have enough information...",I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,[],[],[],[],"[{'text': 'ENDORSEMENT', 'answer_start': 30}]","[{'text': 'Thrivent Financial for Lutherans', ...",,
5,"HERTZGLOBALHOLDINGS,INC_07_07_2016-EX-10.4-INT...",I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to provide a s...,I don't have enough information to determine t...,[],[],[],[],"[{'text': 'INTELLECTUAL PROPERTY AGREEMENT', '...","[{'text': 'HERC RENTALS INC.', 'answer_start':...",,[{'text': 'This Agreement and all disputes or ...


In [38]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [74]:
# we need to make sure that the location of evidence from cuad and the selected top chunks overlap

In [76]:
# get squad metric for the actual words or semantic similarity score

In [78]:
from evaluate import load
metric = load("squad")

In [79]:
metric.compute(predictions=[{"id": '1', "prediction_text": "The contract's initial term expires after 5 years, but the specific date is not provided in the context."},
                            {"id": '2', "prediction_text": "The contract's initial term expires after 5 years, but the specific date is not provided in the context.1 ."}], 
                references=[{"answers": {"answer_start": [0], 'text': ["The Contract is valid for 5 years, beginning from and ended on ."]}, "id": '1'},
                            {"answers": {"answer_start": [0], "text": ["The Contract is valid for 5 years, beginning from and ended on 1 ."]}, "id": '2'}])

{'exact_match': 0.0, 'f1': 22.64957264957265}

In [40]:
def dcg(vect, k=4):
    res = 0
    for i in range(1, k+1):
        res+=vect[i-1]/np.log2(i+1)
    return res

In [83]:
ans_strt = context[0]['answer_start']
res= [1 if ans_strt >= i.metadata['start_index'] and ans_strt <= i.metadata['end_index'] else 0 for i in answer['source_documents']]
res

[1, 0, 0, 0]

In [44]:
dcg([0,0,0,1])

0.43067655807339306

In [78]:
pd.DataFrame(db._collection.get(include=["metadatas","documents"])['metadatas']).sort_values(by=['doc_name','start_index'])

Unnamed: 0,doc_name,end_index,page,start_index
0,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,3487,0,0
1,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,6407,0,2994
12,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,9456,0,6156
23,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,12820,0,9455
34,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,16149,0,12734
45,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,19130,0,15801
56,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,22516,0,19059
60,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,25432,0,22088
61,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,27892,0,25006
62,DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement,30872,0,27517
