The purpose of this notebook is to explore the use of open source models such as "facebook/opt-125m" and "neuralmagic/Llama-2-7b-chat-quantized.w8a8", these models are relatively small in size and can be used from my g4dn.2xlarge instance.
In addition I compare the output from the open source model to openai.
The opensource model is loaded with vllm serving

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"]='True'

In [3]:
!pip uninstall datasets -y 

Found existing installation: datasets 2.21.0
Uninstalling datasets-2.21.0:
  Successfully uninstalled datasets-2.21.0


In [4]:
!pip install transformers torch -q
!pip install langchain -q
!pip install -U langchain-community -q
!pip install python-dotenv openai -q
!pip3 install pysqlite3-binary -q
!pip install -U sentence-transformers -q
!pip install "datasets==2.21.0" -q


In [5]:
import boto3
import os
def download_file_from_s3(bucket_name, s3_file_key):
    # download files to local environment
    # Create an S3 client
    s3 = boto3.client('s3')
    local_file_path = s3_file_key.split('/')[-1]
    # Download the file from S3
    s3.download_file(bucket_name, s3_file_key, local_file_path)
    print(f"File {s3_file_key} downloaded from {bucket_name} to {local_file_path}")

def delete_file(file_path):
    os.remove(file_path)

In [6]:
import re
import string

_RE_COMBINE_WHITESPACE = re.compile(r"[ ]+", re.ASCII)
_RE_SHORT_LINES = re.compile("^.{1,3}\n", re.MULTILINE)
_RE_MULTILINE_BREAKS = re.compile("\n+", re.MULTILINE)
_RE_PAGE_CHAR = "\x0c"
_RE_LATIN_WHITESPACE_CHAR = re.compile("\xa0", re.ASCII)


# @markdown  - **clean_text** - clean text spaces,non-printable and line breaks
def clean_text(text):
    """Clean text from several white-space and line-breaks"""
    # remove several line breaks
    text = _RE_LATIN_WHITESPACE_CHAR.sub(" ", text)
    # remove several white spaces
    text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
    # remove very short lines
    text = _RE_SHORT_LINES.sub("\n", text)
    # remove several line breaks
    text = _RE_MULTILINE_BREAKS.sub("\n", text)
    # remove unknown characters or non printable
    text = "".join([x for x in text if x in string.printable])

    return text.strip()

In [7]:
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List

class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        return [self.model.encode(d).tolist() for d in documents]

    def embed_query(self, query: str) -> List[float]:
        return self.model.encode([query])[0].tolist()

  from tqdm.autonotebook import tqdm, trange


## Load the dataset

In [8]:
#for the test data I will use some (parsed) files from here s3://contract-intelligence-data/client-data/AAA/NY State Insurance/06-FRM-AR1/ 
# these are files of good quality

download_file_from_s3("contract-intelligence-data", "client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2624_2024_163320/FRM-AR117-21-1230-2624_2024_163320.json")
download_file_from_s3("contract-intelligence-data", "client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2638_2024_162334/FRM-AR117-21-1230-2638_2024_162334.json")
download_file_from_s3("contract-intelligence-data", "client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-22-1252-6330_2024_16400/FRM-AR117-22-1252-6330_2024_16400.json")

download_file_from_s3("contract-intelligence-data","client-data/AAA/NY State Insurance/04-RPT-INIT/17-22-1250-8464/17-22-1250-8464.json")
download_file_from_s3("contract-intelligence-data","client-data/dragados/ol-elevated-guideway-and-stations-dmca-redacted-version.json")

File client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2624_2024_163320/FRM-AR117-21-1230-2624_2024_163320.json downloaded from contract-intelligence-data to FRM-AR117-21-1230-2624_2024_163320.json
File client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-21-1230-2638_2024_162334/FRM-AR117-21-1230-2638_2024_162334.json downloaded from contract-intelligence-data to FRM-AR117-21-1230-2638_2024_162334.json
File client-data/AAA/NY State Insurance/06-FRM-AR1/FRM-AR117-22-1252-6330_2024_16400/FRM-AR117-22-1252-6330_2024_16400.json downloaded from contract-intelligence-data to FRM-AR117-22-1252-6330_2024_16400.json
File client-data/AAA/NY State Insurance/04-RPT-INIT/17-22-1250-8464/17-22-1250-8464.json downloaded from contract-intelligence-data to 17-22-1250-8464.json
File client-data/dragados/ol-elevated-guideway-and-stations-dmca-redacted-version.json downloaded from contract-intelligence-data to ol-elevated-guideway-and-stations-dmca-redacted-version.json


In [9]:
import json
import glob
from tqdm import tqdm

def read_files(docs_dir: str):
    files = glob.glob(os.path.join(docs_dir,"*.json"), recursive=True)
    print(f"Total number of docs: {len(files)}")
    return files

def compose_dataset(docs_dir: str):
    files = read_files(docs_dir)
    print(files)
    # Read & Load the Dataset
    dataset = []
    for file in tqdm(files):
        # data in json format after ocr
        with open(file) as f:
            pdoc = json.load(f)
        dataset.append(pdoc)

    return dataset

In [10]:
dataset = compose_dataset(".")  

Total number of docs: 6
['./ol-elevated-guideway-and-stations-dmca-redacted-version.json', './FRM-AR117-21-1230-2624_2024_163320.json', './17-22-1250-8464.json', './test.json', './FRM-AR117-21-1230-2638_2024_162334.json', './FRM-AR117-22-1252-6330_2024_16400.json']


100%|██████████| 6/6 [00:00<00:00, 110.12it/s]


In [11]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import CohereEmbeddings, OpenAIEmbeddings
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI

import pandas as pd

In [12]:
# The rag part, based on the one in LINT api

DEFAULT_CHUNK_SIZE = 3500  #1400 (had to reduce to fit into the facebook/opt-125m model)
DEFAULT_CHUNK_OVERLAP = 500
EMBEDDING_MODEL = "text-embedding-ada-002"#I will still use openai for embeddings
# next step can also try and replace the embeddings for opensource ones
LLM_MODEL_OPENAI = "gpt-3.5-turbo"
vector_db_path = './chroma_db'

SENTENCE_TRANSFORMER_MODEL = "multi-qa-mpnet-base-cos-v1"

In [13]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
import openai
path_to_keys = 'keys.env'
temp = dotenv_values(path_to_keys)
openai_api_key = temp["OPENAI_API_KEY"]

### lets put the data to chroma db

In [14]:
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [15]:
# !pip install hnswlib==0.7.0 -q
# !pip install chroma-hnswlib==0.7.3 -q
# !pip uninstall hnswlib chroma-hnswlib -y

In [16]:
%pip install chromadb==0.5 tiktoken -q

Note: you may need to restart the kernel to use updated packages.


In [17]:
from langchain.vectorstores import Chroma

def put_in_Chroma(doc_pages, doc_name, embedding_type="openai"):
    whole_text = " ".join(doc_pages)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
    doc = [
                Document(page_content=clean_text(page), metadata={"page": i, "doc_name": doc_name})
                for i, page in enumerate(doc_pages)
            ]
    chunks = text_splitter.split_documents(doc)

    print('chunks: ', len(chunks))
    # Retrieve embedding function from code env resources
    
    if embedding_type == "openai":
        print("Using OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    else:
        print("Using Sentence Transformer embeddings")
        embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)

    # Index the vector database by embedding then inserting document chunks
    db = Chroma.from_documents(chunks,
                            embedding=embeddings,
                            ids=[str(i) for i in range(len(chunks))],
                            persist_directory=vector_db_path)

    # Save vector database as persistent files in the output folder
    return db

In [86]:
# file_name = 'FRM-AR117-21-1230-2624_2024_163320'
# file_name = 'FRM-AR117-21-1230-2638_2024_162334'
file_name = 'FRM-AR117-22-1252-6330_2024_16400'
file_name = '17-22-1250-8464'
file_name = 'ol-elevated-guideway-and-stations-dmca-redacted-version'

In [17]:
%%time
for i in dataset:
    if i['name'] == file_name:
        doc_pages = i['text']
        break
print('pages: ', len(doc_pages))
db = put_in_Chroma(doc_pages, doc_name=file_name, embedding_type='transformer')

pages:  1344
chunks:  1448
Using Sentence Transformer embeddings




CPU times: user 1min 21s, sys: 4.49 s, total: 1min 26s
Wall time: 58.7 s


In [37]:
def get_gpt_llm():
    chat_params = {
        "model": "gpt-3.5-turbo", # Bigger context window
        "openai_api_key": openai_api_key,
        "temperature": 0.000001, 
    }
    llm = ChatOpenAI(**chat_params)
    return llm

def qa_retriever_openai(query, vector_db_path, file_id, k=4, embeddings_type="openai"):
    if embeddings_type == "openai":
        print("Using OpenAI Embeddings")
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    else:
        print("Using Sentence Transformer Embeddings")
        embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)
    vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)

    retriever = vectordb.as_retriever(search_kwargs={"k": k, "filter": {"doc_name": file_id}})

    qa = RetrievalQA.from_chain_type(llm=get_gpt_llm(), chain_type="stuff", 
                                    retriever=retriever, return_source_documents=True)
    res = qa({"query": query, "k": k})
    return res, retriever

In [22]:
question = "Who are the parties?"
question = "When is this agreement entered into?"
question = "When is this agreement entered into can you give me a quote for evidence?"

# question = "What type of form is that?"

answer, retriever = \
    qa_retriever_openai(question, vector_db_path="/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db", \
    file_id=file_name, k=4, embeddings_type="transformer")

Using Sentence Transformer Embeddings




In [23]:
answer

{'query': 'When is this agreement entered into can you give me a quote for evidence?',
 'k': 4,
 'result': 'This agreement is effective as from the date first stated above, which is the DMCA Effective Date. Unfortunately, there is no specific date provided in the context for the DMCA Effective Date.',
 'source_documents': [Document(metadata={'doc_name': 'ol-elevated-guideway-and-stations-dmca-redacted-version', 'page': 61}, page_content='Representative of testing entities, nor any other thing in this Agreement shall have the effect of\nlimiting or shortening or otherwise affecting in any way whatsoever the duration, effectiveness or\ncontent of any guarantee or warranty set forth in any other document or material forming part of\nthis Agreement.\nWarranty Security\nNot less than 90 days prior to each then-current TPA Scheduled Substantial Completion Date or\nrelevant DMCA Construction Works Scheduled Substantial Completion Date, as applicable,\nConfidential Page 62\nKings Printer for O

In [24]:
print('Openai answer: ', answer['result'])

Openai answer:  This agreement is effective as from the date first stated above, which is the DMCA Effective Date. Unfortunately, there is no specific date provided in the context for the DMCA Effective Date.


ran in terminal: `vllm serve neuralmagic/Llama-2-7b-chat-quantized.w8a8 --chat-template templates/template_chatml.jinja`

In [None]:
inference_server_url = "http://localhost:8000/v1"

# MODEL = "facebook/opt-125m"
# MODEL = "neuralmagic/Llama-2-7b-chat-quantized.w8a8"
MODEL = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    
llm = ChatOpenAI(
    model=MODEL,
    openai_api_key="EMPTY",
    openai_api_base=inference_server_url,
    max_tokens=100,
    temperature=0,
)

In [None]:
def qa_retriever_llama(query, vector_db_path, file_id, k=4):
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)

    retriever = vectordb.as_retriever(search_kwargs={"k": k, "filter": {"doc_name": file_id}})

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", 
                                    retriever=retriever, return_source_documents=True)
    res = qa({"query": query})
    return res, retriever

In [None]:
%%time
question = "Who are the parties?"
question = "When is this agreement entered into?"

answer_llama, retriever = qa_retriever_llama(question, vector_db_path="/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db", file_id=file_name, k=4)

In [None]:
print(answer_llama)
print(answer_llama['result'])

In [None]:
text = '\n--\n'.join([i.page_content for i in answer['source_documents']])
print(text)

In [None]:
from langchain_core.prompts.prompt import PromptTemplate

In [None]:
llm = ChatOpenAI(
    model=MODEL,
    openai_api_key="EMPTY",
    openai_api_base=inference_server_url,
    max_tokens=200,
    temperature=0,
)

In [None]:
prompt = """You are an AI assistant, use the following text to provide answer if you don't know, say you don't know
        Context: {context}
        Question: {question}
        Be concise and short in your response.
"""

# context = text
question = "Who are the parties?"
# question = "Where did the accident occur?"
# question = "What is the date of the accident?"
# question = "Was the denial of claim based on late notice to the carrier?"
# question = "Who is the insurer?"
# question = "What type of form is that?"

# file_name = 'FRM-AR117-22-1252-6330_2024_16400'

vector_db_path = "/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db"
my_prompt = PromptTemplate(template=prompt, input_variables=["context", "question"])
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 4, "filter": {"doc_name": file_name}})

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, 
                                chain_type="stuff", 
                                retriever=retriever, 
                                return_source_documents=True,
                                chain_type_kwargs={"prompt": my_prompt})

In [None]:
%%time
llama_answer = qa.invoke(question)

In [None]:
print(llama_answer)
print(llama_answer['result'])

In [None]:
my_prompt = PromptTemplate(template=prompt, input_variables=["context", "question"])
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 4, "filter": {"doc_name": file_name}})
qa = RetrievalQA.from_chain_type(llm=get_gpt_llm(), 
                                chain_type="stuff", 
                                retriever=retriever, 
                                return_source_documents=True,
                                chain_type_kwargs={"prompt": my_prompt})

In [None]:
%%time
openai_answer = qa.invoke(question)

In [None]:
print(openai_answer)
print(openai_answer['result'])

In [None]:
db._collection.get(include=["metadatas","documents"])

In [18]:
import torch
torch.cuda.empty_cache()

--------

In [20]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
number_gpus = 1
max_model_len = 8192

sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

tokenizer = AutoTokenizer.from_pretrained(model_id)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len, dtype=torch.float16)

INFO 10-07 08:06:25 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantize

OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 40.81 MiB is free. Including non-PyTorch memory, this process has 14.71 GiB memory in use. Of the allocated memory 14.09 GiB is allocated by PyTorch, and 16.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [None]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "write a poem about waterlilies"},
]

prompts = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
outputs = llm.generate(prompts, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)

-------

Metrics will be calculated with CUAD data

In [18]:
from langchain.vectorstores import Chroma

def put_in_Chroma(doc_pages, doc_name, embedding_type="openai"):
    whole_text = " ".join(doc_pages)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
    doc = [
                Document(page_content=page, metadata={"page": i, "doc_name": doc_name})
                for i, page in enumerate(doc_pages)
            ]
    chunks = text_splitter.split_documents(doc)

    #lets add start and end index to every chunk
    for ch in chunks:
        ch.metadata['start_index'] = whole_text.find(ch.page_content)
        ch.metadata['end_index'] = ch.metadata['start_index']+len(ch.page_content)

    print('chunks: ', len(chunks))
    # Retrieve embedding function from code env resources
    
    if embedding_type == "openai":
        print("Using OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_api_key)
    else:
        print("Using Sentence Transformer embeddings")
        embeddings = SentenceTransformerEmbeddings(SENTENCE_TRANSFORMER_MODEL)

    # Index the vector database by embedding then inserting document chunks
    db = Chroma.from_documents(chunks,
                            embedding=embeddings,
                            ids=[str(i) for i in range(len(chunks))],
                            persist_directory=vector_db_path)

    # Save vector database as persistent files in the output folder
    return db

In [19]:
download_file_from_s3('contract-intelligence-data','yulia_data/grant/data/cuad_data/test.json')

File yulia_data/grant/data/cuad_data/test.json downloaded from contract-intelligence-data to test.json


In [20]:
with open('test.json') as f:
    test_data = json.load(f)

In [53]:
test_data['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [54]:
test_data['data'][0]['paragraphs'][0].keys()

dict_keys(['qas', 'context'])

In [21]:
#put files into chroma
for ii, dd in tqdm(enumerate(test_data['data'][:5])):
    print(f"Document {ii}")
    text = dd['paragraphs'][0]['context']
    file_name = dd['title']
    print(f"File name: {file_name}")
    db = put_in_Chroma([text], doc_name=file_name, embedding_type='transformer')

0it [00:00, ?it/s]

Document 0
File name: LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement
chunks:  5
Using Sentence Transformer embeddings


1it [00:06,  6.64s/it]

Document 1
File name: CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT
chunks:  5
Using Sentence Transformer embeddings


2it [00:08,  3.97s/it]

Document 2
File name: DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement
chunks:  63
Using Sentence Transformer embeddings


3it [00:12,  3.91s/it]

Document 3
File name: PACIRA PHARMACEUTICALS, INC. - A_R STRATEGIC LICENSING, DISTRIBUTION AND MARKETING AGREEMENT 
chunks:  52
Using Sentence Transformer embeddings


4it [00:16,  3.72s/it]

Document 4
File name: ThriventVariableInsuranceAccountB_20190701_N-6_EX-99.D(IV)_11720968_EX-99.D(IV)_Endorsement Agreement
chunks:  2
Using Sentence Transformer embeddings


5it [00:17,  3.47s/it]


In [29]:
#cuad formulations are not the best for openai instructions, I will start with a subset of the data
map_questions = {"""Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract's initial term expire?""":
                    "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise",
                """Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract""":
                    "What is the Document Name? The name of the contract? give evidence and be concise",
                """Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract""":
                    "Who are the Parties who signed the contract? give evidence and be concise",
                """Highlight the parts (if any) of this contract related to "Governing Law" that should be reviewed by a lawyer. Details: Which state/country's law governs the interpretation of the contract?""":
                    "What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise"}

In [32]:
filtered_questions = []
for dd in test_data['data'][:5]:
    questions = {}
    questions['file_name'] = dd['title']
    for qq in dd['paragraphs'][0]['qas']:
        if qq['question'] in map_questions.keys():
            question = map_questions[qq['question']]
            if 'answers' in qq.keys() and len(qq['answers'])>0:
                questions[question] = qq['answers']
    filtered_questions.append(questions)      

In [34]:
filtered_questions[0]

{'file_name': 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement',
 'What is the Document Name? The name of the contract? give evidence and be concise': [{'text': 'SUPPLY CONTRACT',
   'answer_start': 14}],
 'Who are the Parties who signed the contract? give evidence and be concise': [{'text': 'The seller:',
   'answer_start': 143},
  {'text': 'The buyer/End-User: Shenzhen LOHAS Supply Chain Management Co., Ltd.',
   'answer_start': 49}],
 "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise": [{'text': 'The Contract is valid for 5 years, beginning from and ended on .',
   'answer_start': 10985}],
 "What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise": [{'text': "It will be governed by the law of the People's Republic of China ,otherwise it is governed by United Nations Convention on Contract for the International Sale of Goods.",
 

In [38]:
for k, v in filtered_questions[0].items():
    if k == 'file_name':
        file_name = v
    else:
        question = k
        answer, retriever = \
            qa_retriever_openai(question, vector_db_path="/home/ubuntu/yulia/vllm-exploratory/llm/xplore/chroma_db", \
            file_id=file_name, k=4, embeddings_type="transformer")
        print(answer)
        print(v)

Using Sentence Transformer Embeddings


  vectordb = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
  llm = ChatOpenAI(**chat_params)
  res = qa({"query": query, "k": k})


{'query': 'What is the Document Name? The name of the contract? give evidence and be concise', 'k': 4, 'result': "I don't have enough information to provide a specific answer to your question.", 'source_documents': []}
[{'text': 'SUPPLY CONTRACT', 'answer_start': 14}]
Using Sentence Transformer Embeddings




{'query': 'Who are the Parties who signed the contract? give evidence and be concise', 'k': 4, 'result': "I don't have enough information to provide a specific answer to your question.", 'source_documents': []}
[{'text': 'The seller:', 'answer_start': 143}, {'text': 'The buyer/End-User: Shenzhen LOHAS Supply Chain Management Co., Ltd.', 'answer_start': 49}]
Using Sentence Transformer Embeddings




{'query': "What is the Expiration Date? On what date will the contract's initial term expire? give evidence and be concise", 'k': 4, 'result': "I don't have enough information to provide a specific expiration date for the contract's initial term.", 'source_documents': []}
[{'text': 'The Contract is valid for 5 years, beginning from and ended on .', 'answer_start': 10985}]
Using Sentence Transformer Embeddings




{'query': "What is the Governing Law? Which state/country's law governs the interpretation of the contract? give evidence and be concise", 'k': 4, 'result': 'I don\'t have enough information to provide a specific answer to your question. The governing law of a contract is typically specified within the contract itself. It is usually found in a section called "Governing Law" or "Jurisdiction." You would need to refer to the specific contract in question to determine which state or country\'s law governs its interpretation.', 'source_documents': []}
[{'text': "It will be governed by the law of the People's Republic of China ,otherwise it is governed by United Nations Convention on Contract for the International Sale of Goods.", 'answer_start': 10691}]


In [71]:
answer['result']

"The contract's initial term expires after 5 years from the starting date, but the specific starting date is not provided in the context."

In [72]:
answer['source_documents'][0]

Document(metadata={'doc_name': 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement', 'end_index': 11475, 'page': 0, 'start_index': 10625}, page_content="5\n\nSource: LOHA CO. LTD., F-1, 12/9/2019\n\n\n\n\n\n21. Law application It will be governed by the law of the People's Republic of China ,otherwise it is governed by United Nations Convention on Contract for the International Sale of Goods. 22. <<Incoterms 2000>> The terms in the contract are based on (INCOTERMS 2000) of the International Chamber of Commerce. 23. The Contract is valid for 5 years, beginning from and ended on . This Contract is made out in three originals in both Chinese and English, each language being legally of the equal effect. Conflicts between these two languages arising there from, if any, shall be subject to Chinese version. One copy for the Sellers, two copies for the Buyers. The Contract becomes effective after signed by both parties. THE BUYER: THE SELLER: SIGNATURE: SIGNATURE: 6\n\nSou

In [73]:
answer['source_documents'][2]

Document(metadata={'doc_name': 'LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement', 'end_index': 7951, 'page': 0, 'start_index': 5155}, page_content="3\n\nSource: LOHA CO. LTD., F-1, 12/9/2019\n\n\n\n\n\n12.2 (1) Invoice in 3 originals indicating contract number and L/C number. (2) Final acceptance certificate signed by the Buyer and the Seller. 13. SHIPMENT: CIP The seller shall contract on usual terms at his own expenses for the carriage of the goods to the agreed point at the named place of destination and bear all risks and expenses until the goods have been delivered to the port of destination. The Sellers shall ship the goods within the shipment time from the port of shipment to the port of destination. Transshipment is allowed. Partial Shipment is allowed. In case the goods are to be dispatched by parcel post/sea-freight, the Sellers shall, 3 days before the time of delivery, inform the Buyers by cable/letter of the estimated date of delivery, Contract No.,

In [74]:
# we need to make sure that the location of evidence from cuad and the selected top chunks overlap

In [75]:
# 1/0 function (whether evidence is in the chunk based on indices) and then a MAP@k function

# for the actual xetraction calculate perpelexity of the evidence in the context

In [76]:
# get squad metric for the actual words or semantic similarity score

In [77]:
%pip install evaluate -q

Note: you may need to restart the kernel to use updated packages.


In [78]:
from evaluate import load
metric = load("squad")

In [79]:
metric.compute(predictions=[{"id": '1', "prediction_text": "The contract's initial term expires after 5 years, but the specific date is not provided in the context."},
                            {"id": '2', "prediction_text": "The contract's initial term expires after 5 years, but the specific date is not provided in the context.1 ."}], 
                references=[{"answers": {"answer_start": [0], 'text': ["The Contract is valid for 5 years, beginning from and ended on ."]}, "id": '1'},
                            {"answers": {"answer_start": [0], "text": ["The Contract is valid for 5 years, beginning from and ended on 1 ."]}, "id": '2'}])

{'exact_match': 0.0, 'f1': 22.64957264957265}

In [40]:
def dcg(vect, k=4):
    res = 0
    for i in range(1, k+1):
        res+=vect[i-1]/np.log2(i+1)
    return res

In [83]:
ans_strt = context[0]['answer_start']
res= [1 if ans_strt >= i.metadata['start_index'] and ans_strt <= i.metadata['end_index'] else 0 for i in answer['source_documents']]
res

[1, 0, 0, 0]

In [44]:
dcg([0,0,0,1])

0.43067655807339306

In [26]:
dcg_score([[0,0,1,0]], [[1,0,0,0]], k=4)

0.5205354372149502

In [20]:
import numpy as np
1/np.log(4)

0.7213475204444817

In [21]:
1/np.log(3)

0.9102392266268375