In [1]:
%reload_ext autoreload
%autoreload 2

# Installing dependencies

In [2]:
!pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap
!pip install datasets
!pip install pypdf
!pip install openai
!pip install chromadb
!pip install langchain
!pip install tiktoken
!pip install -q -U datasets
!pip install -q -U tqdm
!pip install ragas
!pip install dataset
!pip install tqdm ragas tensorflow tensorflow-datasets pandas numpy scikit-learn



In [3]:
# Import libraries
import pandas as pd
import os
import datasets
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.document_loaders import PyPDFLoader

# Loading Document

In [4]:
import os

file_path = "/content/Raptor Contract.docx.pdf"

if not os.path.isfile(file_path):
    print("File does not exist.")

In [5]:
if not file_path.endswith(".pdf"):
    print("File is not a PDF file.")

In [6]:
# Load the dataset
loader = PyPDFLoader("/content/Raptor Contract.docx.pdf")
documents = loader.load()

# Getting api key

In [7]:

import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

Please provide your OpenAI Key: ··········


# storing a vector data

In [8]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=10,separators=["\n\n","\n",".", " ",""])

docs = text_splitter.split_documents(documents)

vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())

  warn_deprecated(


In [9]:
len(docs)

491

In [10]:
base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 50})

# prompt

In [11]:
from langchain.prompts import ChatPromptTemplate

template = """You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context,':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Defining a Retrieval-Augmented Question-Answering Chain with Langchain

In [12]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

  warn_deprecated(


# Testing the RAG

In [19]:
questions = [
    "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?",
    "How much is the escrow amount?",
    "Is escrow amount grete then the Retention Amount ?",
    "What is the purpose of the escrow?",
    "May the Escrow Amount serve as a recourse for the Buyer in case of breach of representations by the Company?",
    "Are there any conditions to the closing?",
    "Are Change of Control Payments considered a Seller Transaction Expense?",
    "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is",
    "Does the Buyer need to pay the Employees Closing Bonus Amount directly to the Companyâs employees?",
    "Does any of the Sellers provide a representation with respect to any Tax matters related to the Company?",
    "Is any of the Sellers bound by a non-competition covenant after the Closing?",
    "Whose consent is required for the assignment of the Agreement by the Buyer?",
    "Does the Buyer needs the Sellersâ consent in the event of an assignment of the Agreement to a third party who is not a Buyerâs Affiliates?"
]


In [20]:
answer=[]
for question in questions:
  result = retrieval_augmented_qa_chain.invoke({"question" : question})
  answer.append(result['response'].content)




In [21]:
answer

['Answer: The Sellers are responsible for a breach of representations and warranties under the following circumstances:\n1. Each Seller is severally responsible for any breach or violation of the provisions of the agreement by any of its Affiliates or its Representatives.\n2. The Sellers are responsible for any breach or violation of representations and warranties made to the Buyer.\n3. The Sellers are responsible for any breach or violation of representations, warranties, or covenants contained in the agreement, regardless of the relative levels of specificity.\n4. The Sellers are responsible for any breach of legal compliance, illegal payments, or permits related to the acquired companies.\n5. The Sellers are responsible for any breach of representations and warranties unless specifically exempted or limited in the agreement.',
 'The escrow amount is $1,000,000.',
 'Based on the provided context, the Retention Amount is defined as an amount equal to $5,000,000. The Escrow Amount is d

# Ground Truth Dataset Creation Using GPT-4

In [22]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [23]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [24]:
question_generation_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [163]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context,':

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=docs[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [164]:
for k, v in output_dict.items():
  print(k)
  print(v)

question
What is the purpose of the document?
context
page_content='[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nAGREEMENT\nBY\nAND\nAMONG\n[BUYER],\n[TARGET\nCOMP ANY],\nTHE\nSELLERS\nLISTED\nON\nSCHEDULE\nI\nHERET O\nAND\nTHE\nSELLERS’\nREPRESENT ATIVE\nNAMED\nHEREIN\nDated\nas\nof\n[●]\n[This\ndocument\nis\nintended\nsolely\nto\nfacilitate\ndiscussions\namong\nthe\nparties\nidentified\nherein. \nNeither\nthis\ndocument\nnor\nsuch\ndiscussions\nare\nintended\nto\ncreate,\nnor\nwill\neither\nor\nboth\nbe \ndeemed\nto\ncreate,\na\nlegally\nbinding\nor\nenforceable\noffer\nor\nagreement\nof\nany\ntype\nor\nnature, \nunless\nand\nuntil\na' metadata={'source': '/content/Raptor Contract.docx.pdf', 'page': 0}


In [165]:
from tqdm import tqdm

qac_triples = []

for text in tqdm(docs[:10]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 10/10 [00:13<00:00,  1.35s/it]


In [166]:
qac_triples[5]

{'question': 'What is the context about?',
 'context': Document(page_content='Section\n3.19\nLabor\nMatters\n37\nSection\n3.20\nLitigation;\nGovernment\nOrders\n37\nSection\n3.21\nInsurance\n38\nSection\n3.22\nNo\nBrokers\n38\nSection\n3.23\nFull\nDisclosure\n38\nARTICLE\nIV\nINDIVIDUAL\nREPRESENT ATIONS\nAND\nWARRANTIES\nOF\nTHE \nSELLERS.\n38\nSection\n4.01\nOrganization\n39\nSection\n4.02\nPower\nand\nAuthorization\n39\nSection\n4.03\nAuthorization\nof\nGovernmental\nAuthorities\n39\nSection\n4.04\nNoncontravention\n39\nSection\n4.05\nTitle\n39\nSection\n4.06\nNo\nBrokers\n40\nARTICLE\nV\nREPRESENT ATIONS\nAND\nWARRANTIES\nOF\nTHE\nBUYER.', metadata={'source': '/content/Raptor Contract.docx.pdf', 'page': 2})}

In [167]:
answer_generation_llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context,':

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [168]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

100%|██████████| 10/10 [00:42<00:00,  4.25s/it]


In [169]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})


eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [170]:
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 10
})

In [171]:
from tqdm import tqdm
import pandas as pd
from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
  rag_dataset = []
  for row in tqdm(eval_dataset):
    answer = rag_pipeline.invoke({"question" : row["question"]})
    rag_dataset.append(
        {"question" : row["question"],
         "answer" : answer["response"].content,
         "contexts" : [context.page_content for context in answer["context"]],
         "ground_truths" : [row["ground_truth"]]
         }
    )
  rag_df = pd.DataFrame(rag_dataset)
  rag_eval_dataset = Dataset.from_pandas(rag_df)
  return rag_eval_dataset

basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)

100%|██████████| 10/10 [00:18<00:00,  1.84s/it]


In [172]:
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 10
})

In [173]:
basic_qa_ragas_dataset[1]

{'question': 'What is the purpose of the document mentioned in the context?',
 'answer': 'Answer: The purpose of the document mentioned in the context is to outline an Agreement between parties, including representations, warranties, and covenants, in order to induce the Buyer to enter into and perform the Agreement and to consummate the Contemplated Transactions.',
 'contexts': ['of\nthe\nabove-named \ncourts,\nthat\nits\nproperty\nis\nexempt\nor\nimmune\nfrom\nattachment\nor\nexecution,\nthat\nany\nsuch\nAction \nbrought\nin\none\nof\nthe\nabove-named\ncourts\nshould\nbe\ndismissed\non\ngrounds\nof\nforum\nnon \nconveniens\n,\nshould\nbe\ntransferred\nor\nremoved\nto\nany\ncourt\nother\nthan\none\nof\nthe\nabove-named \ncourts,\nor\nshould\nbe\nstayed\nby\nreason\nof\nthe\npendency\nof\nsome\nother\nAction\nin\nany\nother\ncourt \nother\nthan\none\nof\nthe\nabove-named\ncourts\nor\nthat\nthis\nAgreement,\nany\nAncillary\nAgreement\nor \nthe\nsubject',
  '[Remainder\nof\npage\nintenti

In [174]:
# extract the question from eval_dataset

questionss = eval_dataset["question"]
questionss

['What is the purpose of the document?',
 'What is the purpose of the document mentioned in the context?',
 'What is the content of the provided context?',
 'What is the purpose of the document?',
 'What does Section 3.16 of the contract discuss?',
 'What is the context about?',
 'What is the content of Section 6.02?',
 'What is the content of page 2 in the provided context?',
 "What are the provisions concerning the Sellers' Representative?",
 'What is the content of page 4 of the document?']

RAG's answer for the generated **questions**

In [175]:
eval_data = eval_dataset.to_csv("groundtruth_eval_dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [176]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("groundtruth_eval_dataset.csv")




df = df.rename(columns={'ground_truth':'ground_thruths','context':'contexts'})
# Process the data
# Assuming you have a list of answers called answers_list
df['answer'] = answer1  # Replace 'answers_list' with your actual list of answers

dtype={'contexts':str}
df['contexts']=df['contexts'].tolist()
# Write the processed data to a new CSV file
df.to_csv("ragas_data.csv", index=False)

In [177]:
df = pd.read_csv('ragas_data.csv')
df.head()

Unnamed: 0,question,contexts,ground_thruths,answer
0,What is the purpose of the document?,[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nAGR...,The purpose of the document is to outline the ...,Answer: The purpose of the document is to faci...
1,What is the purpose of the document mentioned ...,until\na\ndefinitive\nwritten\nagreement\nis\n...,The purpose of the document mentioned in the c...,Answer: The purpose of the document in the pro...
2,What is the content of the provided context?,TABLE\nOF\nCONTENTS\nARTICLE\nI\nDEFINITIONS;\...,The content of the provided context appears to...,Answer: The purpose of the document provided i...
3,What is the purpose of the document?,Section\n2.08\nEscrow\n19\nARTICLE\nIII\nREPRE...,The purpose of the document appears to be an a...,The context provided is a legal document relat...
4,What does Section 3.16 of the contract discuss?,Section\n3.11\nIntellectual\nProperty\n26\nSec...,Section 3.16 of the contract discusses 'Contra...,Answer: The content of Section 3.16 in the pro...


In [178]:
type(df['question'])

In [179]:
from datasets import Dataset



ragas_dataset = Dataset.from_csv('ragas_data.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [180]:
ragas_dataset

Dataset({
    features: ['question', 'contexts', 'ground_thruths', 'answer'],
    num_rows: 10
})

## Evaluation Using RAGAS

In [181]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

In [182]:
from ragas import evaluate
def evaluate_ragas_dataset(ragas_dataset):
  result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
  )
  return result

In [183]:
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

245493

In [184]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)



Evaluating:   0%|          | 0/70 [00:00<?, ?it/s]

In [186]:
basic_qa_result

{'context_precision': 0.2034, 'faithfulness': 1.0000, 'answer_relevancy': 0.6939, 'context_recall': 0.8500, 'context_relevancy': 0.0032, 'answer_correctness': 0.6101, 'answer_similarity': 0.9095}