In [265]:
%reload_ext autoreload
%autoreload 2

# Installing dependencies

In [266]:
!pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap
!pip install datasets
!pip install pypdf
!pip install openai
!pip install chromadb
!pip install langchain
!pip install tiktoken
!pip install -q -U datasets
!pip install -q -U tqdm
!pip install ragas
!pip install dataset
!pip install tqdm ragas tensorflow tensorflow-datasets pandas numpy scikit-learn



In [267]:
# Import libraries
import pandas as pd
import os
import datasets
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.document_loaders import PyPDFLoader

# Loading Document

In [268]:
import os

file_path = "/content/Raptor Contract.docx.pdf"

if not os.path.isfile(file_path):
    print("File does not exist.")

In [269]:
if not file_path.endswith(".pdf"):
    print("File is not a PDF file.")

In [270]:
# Load the dataset
loader = PyPDFLoader("/content/Raptor Contract.docx.pdf")
documents = loader.load()

# Getting api key

In [271]:

import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

Please provide your OpenAI Key: ··········


# storing a vector data

In [272]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=10,separators=["\n\n","\n",".", " ",""])

docs = text_splitter.split_documents(documents)

vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())

In [273]:
len(docs)

491

In [274]:
base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 50})

# prompt

In [275]:
from langchain.prompts import ChatPromptTemplate

template = """You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context,':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Defining a Retrieval-Augmented Question-Answering Chain with Langchain

In [276]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

# Testing the RAG

In [277]:
questions = [
    "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?",
    "How much is the escrow amount?",
    "Is escrow amount grete then the Retention Amount ?",
    "What is the purpose of the escrow?",
    "May the Escrow Amount serve as a recourse for the Buyer in case of breach of representations by the Company?",
    "Are there any conditions to the closing?",
    "Are Change of Control Payments considered a Seller Transaction Expense?",
    "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is",
    "Does the Buyer need to pay the Employees Closing Bonus Amount directly to the Companyâs employees?",
    "Does any of the Sellers provide a representation with respect to any Tax matters related to the Company?",
    "Is any of the Sellers bound by a non-competition covenant after the Closing?",
    "Whose consent is required for the assignment of the Agreement by the Buyer?",
    "Does the Buyer needs the Sellersâ consent in the event of an assignment of the Agreement to a third party who is not a Buyerâs Affiliates?"
]


In [278]:
answer=[]
for question in questions:
  result = retrieval_augmented_qa_chain.invoke({"question" : question})
  answer.append(result['response'].content)




In [279]:
answer

["The Sellers are responsible for a breach of representations and warranties under the following circumstances:\n1. If the breach is due to the Sellers' Representative's gross negligence, bad faith, or willful misconduct.\n2. The Sellers' Representative is not personally liable for any obligations of the Sellers under the agreement, except in cases of gross negligence, bad faith, or willful misconduct.\n3. Each Seller will severally (and not jointly) indemnify the Sellers' Representative from any losses arising out of its service as the Sellers' Representative, except for losses caused by the Sellers' Representative's gross negligence, bad faith, or willful misconduct.",
 'The escrow amount is not explicitly stated in the provided context. The exact amount of the escrow is not specified in the text.',
 'Based on the provided context, it is not explicitly stated whether the escrow amount is greater than the retention amount. The information in the context primarily focuses on the proced

# Ground Truth Dataset Creation Using GPT-4

In [287]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [288]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [289]:
question_generation_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [290]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context,':

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=docs[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [291]:
for k, v in output_dict.items():
  print(k)
  print(v)

question
What is the purpose of the document?
context
{'page_content': "[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nAGREEMENT\nBY\nAND\nAMONG\n[BUYER],\n[TARGET\nCOMP ANY],\nTHE\nSELLERS\nLISTED\nON\nSCHEDULE\nI\nHERET O\nAND\nTHE\nSELLERS’\nREPRESENT ATIVE\nNAMED\nHEREIN\nDated\nas\nof\n[●]\n[This\ndocument\nis\nintended\nsolely\nto\nfacilitate\ndiscussions\namong\nthe\nparties\nidentified\nherein. \nNeither\nthis\ndocument\nnor\nsuch\ndiscussions\nare\nintended\nto\ncreate,\nnor\nwill\nneither\nnor\nboth\nbe \ndeemed\nto\ncreate,\na\nlegally\nbinding\nor\nenforceable\noffer\nor\nagreement\nof\nany\ntype\nnor\nnature, \nunless\nand\nuntil\na'", 'metadata': {'source': '/content/Raptor Contract.docx.pdf', 'page': 0}}


In [292]:
from tqdm import tqdm

qac_triples = []

for text in tqdm(docs[:10]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 10/10 [00:04<00:00,  2.44it/s]


In [293]:
qac_triples[5]

{'question': 'What is the purpose of the document?',
 'context': Document(page_content='Section\n3.19\nLabor\nMatters\n37\nSection\n3.20\nLitigation;\nGovernment\nOrders\n37\nSection\n3.21\nInsurance\n38\nSection\n3.22\nNo\nBrokers\n38\nSection\n3.23\nFull\nDisclosure\n38\nARTICLE\nIV\nINDIVIDUAL\nREPRESENT ATIONS\nAND\nWARRANTIES\nOF\nTHE \nSELLERS.\n38\nSection\n4.01\nOrganization\n39\nSection\n4.02\nPower\nand\nAuthorization\n39\nSection\n4.03\nAuthorization\nof\nGovernmental\nAuthorities\n39\nSection\n4.04\nNoncontravention\n39\nSection\n4.05\nTitle\n39\nSection\n4.06\nNo\nBrokers\n40\nARTICLE\nV\nREPRESENT ATIONS\nAND\nWARRANTIES\nOF\nTHE\nBUYER.', metadata={'source': '/content/Raptor Contract.docx.pdf', 'page': 2})}

In [294]:
answer_generation_llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context,':

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [295]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

100%|██████████| 10/10 [00:58<00:00,  5.85s/it]


In [297]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})


eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [298]:
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 10
})

In [299]:
# extract the question from eval_dataset

questionss = eval_dataset["question"]
questionss

['What is the purpose of the document?',
 'What is the purpose of the document?',
 'What is the purpose of the document?',
 'What is the purpose of the document?',
 'What is the content of Section 3.16?',
 'What is the purpose of the document?',
 'What is the purpose of the document?',
 'What is the content of page 2 of the document?',
 'What are the sections mentioned in the context?',
 'What is the context about?']

RAG's answer for the generated **questions**

In [300]:
answer1=[]
for question in questionss:
  result = retrieval_augmented_qa_chain.invoke({"question" : question})
  answer1.append(result['response'].content)

In [301]:
len(answer1)

10

In [302]:
eval_data = eval_dataset.to_csv("groundtruth_eval_dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [303]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("groundtruth_eval_dataset.csv")


df['context'] = df['context'].apply(lambda x: [x])

df = df.rename(columns={'groundtruth':'groundthruths','context':'contexts'})
# Process the data
# Assuming you have a list of answers called answers_list
df['answer'] = answer1  # Replace 'answers_list' with your actual list of answers

dtype={'contexts':str}
df['contexts']=df['contexts'].tolist()
# Write the processed data to a new CSV file
df.to_csv("ragas_data.csv", index=False)

In [304]:
df = pd.read_csv('ragas_data.csv')
df.head()

Unnamed: 0,question,contexts,ground_truth,answer
0,What is the purpose of the document?,['[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nA...,The purpose of the document is to outline the ...,Answer: The purpose of the document is to outl...
1,What is the purpose of the document?,['until\na\ndefinitive\nwritten\nagreement\nis...,The purpose of the document is to outline the ...,The purpose of the document is to outline that...
2,What is the purpose of the document?,['TABLE\nOF\nCONTENTS\nARTICLE\nI\nDEFINITIONS...,The purpose of the document appears to be a co...,Answer: The purpose of the document is to outl...
3,What is the purpose of the document?,['Section\n2.08\nEscrow\n19\nARTICLE\nIII\nREP...,The purpose of the document appears to be an a...,Answer: The purpose of the document is to outl...
4,What is the content of Section 3.16?,['Section\n3.11\nIntellectual\nProperty\n26\nS...,The content of Section 3.16 is 'Contracts'. Ho...,Answer: The content of Section 3.16 is related...


In [305]:
from datasets import Dataset

ragas_dataset = Dataset.from_csv('ragas_data.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [306]:
ragas_dataset

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'answer'],
    num_rows: 10
})

## Evaluation Using RAGAS

In [307]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

In [308]:
from ragas import evaluate
def evaluate_ragas_dataset(ragas_dataset):
  result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
  )
  return result

In [310]:
result = evaluate_ragas_dataset(ragas_dataset)

ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Value'>

In [309]:
import tensorflow_datasets as tfds

import tensorflow as tf
from tensorflow_datasets.core.features import Features


# Convert the contexts feature to a sequence of
contexts_feature = tfds.features.Sequence({"feature": tfds.features.Text()})

# Define the features of the
features = tfds.features.Features({
    "question": tfds.features.Text(),
    "contexts": contexts_feature,
    "answer": tfds.features.ClassLabel(names=['No', 'Yes']),
    "ground_truth": tfds.features.ClassLabel(names=['No', 'Yes'])
})

# Convert the dataset to a TensorFlow dataset with the updated features
ragas_dataset = tfds.as_dataframe(ragas_dataset,
                                   column_names=['question', 'contexts', 'answer', 'ground_truth'],
                                   header=True,
                                   separator=',',
                                   features=features)

# Evaluate the dataset
result = evaluate_ragas_dataset(ragas_dataset)

ImportError: cannot import name 'Features' from 'tensorflow_datasets.core.features' (/usr/local/lib/python3.10/dist-packages/tensorflow_datasets/core/features/__init__.py)

In [None]:

result = evaluate_ragas_dataset(ragas_dataset)

ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Value'>