In [3]:
import re
from PyPDF2 import PdfReader

reader = PdfReader("Datasheet_for_Contract_Understanding_Atticus_Dataset_(CUAD)_v1.pdf")
all_text = []

for i, page in enumerate(reader.pages):
    try:
        page_text = page.extract_text()

        if page_text:
            # Cleanup starts here
            page_text = page_text.replace("\n", " ").replace("  ", " ")
            all_text.append(page_text)
        else:
            print(f"⚠️ No text extracted from page {i+1}")

    except Exception as e:
        print(f"❌ Error on page {i+1}: {e}")

# JOIN ALL PAGES
raw_text = "\n".join(all_text)

print(raw_text[:1000])


Datasheet for Contract Understanding Atticus Dataset (CUAD)   I. MOTIVATION   A. Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g. company,                  institution, organization)?   The Atticus Project is a non-profit organization whose mission is to harness the power of AI to accelerate                   accurate and efficient contract review. The Atticus Project started as a grassroots movement by experienced                lawyers in public companies and leading law firms aiming to achieve high-quality, low-cost, accurate and timely                 contract review using AI. It was officially incorporated as a California nonprofit public benefit corporation in                 January 2020.   B. Did they fund it themselves? If there is an associated grant, please provide the name of the grantor and                    the grant name and number.   The Atticus Project relies 100% on unpaid volunteers who are organized around the single mi

In [4]:
# Remove lines that are repeated too often (e.g., headers/footers)
def remove_repeated_lines(text):
    lines = text.split("\n")
    unique = []
    seen = set()

    for line in lines:
        normalized = line.strip()
        if len(normalized) > 0:
            if lines.count(line) > 3: 
                continue
        unique.append(line)

    return "\n".join(unique)

raw_text = remove_repeated_lines(raw_text)

# General text cleaning
def clean_text(text):
    text = re.sub(r"\s+", " ", text)          
    text = re.sub(r"[\x00-\x1F\x7F]", "", text)  
    text = text.replace("­", "")              
    return text

processed_text = clean_text(raw_text)





In [5]:
# Fix hyphenated words at line breaks
processed_text = re.sub(r"(\w+)-\s+(\w+)", r"\1\2", processed_text)


In [6]:
# Restore headings in all caps
def restore_headings(text):
    text = re.sub(r"(?<=\n)([A-Z][A-Z ]{3,})(?=\n)", r"\n## \1\n", text)
    return text

processed_text = restore_headings(processed_text)


In [7]:
processed_text[:1000]

'Datasheet for Contract Understanding Atticus Dataset (CUAD) I. MOTIVATION A. Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g. company, institution, organization)? The Atticus Project is a non-profit organization whose mission is to harness the power of AI to accelerate accurate and efficient contract review. The Atticus Project started as a grassroots movement by experienced lawyers in public companies and leading law firms aiming to achieve high-quality, low-cost, accurate and timely contract review using AI. It was officially incorporated as a California nonprofit public benefit corporation in January 2020. B. Did they fund it themselves? If there is an associated grant, please provide the name of the grantor and the grant name and number. The Atticus Project relies 100% on unpaid volunteers who are organized around the single mission of changing the legal industry by leveraging AI. C. For what purpose was the data set created? Was there

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import JsonOutputParser
from langchain_chroma import Chroma
parser = JsonOutputParser()
load_dotenv()

True

In [None]:
def chunk_contract(text):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,        
        chunk_overlap=200,     
        length_function=len,
        separators=[
            "\n\n",   
            "\n",   
            ". ",     
            " ",     
            ""      
        ]
    )
    return splitter.split_text(text)


In [10]:
model = ChatOpenAI(temperature=0)

In [11]:
chunks = chunk_contract(processed_text)
print(f"Total chunks: {len(chunks)}")


Total chunks: 34


In [12]:
clause_extraction_prompt = PromptTemplate(
    template="""
You are a legal analysis AI specializing in contract clause extraction.

TASK:
Extract the following clauses from the contract text. Return them EXACTLY as written in the document.

Document:
{text}

You must extract FULL CLAUSE BLOCKS. A clause block:
- starts at a clause heading or number (e.g., “12. Termination” or “Confidentiality”)
- continues until the next heading or section number

Extract in this JSON format:

{{
  "termination_conditions": "",
  "confidentiality_clauses": "",
  "liability_clauses": ""
}}

Rules:
- Extract the **entire clause**, not a sentence fragment.
- Ignore SEC boilerplate text like:
  “THIS EXHIBIT HAS BEEN REDACTED…”
- If multiple clauses exist, include the primary.
- If a clause does not exist, set its value to null.
- DO NOT rewrite or paraphrase.
- DO NOT add comments.
- Output VALID JSON only.
""",
    input_variables=["text"]
)


In [13]:
summary_prompt = PromptTemplate(
    template="""
You are an expert contract summarizer.

Write a clear and concise summary (100-150 words) of the contract below.

Your summary must include:
- Purpose of the agreement.
- Key obligations of each party.
- Any important risks, penalties, or termination conditions.

Do not quote the contract directly. Use plain English.
Return only the summary text.
- Output JSON only.

Contract:
{text}
""",
    input_variables=["text"]
)

In [14]:
extraction_chain = clause_extraction_prompt | model |parser

In [15]:
summary_chain = summary_prompt | model |parser

In [16]:
extraction_chain.invoke({"text": processed_text})

{'termination_conditions': 'VI. DISTRIBUTION A. Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created? If so, please provide a description. Yes. The dataset is open sourced and publicly available.',
 'confidentiality_clauses': 'F. Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g. because it was unavailable). Some clauses in the files are redacted because the party submitting these contracts redacted them to protect confidentiality.',
 'liability_clauses': None}

In [17]:
summary_chain.invoke({"text": processed_text})


{'summary': "The Atticus Project, a non-profit organization, created the Contract Understanding Atticus Dataset (CUAD) to develop AI algorithms for contract review in corporate transactions. The dataset includes 510 commercial contracts sourced from the SEC's EDGAR system, with 41 categories of legal clauses labeled by experienced lawyers. The dataset is open-source under a Creative Commons license and can be used for developing AI tools to aid legal professionals. The dataset should not be used for contract drafting, management, dispute resolution, or legal advice. Updates and maintenance will be communicated on The Atticus Project website. Contributions to the dataset are welcome and can be submitted via email.",
 'word_count': 100}

Semantic search over clauses using embeddings.

In [18]:
# embeddings for semantic search
embedding_model = OpenAIEmbeddings()
vector_store = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
    persist_directory="chroma_store"
)


In [None]:
# querying the vector store
user_input = input("Enter your query: ")
query = user_input
results = vector_store.similarity_search(query, k=1)

for r in results:
    print(r.page_content)

. (“Party A”); Party B Corp. (“Party B”)”. Some sentences in the files include confidential legends that are not part of the contracts. An example of such confidential legend is as follows: THIS EXHIBIT HAS BEEN REDACTED AND IS THE SUBJECT OF A CONFIDENTIAL TREATMENT REQUEST. REDACTED MATERIAL IS MARKED WITH [* * *] AND HAS BEEN FILED SEPARATELY WITH THE SECURITIES AND EXCHANGE COMMISSION. Some sentences in the files contain irrelevant information such as footers or page numbers. Some sentences may not be relevant to the corresponding category. Some sentences may correspond to a different category. Because many legal clauses are very long and contain various sub-parts, sometimes only a sub-part of a sentence is responsive to a category. To address the foregoing limitations, annotators manually deleted the portion that is not responsive, replacing it with the symbol "<omitted>" to indicate that the two text segments do not appear immediately next to each other in the contracts
