In [4]:
from paddleocr import PaddleOCR, draw_ocr

pdf_path = r"/home/ajay/contracts_v2/Sample Agreements/Celebrity Contract_3.pdf"

from backend.Doc_Processor.processors.pdf_processor import PDFProcessor
from backend.contract_analyzer.config import Config


processor_config = {
    "pdf": {
        "ocr_enabled": Config.PROCESSOR_CONFIG.ocr_enabled,
        "language": Config.PROCESSOR_CONFIG.language,
        "dpi": Config.PROCESSOR_CONFIG.dpi,
    },
    "image": {
        "ocr_language": Config.PROCESSOR_CONFIG.language,
        "preprocessing_steps": ["denoise", "deskew", "contrast"],
    },
    "structured": {"schema_validation": True},
}

pdf_processor = PDFProcessor(config=processor_config["pdf"])
res = pdf_processor.process(file_path=pdf_path)




Processing PDF file: /home/ajay/contracts_v2/Sample Agreements/Celebrity Contract_3.pdf


100%|██████████| 5/5 [00:23<00:00,  4.78s/it]


In [8]:
from docx import Document
from pathlib import Path
from typing import Dict , Any


class StructuredProcessor():
    SUPPORTED_FORMATS = {
        '.json': 'json',
        '.xml': 'xml',
        '.xlsx': 'excel',
        '.xls': 'excel',
        '.md': 'markdown',
        '.txt': 'text',
        '.docx': 'docx',  # Added support for .docx
        '.doc': 'docx'  # Added support for .doc    
    }
    
    
    
    def _process_docx(self, file_path: Path) -> Dict[str, Any]:
        """Process DOCX files."""
        doc = Document(file_path)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        word_count = sum(len(para.split()) for para in paragraphs)
        
        return {
            'content': '\n'.join(paragraphs),
            'metadata': {
                'format': 'docx',
                'size': file_path.stat().st_size,
                'paragraphs': len(paragraphs),
                'word_count': word_count
            }
        }


In [50]:
path = r"/home/ajay/contracts_v2/Sample Agreements/Confidentiality and Non-Disclosure Agreement.docx"
sp = StructuredProcessor()
res = sp._process_docx(Path(path))

In [51]:
print(res["content"])

Exhibit (e)(2)
CONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT
AGREEMENT (this “Agreement”) is between The Pep Boys – Manny, Moe & Jack (on its own behalf and on behalf of its subsidiaries and affiliates, “Pep Boys”) and Icahn Enterprises L.P. (on its own behalf and on behalf of its subsidiaries and controlled affiliates, “Interested Party”), dated December 8, 2015.
WHEREAS, on December 7, 2015, Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash (as set forth in Pep Boys filing with the Securities and Exchange Commission dated December 7, 2015), which Pep Boys Board of Directors determined on December 7, 2015, in accordance with its obligations under that certain Agreement and Plan of Merger, dated as of October 26, 2015 (the “Bridgestone Agreement”), by and among Pep Boys, Bridgestone Retail Operations, LLC, a Delaware limited liability company (“Parent”), TAJ Acquisition Co., a Pennsylvania corporation and wholly-owned subsidiary of Parent, would reaso

In [13]:
def extract_document_structure(text):
    """
    Extract document structure into a dictionary with sections, subsections, and content.
    
    Args:
        text (str): Input text to process
        
    Returns:
        dict: Nested dictionary with document structure
    """
    # Split text into lines
    lines = text.split('\n')
    
    # Initialize variables
    structure = {}
    current_section = None
    current_subsection = None
    current_content = []
    
    def is_main_heading(line):
        """Check if line is a main heading (starts with capital letter, no period at end)"""
        return (line.strip() and 
                line.strip()[0].isupper() and 
                not line.strip().endswith('.') and 
                len(line.strip().split()) == 1)
    
    def is_subheading(line):
        """Check if line is a subheading (contains descriptive text after heading)"""
        return (line.strip() and 
                line.strip()[0].isupper() and 
                ':' in line)
    
    # Process each line
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        if is_main_heading(line):
            # If there was previous content, save it
            if current_section and current_content:
                if current_subsection:
                    structure[current_section][current_subsection] = ' '.join(current_content)
                else:
                    # For sections without subsections, use "General" as subsection
                    structure[current_section] = {"General": ' '.join(current_content)}
            
            current_section = line
            current_subsection = None
            current_content = []
            if current_section not in structure:
                structure[current_section] = {}
            
        elif is_subheading(line):
            # If there was previous content, save it
            if current_section and current_subsection and current_content:
                structure[current_section][current_subsection] = ' '.join(current_content)
            
            parts = line.split(':', 1)
            current_subsection = parts[0].strip()
            current_content = [parts[1].strip()] if len(parts) > 1 and parts[1].strip() else []
            
        elif current_section:
            current_content.append(line)
            
    # Save final content if exists
    if current_section and current_content:
        if current_subsection:
            structure[current_section][current_subsection] = ' '.join(current_content)
        else:
            structure[current_section] = {"General": ' '.join(current_content)}
    
    return structure

result = extract_document_structure(res["content"])

# Print the result in a formatted way
import json
print(json.dumps(result, indent=2))

{}


In [29]:
import re
import json
from collections import OrderedDict

def split_contract_sections(contract_text):
    """
    Splits a contract text into sections and returns them as an ordered JSON.

    Args:
        contract_text: The contract text as a string.

    Returns:
        A JSON string representing the contract sections as an ordered dictionary.
        Returns an empty JSON object if no sections are identified.
    """

    if not contract_text:
        return json.dumps({})  # Return empty JSON for empty input

    section_headings = [
        r"^\s*(\d+)\.\s*([A-Z].*)?",  # Numbered sections like 1., 2., (with optional title after)
        r"ARTICLE\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)",  # ARTICLE 1 AGREEMENT, ARTICLE II: DEFINITIONS, ARTICLE 3- TERM
        r"SECTION\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)",  # SECTION 1.1 Definitions, SECTION 2- Term
        r"CLAUSE\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)",   # CLAUSE 1: Confidentiality
        r"ITEM\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)",    # ITEM 1. Governing Law
        r"CHAPTER\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)", # CHAPTER 1: Scope
        r"PART\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)",   # PART 1 - Introduction
        r"SCHEDULE\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)", # SCHEDULE 1: Payment Terms
        r"APPENDIX\s+\d+\s*[:\.\-]?\s*([A-Z\s]+)",# APPENDIX A: Exhibit
        r"EXHIBIT\s+[A-Z]\s*[:\.\-]?\s*([A-Z\s]+)", # EXHIBIT A: Services
        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*$", # Standalone Title Case Headings (less reliable, use cautiously) e.g., Definitions, Term and Termination
        r"(\d+\.\s+[A-Z][a-zA-Z\s]+)", # Numbered headings like 1. Definitions, 2. Term (less common now, but kept for broader coverage)
        r"(\([a-z]\)\s+[A-Z][a-zA-Z\s]+)", # Lettered headings like (a) Definitions, (b) Term
        r"([A-Z][a-z]+\s+[A-Z][a-z]+):", # Headings ending with colon (Definitions: , Term and Termination:)
        r"([A-Z][a-z]+\s+[A-Z][a-z]+)\." # Headings ending with period (Definitions. , Term and Termination.)
        # Add more patterns as needed based on your contract types
    ]

    sections = OrderedDict()
    current_section_name = "Preamble"  # Default section for text before the first heading
    current_section_text = ""

    lines = contract_text.strip().splitlines()

    for line in lines:
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        is_section_heading = False
        for pattern in section_headings:
            match = re.match(pattern, line, re.IGNORECASE)
            if match:
                is_section_heading = True
                if pattern == r"^\s*(\d+)\.\s*([A-Z].*)?": # Special handling for numbered sections
                    section_number = match.group(1).strip()
                    section_title = match.group(2).strip() if match.group(2) else section_number # Use number as title if no title after number
                    current_section_name = f"Section {section_number}" # Section 1, Section 2, etc.
                else:
                    section_title = match.group(1).strip() if len(match.groups()) > 0 and match.group(1) else line.strip() # Extract title or use the whole line as title if group 1 is not available
                    current_section_name = section_title

                sections[current_section_name] = current_section_text.strip() if current_section_text.strip() else "" # Save previous section
                current_section_text = ""
                break # Found a heading, no need to check other patterns

        if not is_section_heading:
            current_section_text += line + "\n"

    sections[current_section_name] = current_section_text.strip() if current_section_text.strip() else "" # Save the last section

    # Remove Preamble if it's empty and not intended to be captured
    if "Preamble" in sections and not sections["Preamble"].strip():
        del sections["Preamble"]

    return json.dumps(sections, indent=4) # indent=4 for pretty JSON



In [28]:
try:
    contract_sections = split_contract_sections(res["content"])
    print(contract_sections)
except Exception as e:
    # print traceback 
    print(e)

{
    "Section 1": "Exhibit\u00a0(e)(2)\nCONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT\nAGREEMENT (this \u201cAgreement\u201d) is between The Pep Boys \u2013 Manny, Moe\u00a0& Jack (on its own behalf and on behalf of its subsidiaries and affiliates, \u201cPep Boys\u201d) and Icahn Enterprises L.P. (on its own behalf and on behalf of its subsidiaries and controlled affiliates, \u201cInterested Party\u201d), dated December\u00a08, 2015.\nWHEREAS, on December\u00a07, 2015,\u00a0Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash (as set forth in Pep Boys filing with the Securities and Exchange Commission dated December\u00a07, 2015), which Pep Boys Board of Directors determined on December\u00a07, 2015, in accordance with its obligations under that certain Agreement and Plan of Merger, dated as of October\u00a026, 2015 (the \u201cBridgestone Agreement\u201d), by and among Pep Boys, Bridgestone Retail Operations, LLC, a Delaware limited liability company (\

In [30]:
text_2 = sp._process_docx(Path(r"/home/ajay/contracts_v2/Sample Agreements/Consulting Agreement.docx"))["content"]

In [32]:
print(text_2)

                                                                    EXHIBIT 10.1
                Form of Consultant Agreement for Simon Westbrook
                              CONSULTING AGREEMENT
This Consulting Agreement  ("AGREEMENT") is entered into on September 1, 2011 by
SIMON WESTBROOK ("CONSULTANT"), of 10 Timber Ridge Lane, Scotts Valley, CA 95066
and IN MEDIA CORPORATION.,  a Delaware corporation,  having a mailing address of
4920 El Camino Real,  Suite 100, Los Altos, CA 94022  ("COMPANY"),  based on the
following mutual understanding:
WHEREAS,  the  Consultant  has been  providing  services  since February 1, 2010
without receiving compensation; and
WHEREAS,  the Company and the Consultant wish to memorialize  compensation owed,
an ex-gratia bonus, and memorialize compensation going forward;
THEREFORE,  In consideration of the mutual promises  contained herein and on the
terms and conditions hereinafter set forth, the Consultant and the Company agree
as follows:
ARTICLE 1 -

In [40]:
from ollama import chat

response = chat(
    model="llama3.1",
    messages=[
        {
            "role": "user",
            "content": "From the following agreement, please extract all the sections and subsections." + text_2 + "Do not add any additional information. \n Don not do any Analysis."
        }
    ]
)

print(response["message"]["content"])


Here is a neutral presentation of the provided text:

**Consulting Agreement**

**ARTICLE 1 - PURPOSE AND SCOPE**

The Consultant will provide services to In Media Corporation as its Acting CFO.

**ARTICLE 2 - TERM AND TERMINATION**

The term of this agreement is [not specified].

**ARTICLE 3 - COMPENSATION**

The Consultant shall receive compensation for their services, including options to purchase shares of common stock of the Company.

**ARTICLE 4 - EXPENSES**

The Consultant shall be responsible for maintaining insurance and paying expenses related to their work.

**ARTICLE 5 - INDEMNIFICATION**

The Consultant agrees to indemnify and hold harmless In Media Corporation from any claims, losses, or damages arising out of the performance of this agreement.

**ARTICLE 6 - INSURANCE**

The Consultant must maintain certain types of insurance during the term of this agreement.

**ARTICLE 7 - COMPLETION**

This agreement will be considered complete when all work and services have been sat

In [49]:
from ollama import chat

chunks_size = 3000

for i in range(0, len(text_2), chunks_size):
    response = chat(
        model="llama3.1",
        messages=[
            {
                "role": "user",
                "content": f"""From the following agreement, please extract the content and give a json format" + f{text_2[i:i+chunks_size]} + "Do not add any additional information. \n Don not do any Analysis.
                
                Format:
                {{
                    "section1": "content1",
                    "section2": "content2",
                    ...
                }}
                """
            }
        ]
    )
    print("--------------------------------------------")
    print(response["message"]["content"])

# response = chat(
#     model="llama3.1",
#     messages=[
#         {
#             "role": "user",
#             "content": "From the following agreement, please extract all the sections and subsections." + text_2 + "Do not add any additional information. \n Don not do any Analysis. \n Return the section names and their content. Note that the content should be in the same order as the sections. and the content should not change."
#         }
#     ]
# )

# print(response["message"]["content"])


--------------------------------------------
Here is the extracted content in JSON format:

```
{
    "ARTICLE 1 - DESCRIPTION OF CONSULTING SERVICES": "The nature and scope of the consulting services to be performed hereunder are as set forth in EXHIBIT A, STATEMENT OF WORK, attached hereto.",
    "ARTICLE 2 - TERM OF AGREEMENT": "This Agreement commences on the date first set forth above and expires pursuant to the termination provisions herein.",
    "ARTICLE 3 - OVERSIGHT OF CONSULTING SERVICES AND CONTRACTUAL AUTHORITY": "The Consultant agrees that it will generally provide the specified consulting services through during the term specified herein within the scope of EXHIBIT A, STATEMENT OF WORK.",
    "ARTICLE 4 - COMPENSATION, BILLING AND PAYMENT": "The Company recognizes that, for bona fide services rendered from January 1, 2011 through August 31, 2011, the Consultant has accrued $64,000 in unpaid compensation. In addition, in recognition of the services provided by the Consult

In [43]:
len(text_2)

23111

In [52]:
from ollama import chat

chunks_size = 3000

text_3 = res["content"]

for i in range(0, len(text_3), chunks_size):
    response = chat(
        model="llama3.1",
        messages=[
            {
                "role": "user",
                "content": f"""From the following agreement, please extract the content and give a json format" + f{text_3[i:i+chunks_size]} + "Do not add any additional information. \n Don not do any Analysis.
                
                Format:
                {{
                    "section1": "content1",
                    "section2": "content2",
                    ...
                }}
                """
            }
        ]
    )
    print("--------------------------------------------")
    print(response["message"]["content"])

# response = chat(
#     model="llama3.1",
#     messages=[
#         {
#             "role": "user",
#             "content": "From the following agreement, please extract all the sections and subsections." + text_2 + "Do not add any additional information. \n Don not do any Analysis. \n Return the section names and their content. Note that the content should be in the same order as the sections. and the content should not change."
#         }
#     ]
# )

# print(response["message"]["content"])


--------------------------------------------
Here is the extracted content in JSON format:

```
{
  "Agreement_Name": "CONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT",
  "Parties": [
    {"name": "The Pep Boys – Manny, Moe & Jack", "description": "on its own behalf and on behalf of its subsidiaries and affiliates"},
    {"name": "Icahn Enterprises L.P.", "description": "on its own behalf and on behalf of its subsidiaries and controlled affiliates"}
  ],
  "Date": "December 8, 2015",
  "Whereas_1": "Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash.",
  "Whereas_2": "Pep Boys Board of Directors determined that the proposal would reasonably be expected to result in a Superior Proposal (as defined in the Bridgestone Agreement).",
  "Whereas_3": "In accordance with Section 8.3(b) of the Bridgestone Agreement, Pep Boys is permitted to furnish information to or enter into discussions with Interested Party.",
  "Now_Therefore": {
    "Section_1": "Each party 

In [56]:
from ollama import chat
import json
import re

def process_text_chunks(text, chunk_size=3000):
    """Process text in chunks and get JSON responses from Ollama API"""
    all_responses = []
    
    # Process text in chunks
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        
        response = chat(
            model="llama3.1",
            messages=[
                {
                    "role": "user",
                    "content": f"""From the following agreement, please extract the content and give a json format" + f{chunk} + "Do not add any additional information. \n Don not do any Analysis.
                    
                    Format:
                    {{
                        "section1": "content1",
                        "section2": "content2",
                        ...
                    }}
                    """
                }
            ]
        )
        print("--------------------------------------------")
        print(response["message"]["content"])
        all_responses.append(response["message"]["content"])
    
    return "\n".join(all_responses)

def clean_json_output(text):
    """Clean and process JSON output from the API responses"""
    # Find all JSON content between triple backticks
    json_pattern = r'```\s*\{.*?\}\s*```'
    json_matches = re.finditer(json_pattern, text, re.DOTALL)
    
    all_json_content = []
    for match in json_matches:
        # Extract the JSON string and remove backticks
        json_str = match.group().strip('`').strip()
        try:
            # Parse the JSON
            json_content = json.loads(json_str)
            all_json_content.append(json_content)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            continue
    
    # Merge all JSON content
    final_json = {}
    for content in all_json_content:
        # Handle nested dictionaries and lists properly
        for key, value in content.items():
            if key in final_json:
                if isinstance(final_json[key], dict) and isinstance(value, dict):
                    final_json[key].update(value)
                elif isinstance(final_json[key], list) and isinstance(value, list):
                    final_json[key].extend(value)
                else:
                    final_json[key] = value
            else:
                final_json[key] = value
    
    return final_json

def process_agreement(text):
    """Main function to process agreement text and return final JSON"""
    # Step 1: Process text in chunks and get API responses
    api_responses = process_text_chunks(text)
    
    # Step 2: Clean and combine JSON from responses
    final_json = clean_json_output(api_responses)
    
    return final_json

# Usage example
if __name__ == "__main__":
    # Your input text
    text_3 = res["content"]  # Replace with your actual input text
    
    # Process the agreement
    final_result = process_agreement(text_3)
    
    # Print the final JSON in a nicely formatted way
    print("\nFinal Processed JSON:")
    print(json.dumps(final_result, indent=2))

--------------------------------------------
Here is the extracted content in JSON format:

```
{
  "AGREEMENT": "This Agreement is between The Pep Boys – Manny, Moe & Jack and Icahn Enterprises L.P.",
  "WHEREAS": [
    "on December 7, 2015, Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash",
    "Pep Boys Board of Directors determined that the proposal would reasonably be expected to result in a Superior Proposal",
    "in accordance with Section 8.3(b) of the Bridgestone Agreement, Pep Boys is permitted to furnish information to or enter into discussions with Interested Party"
  ],
  "NOW": {
    "THEREFORE": "for and in consideration of the promises and mutual obligations contained herein, the parties agree as follows",
    "ARTICLE1": [
      "Each party on its own behalf understands that they will receive certain Confidential Information from the other party",
      "Confidential Information includes any of the Disclosing Party's trade secrets,

In [None]:
print(json.dumps(final_result, indent=2))

In [None]:
from ollama import chat
import json
import re
from collections import defaultdict

def process_text_chunks(text, chunk_size=3000):
    """Process text in chunks and get JSON responses from Ollama API"""
    all_responses = []
    
    # Process text in chunks
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        
        response = chat(
            model="llama3.1",
            messages=[
                {
                    "role": "user",
                    "content": f"""From the following agreement, please extract the content and give a json format" + f{chunk} + "Do not add any additional information. \n Don not do any Analysis.
                    
                    Format:
                    {{
                        "section1": "content1",
                        "section2": "content2",
                        ...
                    }}
                    """
                }
            ]
        )
        print("--------------------------------------------")
        print(response["message"]["content"])
        all_responses.append(response["message"]["content"])
    
    return "\n".join(all_responses)

def clean_json_output(text):
    """Clean and process JSON output from the API responses"""
    # Find all JSON content between triple backticks
    json_pattern = r'```\s*\{.*?\}\s*```'
    json_matches = re.finditer(json_pattern, text, re.DOTALL)
    
    # Use defaultdict to handle repeated sections
    section_counter = defaultdict(int)
    final_json = {}
    
    for match in json_matches:
        try:
            # Extract and parse JSON
            json_str = match.group().strip('`').strip()
            json_content = json.loads(json_str)
            
            # Process each key-value pair
            for key, value in json_content.items():
                # Handle different types of values
                if isinstance(value, dict):
                    # For nested dictionaries
                    if key not in final_json:
                        final_json[key] = {}
                    for sub_key, sub_value in value.items():
                        section_counter[f"{key}_{sub_key}"] += 1
                        count = section_counter[f"{key}_{sub_key}"]
                        new_sub_key = f"{sub_key}_{count}" if count > 1 else sub_key
                        final_json[key][new_sub_key] = sub_value
                
                elif isinstance(value, list):
                    # For lists, extend existing list or create new one
                    if key not in final_json:
                        final_json[key] = []
                    # Remove duplicates while preserving order
                    new_items = [item for item in value if item not in final_json[key]]
                    final_json[key].extend(new_items)
                
                else:
                    # For simple values
                    section_counter[key] += 1
                    count = section_counter[key]
                    new_key = f"{key}_{count}" if count > 1 else key
                    final_json[new_key] = value
                    
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            continue
    
    return final_json

def reorganize_sections(json_data):
    """Reorganize sections to maintain proper order and structure"""
    organized = {}
    
    # Sort sections that should be grouped together
    section_groups = defaultdict(list)
    
    for key, value in json_data.items():
        # Extract base section name (remove _1, _2, etc.)
        base_name = re.sub(r'_\d+$', '', key)
        section_groups[base_name].append((key, value))
    
    # Process each group
    for base_name, sections in section_groups.items():
        if len(sections) == 1:
            # If only one section, use original name
            organized[sections[0][0]] = sections[0][1]
        else:
            # If multiple sections, create a numbered list
            if isinstance(sections[0][1], dict):
                # For nested dictionaries
                organized[base_name] = {}
                for key, value in sections:
                    organized[base_name].update(value)
            else:
                # For regular sections
                organized[base_name] = [value for _, value in sorted(sections)]
    
    return organized

def process_agreement(text):
    """Main function to process agreement text and return final JSON"""
    # Step 1: Process text in chunks and get API responses
    api_responses = process_text_chunks(text)
    
    # Step 2: Clean and combine JSON from responses
    merged_json = clean_json_output(api_responses)
    
    # Step 3: Reorganize sections
    final_json = reorganize_sections(merged_json)
    
    return final_json



--------------------------------------------
Here is the extracted content in JSON format:

```
{
    "AGREEMENT TITLE": "CONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT",
    "DATE": "December 8, 2015",
    "PARTIES": {
        "Pep Boys - Manny, Moe & Jack": {
            "capacity": "on its own behalf and on behalf of its subsidiaries and affiliates"
        },
        "Icahn Enterprises L.P.": {
            "capacity": "on its own behalf and on behalf of its subsidiaries and controlled affiliates"
        }
    },
    "WHEREAS CLAUSES": [
        "Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash",
        "Pep Boys Board of Directors determined that the proposal would reasonably be expected to result in a Superior Proposal",
        "Pep Boys is permitted to furnish information to or enter into discussions with Interested Party, subject to compliance with Section 8.3(b) of the Bridgestone Agreement"
    ],
    "AGREEMENT TERMS": {
        "1. Confi

In [63]:
# Usage example

text_3 = res["content"]  # Replace with your actual input text

# Process the agreement
final_result = process_agreement(text_3)

# Print the final JSON in a nicely formatted way
print("\nFinal Processed JSON:")
print(json.dumps(final_result, indent=2))

--------------------------------------------
Here is the extracted content in JSON format:

```
{
  "confidentialityAndNonDisclosureAgreement": "CONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT",
  "agreementDescription": "AGREEMENT (this “Agreement”) is between The Pep Boys – Manny, Moe & Jack (on its own behalf and on behalf of its subsidiaries and affiliates, “Pep Boys”) and Icahn Enterprises L.P. (on its own behalf and on behalf of its subsidiaries and controlled affiliates, “Interested Party”), dated December 8, 2015.",
  "whereas1": "WHEREAS, on December 7, 2015, Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash (as set forth in Pep Boys filing with the Securities and Exchange Commission dated December 7, 2015), which Pep Boys Board of Directors determined on December 7, 2015, in accordance with its obligations under that certain Agreement and Plan of Merger, dated as of October 26, 2015 (the “Bridgestone Agreement”), by and among Pep Boys, Bridges

In [64]:
print(json.dumps(final_result, indent=2))

{
  "confidentialityAndNonDisclosureAgreement": "CONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT",
  "agreementDescription": "AGREEMENT (this \u201cAgreement\u201d) is between The Pep Boys \u2013 Manny, Moe\u00a0& Jack (on its own behalf and on behalf of its subsidiaries and affiliates, \u201cPep Boys\u201d) and Icahn Enterprises L.P. (on its own behalf and on behalf of its subsidiaries and controlled affiliates, \u201cInterested Party\u201d), dated December\u00a08, 2015.",
  "whereas1": "WHEREAS, on December\u00a07, 2015, Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash (as set forth in Pep Boys filing with the Securities and Exchange Commission dated December\u00a07, 2015), which Pep Boys Board of Directors determined on December\u00a07, 2015, in accordance with its obligations under that certain Agreement and Plan of Merger, dated as of October\u00a026, 2015 (the \u201cBridgestone Agreement\u201d), by and among Pep Boys, Bridgestone Retail Operation

In [65]:
# Usage example


# Process the agreement
final_result = process_agreement(text_2)

# Print the final JSON in a nicely formatted way
print("\nFinal Processed JSON:")
print(json.dumps(final_result, indent=2))

--------------------------------------------
Here is the extracted content in JSON format:

```
{
    "AGREEMENT": "This Consulting Agreement (\"AGREEMENT\") is entered into on September 1, 2011 by SIMON WESTBROOK (\"CONSULTANT\"), of 10 Timber Ridge Lane, Scotts Valley, CA 95066 and IN MEDIA CORPORATION., a Delaware corporation, having a mailing address of 4920 El Camino Real, Suite 100, Los Altos, CA 94022 (\"COMPANY\")",
    "WHEREAS": "The Consultant has been providing services since February 1, 2010 without receiving compensation; and The Company and the Consultant wish to memorialize compensation owed, an ex-gratia bonus, and memorialize compensation going forward;",
    "ARTICLE 1 - DESCRIPTION OF CONSULTING SERVICES": "The nature and scope of the consulting services to be performed hereunder are as set forth in EXHIBIT A, STATEMENT OF WORK, attached hereto.",
    "ARTICLE 2 - TERM OF AGREEMENT": "This Agreement commences on the date first set forth above and expires pursuant to

In [66]:
print(json.dumps(final_result, indent=2))

{
  "AGREEMENT": "This Consulting Agreement (\"AGREEMENT\") is entered into on September 1, 2011 by SIMON WESTBROOK (\"CONSULTANT\"), of 10 Timber Ridge Lane, Scotts Valley, CA 95066 and IN MEDIA CORPORATION., a Delaware corporation, having a mailing address of 4920 El Camino Real, Suite 100, Los Altos, CA 94022 (\"COMPANY\")",
  "WHEREAS": "The Consultant has been providing services since February 1, 2010 without receiving compensation; and The Company and the Consultant wish to memorialize compensation owed, an ex-gratia bonus, and memorialize compensation going forward;",
  "ARTICLE 1 - DESCRIPTION OF CONSULTING SERVICES": "The nature and scope of the consulting services to be performed hereunder are as set forth in EXHIBIT A, STATEMENT OF WORK, attached hereto.",
  "ARTICLE 2 - TERM OF AGREEMENT": "This Agreement commences on the date first set forth above and expires pursuant to the termination provisions herein.",
  "ARTICLE 3 - OVERSIGHT OF CONSULTING SERVICES AND CONTRACTUAL AU

In [67]:
# Usage example

with open(r"/home/ajay/contracts_v2/backend/processed_files/tmpou5vb1sj.txt", "r") as file:
    text_2 = file.read()

# Process the agreement
final_result = process_agreement(text_2)

# Print the final JSON in a nicely formatted way
print("\nFinal Processed JSON:")
print(json.dumps(final_result, indent=2))

--------------------------------------------
Here is the extracted content in JSON format:

```
{
  "Agreement Type": "CONFIDENTIAL EXECUTION DOCUMENT: MDC\757175_1 SERVICE AGREEMENT OCTOBER 2015",
  "Parties Involved":
  [
    {
      "Name": "ABCDEF MANUFACTURING PJSC",
      "Type": "Private Joint Stock Company",
      "Location": "Emirate of Abu Dhabi, United Arab Emirates"
    },
    {
      "Name": "XYZ LIMITED",
      "Type": "Company",
      "Location": "India, Bangalore-560 059"
    }
  ],
  "Date": "3O October 2015",
  "Whereas":
  [
    {
      "Clause A": "The Contractor is engaged in the business of providing information technology services and has considerable skill, knowledge and experience in that field to perform such services;"
    },
    {
      "Clause B": "In reliance upon that skill, knowledge and experience, the Company has agreed to engage the Contractor to provide the Services (as defined below) to the Company or any other person or entity designated by the Com

In [69]:
print(json.dumps(final_result, indent=2))

{
  "definitions": {
    "intellectual property rights": "any formation (whether registered or unregistered), applications for registration, and the right to apply for registration, for any of these rights; and all other intellectual property rights and equivalent or similar forms of protection existing anywhere in the world",
    "mubadala": "Mubadala Development Company PJSC",
    "project materials": "any works and materials created, developed, written or prepared by the Contractor, its employees, agents or subcontractors in relation to, or as part of, the performance of the Services (whether individually, collectively or jointly with the Company and on whatever media) including the Deliverables and any computer software programs, reports, studies, data, databases, diagrams, charts, specifications, pre-contractual and contractual documents and all drafts thereof and working papers relating thereto, but excluding the Contractor's ordinary correspondence, know-how, methodology and too

In [70]:
final_result

{'definitions': {'intellectual property rights': 'any formation (whether registered or unregistered), applications for registration, and the right to apply for registration, for any of these rights; and all other intellectual property rights and equivalent or similar forms of protection existing anywhere in the world',
  'mubadala': 'Mubadala Development Company PJSC',
  'project materials': "any works and materials created, developed, written or prepared by the Contractor, its employees, agents or subcontractors in relation to, or as part of, the performance of the Services (whether individually, collectively or jointly with the Company and on whatever media) including the Deliverables and any computer software programs, reports, studies, data, databases, diagrams, charts, specifications, pre-contractual and contractual documents and all drafts thereof and working papers relating thereto, but excluding the Contractor's ordinary correspondence, know-how, methodology and tools",
  'serv

In [None]:
import chromadb
import tiktoken
from sentence_transformers import SentenceTransformer
from typing import List, Optional, Dict, Any
import logging
from functools import lru_cache
import os
import re
from chromadb.utils import embedding_functions
from backend.contract_analyzer.config import Config
from backend.Doc_Processor.processors.text_pre_processor import process_agreement

logger = logging.getLogger(__name__)

import re

    
class VectorDB:
    """Core vector database operations"""

    def __init__(self):
        """Initialize database components"""
        self.active_collection = None
        self._init_components()
        self.logger = logging.getLogger(__name__)

    def _init_components(self):
        """Initialize required database components"""
        try:
            db_path = str(Config.CHROMA_DB_PATH)
            os.makedirs(db_path, exist_ok=True)
            
            self.client = chromadb.PersistentClient(path=db_path)
            self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
            
        except Exception as e:
            self.logger.error(f"VectorDB initialization failed: {str(e)}")
            raise

    @lru_cache(maxsize=100)
    def _compute_embedding(self, text: str) -> List[float]:
        """
        Compute embedding for text with caching
        
        Args:
            text: Text to embed
            
        Returns:
            List of embedding values
        """
        return self.model.encode([text], normalize_embeddings=True).tolist()[0]


    def create_collection(self, collection_name: str) -> bool:
        """
        Create a new collection
        
        Args:
            collection_name: Name for the collection
            
        Returns:
            Success status
        """
        try:
            safe_name = self._sanitize_collection_name(collection_name)
            # print("creating collection ", safe_name)
            if not self._collection_exists(safe_name):
                logging.info(f"Creating new collection: {safe_name}")
                self.active_collection = self.client.create_collection(
                    name=safe_name,
                    embedding_function= self.embedding_fn,
                    metadata={"name": safe_name}
                    )
                logging.info(f"Created new collection: {safe_name}")
                self.logger.info(f"Created new collection: {safe_name}")
            else:
                self.active_collection = self.client.get_collection(name=safe_name)
                self.logger.info(f"Using existing collection: {safe_name}")
            return True
            
        except Exception as e:
            self.logger.error(f"Collection creation failed: {str(e)}")
            return False

    def set_active_collection(self, collection_name: str) -> bool:
        """
        Set the active collection for operations
        
        Args:
            collection_name: Name of collection to activate
            
        Returns:
            Success status
        """
        try:
            safe_name = self._sanitize_collection_name(collection_name)
            if not self._collection_exists(safe_name):
                self.logger.error(f"Collection not found: {safe_name}")
                return False
                
            self.active_collection = self.client.get_collection(
                name=safe_name,
                embedding_function= self.embedding_fn
                )
            self.logger.info(f"Set active collection to: {safe_name}")
            return True
            
        except Exception as e:
            self.logger.error(f"Failed to set active collection: {str(e)}")
            return False

    
    def add_documents(
        self, 
        texts: str,
    ) -> bool:
        """
        Add documents to the active collection
        
        Args:
            docs: List of documents to add
            metadatas: Optional metadata for each document
            
        Returns:
            Success status
        """
        if not self.active_collection:
            print("********No active collection")
            self.logger.error("No active collection")
            return False
            
        try:
            # creating documents
            
            docs = process_agreement(texts)
        
            # create ids
            
            
            print(f"********Adding {len(docs)} documents to collection")
            
            documents = ["content: " + key + " \n " + str(value) for key, value in final_result.items()]
            # adding documents to collection
            self.active_collection.add(
                ids = list[docs.keys()],
                documents=documents,
            )
            
            self.logger.info(f"Added {len(docs)} documents to collection")
            
            print("********Documents added")
            
            return True
            
        except Exception as e:
            self.logger.error(f"Document addition failed: {str(e)}")
            return False

    def get_documents(
        self, 
        ids: Optional[List[str]] = None
    ) -> Optional[Dict[str, List]]:
        """
        Get documents from active collection
        
        Args:
            ids: Optional list of document IDs to retrieve
            
        Returns:
            Dictionary containing documents and metadata
        """
        if not self.active_collection:
            print("********No active collection while getting documents")
            self.logger.error("No active collection")
            return None
            
        try:
            return self.active_collection.get(ids=ids)
        except Exception as e:
            self.logger.error(f"Document retrieval failed: {str(e)}")
            return None

    def get_context(
        self, 
        query: str, 
        num_results: int = 3
    ) -> Optional[str]:
        """
        Get relevant context for a query
        
        Args:
            query: Search query
            num_results: Number of results to return
            
        Returns:
            Combined context string
        """
        
        if not self.active_collection:
            print("********No active collection while getting context")
            self.logger.error("No active collection")
            return None
            
        try:
            
            results = self.active_collection.query(
                query_texts=[query],
                n_results=num_results,
            )
            
            if not results['documents'] or not results['documents'][0]:
                return None
            
            chunks = results['documents'][0]
            metadatas = results['metadatas'][0]
            
            sorted_results = sorted(
                zip(chunks, metadatas))
            
            return "\n...\n".join(chunk for chunk, _ in sorted_results)
            
        except Exception as e:
            self.logger.error(f"Context retrieval failed: {str(e)}")
            return None

    def delete_collection(self, collection_name: str) -> bool:
        """
        Delete a collection
        
        Args:
            collection_name: Name of collection to delete
            
        Returns:
            Success status
        """
        print("*********Deleting collection")
        try:
            safe_name = self._sanitize_collection_name(collection_name)
            if not self._collection_exists(safe_name):
                self.logger.warning(f"Collection not found: {safe_name}")
                return False
                
            self.client.delete_collection(name=safe_name)
            if self.active_collection and self.active_collection.name == safe_name:
                self.active_collection = None
                
            self.logger.info(f"Deleted collection: {safe_name}")
            return True
            
        except Exception as e:
            self.logger.error(f"Collection deletion failed: {str(e)}")
            return False

    def _collection_exists(self, collection_name: str) -> bool:
        """Check if a collection exists"""
        # print("*********Checking if collection exists")
        # print(collection_name in self.client.list_collections())
        return collection_name in self.client.list_collections()

    def _sanitize_collection_name(self, name: str) -> str:
        """Sanitize collection name for database use"""
        return "".join(c if c.isalnum() else "_" for c in name)

    def _prepare_batch_metadata(
        self,
        batch_start: int,
        batch_size: int,
        token_counts: List[int],
        timestamp: str,
        total_chunks: int,
        metadatas: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, Any]]:
        """Prepare metadata for batch processing"""
        if metadatas:
            return [{
                **metadatas[batch_start//batch_size].copy(),
                'tokens': count,
                'timestamp': timestamp,
                'chunk_index': batch_start + j,
                'total_chunks': total_chunks
            } for j, count in enumerate(token_counts)]
        else:
            return [{
                'tokens': count,
                'timestamp': timestamp,
                'chunk_index': batch_start + j,
                'total_chunks': total_chunks
            } for j, count in enumerate(token_counts)]

    def cleanup(self):
        """Cleanup database resources"""
        try:
            print("Cleaning up database")
            self.active_collection = None
            self._compute_embedding.cache_clear()
            self.logger.info("Database cleanup completed")
        except Exception as e:
            self.logger.error(f"Cleanup failed: {str(e)}")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open(r"/home/eyhyd/contracts_v2/msa_28.txt", "r") as file:
    text = file.read()

In [81]:
from pathlib import Path

def create_collection_name(file_path: Path) -> str:
    # Create a collection name based on the file name and size
    collection_name = file_path.stem
    collection_name = collection_name.replace("-", "_").lower().replace(" ", "_")
    collection_name = f"{collection_name}_{file_path.stat().st_size}"
    
    return collection_name

In [18]:
vb = VectorDB()

collection_name = create_collection_name(Path(r"/home/eyhyd/contracts_v2/msa_28.txt"))

vb.delete_collection(collection_name)

vb.create_collection(collection_name)

vb.add_documents(text)




*********Deleting collection
--------------------------------------------
Here is the content extracted from the agreement in JSON format:

```
{
  "Iowa State University Information": {
    "ISU UNIT NAME AND ADDRESS": "Ames, Iowa 50011",
    "Effective Date": "date on which the last party signs this Agreement"
  },
  "Scope and Performance of Services": {
    "Customer Need for ISU Services": "Customer has expressed a need to use the services of ISU from time to time on various projects",
    "Individual Project Agreement (IPA)": {
      "Purpose": "Each IPA shall be substantially in the form of Attachment A",
      "Part of and Incorporated into this Agreement": "Each IPA shall be deemed part of and incorporated into this Agreement"
    },
    "ISU Responsibilities": {
      "Performance of Services": "ISU shall perform the services described in the IPA (the “Services”)",
      "Use of Personnel, Facilities, Equipment, Materials and Supplies": "Unless stated otherwise in the IPA, IS

ERROR:__main__:Document addition failed: Expected document to be a str, got {'Iowa State University Information': {'ISU UNIT NAME AND ADDRESS': 'Ames, Iowa 50011', 'Effective Date': 'date on which the last party signs this Agreement'}, 'Scope and Performance of Services': {'Customer Need for ISU Services': 'Customer has expressed a need to use the services of ISU from time to time on various projects', 'Individual Project Agreement (IPA)': {'Purpose': 'Each IPA shall be substantially in the form of Attachment A', 'Part of and Incorporated into this Agreement': 'Each IPA shall be deemed part of and incorporated into this Agreement'}, 'ISU Responsibilities': {'Performance of Services': 'ISU shall perform the services described in the IPA (the “Services”)', 'Use of Personnel, Facilities, Equipment, Materials and Supplies': 'Unless stated otherwise in the IPA, ISU shall provide any personnel, facilities, equipment, materials and supplies required for the Services to be completed', 'Notific

--------------------------------------------
Here is the content extracted from the agreement in JSON format:

```
{
  "general_provisions": "This Agreement may not be assigned or transferred by either party without the prior written consent of the other party, which shall not be unreasonably withheld.",
  "waiver": "+ fa waiver to subsequently enforce such term or condition",
  "invalidity": "The invalidity or illegality of one or more provisions of this Agreement shall not affect the enforceability of the remaining provisions.",
  "survival": "The parties’ rights and obligations in this Agreement that, by their nature, would continue beyond the termination of this Agreement shall survive such termination.",
  "governance_laws": "This Agreement shall be construed in accordance with the laws of the State of Iowa, without giving effect to its conflicts of law provisions",
  "jurisdiction": "any litigation or actions commenced in connection with this Agreement shall be instituted in a co

False

In [4]:
final_result = process_agreement(text)

# Print the final JSON in a nicely formatted way
print("\nFinal Processed JSON:")
print(json.dumps(final_result, indent=2))

--------------------------------------------
Here is the extracted content in JSON format:

```
{
  "effective_date": "the date on which the last party signs this Agreement ("Effective Date")",
  "scope_and_performance_of_services": {
    "customer_need": "Customer has expressed a need to use the services of ISU from time to time on various projects.",
    "individual_project_agreement": "Customer and ISU shall enter into an Individual Project Agreement (\"IPA\") for each project for which Customer desires ISU to provide services.",
    "ipa_terms": {
      "services": "ISU shall perform the services described in the IPA",
      "costs_and_timeframe": "Unless stated otherwise in the IPA, ISU shall use reasonable efforts to perform the Services requested within the projected costs and time period indicated in the IPA."
    },
    "notification_of_exceeding_costs_or_timeframe": "In the event ISU’s costs exceed the projected price or in the event the Services cannot be performed within th

NameError: name 'json' is not defined

In [9]:
import json

In [10]:
print(json.dumps(final_result, indent=2))

{
  "Introduction": "This Master Service Agreement (\u201cAgreement\u201d) is entered into by Iowa State University of Science and Technology, on behalf of its <ISU UNIT NAME AND ADDRESS>, Ames, Iowa 50011 (\u201cISU\u201d), and <CUSTOMER NAME> (\u201cCustomer\u201d), <CUSTOMER ADDRESS>. The effective date of this Agreement shall be the date on which the last party signs this Agreement (\u201cEffective Date\u201d).",
  "Scope and Performance of Services": "Customer has expressed a need to use the services of ISU from time to time on various projects. Customer and ISU shall enter into an Individual Project Agreement (\u201cIPA\u201d) for each project for which Customer desires ISU to provide services.",
  "Customer Materials": {
    "Identification in IPA": "If Customer is to furnish ISU with materials to be tested or used while performing the Services (\u201cMaterials\u201d), such Materials shall be identified in the IPA.",
    "Authorization and Disclosure": "Customer represents and w

In [13]:
type(final_result)

dict

In [14]:
for key, value in final_result.items():
    print(key, value)

Introduction This Master Service Agreement (“Agreement”) is entered into by Iowa State University of Science and Technology, on behalf of its <ISU UNIT NAME AND ADDRESS>, Ames, Iowa 50011 (“ISU”), and <CUSTOMER NAME> (“Customer”), <CUSTOMER ADDRESS>. The effective date of this Agreement shall be the date on which the last party signs this Agreement (“Effective Date”).
Scope and Performance of Services Customer has expressed a need to use the services of ISU from time to time on various projects. Customer and ISU shall enter into an Individual Project Agreement (“IPA”) for each project for which Customer desires ISU to provide services.
Customer Materials {'Identification in IPA': 'If Customer is to furnish ISU with materials to be tested or used while performing the Services (“Materials”), such Materials shall be identified in the IPA.', 'Authorization and Disclosure': 'Customer represents and warrants that it is authorized to retain ISU to perform the Services using the Materials. Cus

In [5]:
import chromadb
from chromadb.utils import embedding_functions
client = chromadb.PersistentClient(path=r"/home/eyhyd/contracts_v2/backend/chroma_db")
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
model_name="all-MiniLM-L6-v2"
)

In [7]:
collection_name = create_collection_name(Path(r"/home/eyhyd/contracts_v2/msa_28.txt"))

In [8]:
client.create_collection(name=collection_name, embedding_function=embedding_fn)

Collection(name=msa_28_13990)

In [9]:
new_collection = client.get_collection(name=collection_name)

In [10]:
final_result

{'Section 1: Obligations of ISU and Customer': 'ISU will fulfill its contractual obligations. Unless stated otherwise, the Customer assumes all risk of loss for transportation and shall be responsible for delivery costs.',
 'Section 2: Ownership of Materials': 'The Customer retains ownership of the Materials at all times.',
 'Section 3: Results': 'Customer is the sole owner of all deliverables generated as part of the Services. ISU may retain a copy to evidence fulfillment of obligations.',
 'Section 4: No Warranty': 'ISU does not make any warranties, express or implied, regarding the Services or Results. Customer should determine independently whether the Results are suitable for their use.',
 'Section 5: Indemnification': "Customer shall indemnify and hold harmless ISU, the State of Iowa, and the Board of Regents from any claims, damages, fines, losses or expenses arising from or in connection with Customer's use of the Results.",
 'Section 6: Inventions': 'If an invention is develop

In [11]:
docs = ["content: " + key + " \n " + str(value) for key, value in final_result.items()]

new_collection.add(ids=list(final_result.keys()), documents=docs)

/home/eyhyd/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:44<00:00, 1.87MiB/s]   


In [15]:
new_collection.query(
                query_texts=["Obligations"],
                n_results=3,
            )

{'ids': [['Section 1: Obligations of ISU and Customer',
   'assignmentProvision',
   'Non-Payment Obligations']],
 'embeddings': None,
 'documents': [['content: Section 1: Obligations of ISU and Customer \n ISU will fulfill its contractual obligations. Unless stated otherwise, the Customer assumes all risk of loss for transportation and shall be responsible for delivery costs.',
   'content: assignmentProvision \n This Agreement may not be assigned or transferred by either party without the prior written consent of the other party, which shall not be unreasonably withheld.',
   'content: Non-Payment Obligations \n In the event of termination, Customer shall pay ISU for Services provided and non-cancelable obligations incurred prior to the termination date and ISU shall provide Customer with Results completed as of the termination date for which payment has been received.']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[1.0396939705240094, 1.14530368

In [91]:
from backend.contract_analyzer.agents.template.extract_information import ExtractionProcessor


ext_proc = ExtractionProcessor()

In [18]:
from phi.agent import Agent

from phi.model.ollama import Ollama


ag = Agent(
    model=Ollama(
        id="llama3.1",
        config={
            "temperature": 0.9,
        },
    )
)

In [19]:
ext_proc.process_extractions(text, new_collection, ag)

Extracting Information: 66it [05:58,  5.43s/it]


In [21]:
ext_proc.export_results(format="dataframe")

Unnamed: 0,term,extracted_value,section,timestamp
0,Agreement Category,Not specified,"None found for ""State of Iowa tax refunds, lot...",2025-02-17T09:08:49.944525
1,Template used,Not specified,Not found,2025-02-17T09:09:04.586045
2,Nature of Agreement,12%,9,2025-02-17T09:09:38.684234
3,Document Type,Not specified,"Attachment A, 6.",2025-02-17T09:09:45.661644
4,Document Type Comment,Not specified,16. Counterparts; Authorization (This Agreemen...,2025-02-17T09:09:49.999241
...,...,...,...,...
61,Any ESG Or CSR Obligation,Not specified,10.1 (Payment Terms),2025-02-17T09:14:08.375686
62,Change of Control Provision,"Not specified (the same as above, only this on...",6.8,2025-02-17T09:14:12.596043
63,Governing Law,Three years after the Effective Date (12.1),12.1,2025-02-17T09:14:18.270646
64,Dispute resolution,Not specified,Not found,2025-02-17T09:14:22.081606


In [53]:
new_collection.query(
    query_texts =["Governing Law"],
    n_results = 3
)

example = new_collection.query(
    query_texts =["Governing Law"],
    n_results = 3
)

In [99]:
new_collection.query(
    query_texts =["Agreement Category"],
    n_results = 3
)


{'ids': [['Section 5', 'agreement', 'Section 3']],
 'embeddings': None,
 'documents': [['content: Section 5 \n This Agreement may be executed in counterparts and by facsimile.',
   'content: agreement \n The parties agree to consult and reasonably cooperate with each other regarding the timing, manner, and contents of any disclosure.',
   'content: Section 3 \n This Agreement constitutes the entire understanding between the parties relative to the protection of information which may be exchanged pursuant to this Agreement in connection with the potential transaction contemplated hereunder and supersedes any and all prior agreements or understandings between the parties regarding such subject matter. The term of this Agreement (except where otherwise stated as a shorter term) shall be from the date of this Agreement until July\xa031, 2017.']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[0.9488477655916734, 1.0521416548518063, 1.0867126468764563]],
 

In [50]:
import pandas as pd

terms_df = pd.read_excel(r"/home/eyhyd/contracts_v2/Rules/Contract Abstraction Fields.xlsx", header=None)

terms_df.rename(columns={0: "Terms"}, inplace=True)

list(terms_df["Terms"])


['Contract Name',
 'Agreement Type',
 'Country of agreement',
 'Contract Details',
 ' Entity Name',
 'Counterparty Name',
 'Summary',
 'Department of Contract Owner',
 'SPOC',
 'Agreement Group',
 'Family Agreement',
 'Family Documents Present',
 'Family Hierarchy',
 'Scanned',
 ' Signature by:',
 'Effective Date',
 'Contract Start Date',
 'Contract Duration',
 'Contract End Date',
 'Contingent Contract',
 'Perpetual Contract',
 'SLA',
 'Stamping Date',
 'Franking Date',
 'Franking Date_Availablity',
 'Governing Law',
 'Dispute Resolution',
 'Place of Courts',
 'Court Jurisdiction',
 'Place of Arbitration',
 'Arbitration Institution',
 'Number of Arbitrators',
 'Seat of Arbitration',
 'Venue of Arbitration',
 'Legal Action Rights with counterparty',
 'Counterparty - liability cap',
 ' - liability cap',
 'Counterparty - liability limitation summary',
 ' - liability limitation summary',
 'Indemnification',
 'Indemnification Summary',
 'Counterparty - liquidated damages',
 ' - liquidated 

In [70]:
from ollama import chat

terms = list(terms_df["Terms"])

for term in terms[:5]:

    text = new_collection.query(query_texts=[term], n_results=3)

    response = chat(
        model="llama3.1",
        messages=[
            {
                "role": "user",
                "content": f"""From the following text {text} \n Extract the exact {term} and the section it is found. 
                Note that ids contains the list of Section from where the text belongs.
            
                
        Requirements:
        - Extract the specific value
        - Include the exact section/paragraph where the value was found 
        - Return "Not specified" if not found
        - No explanations or analysis
        
        Format: 
        Value: <extracted_value>
        Section: <relevant_section>
        
        Example Output:
        Governing Law: laws of the State of Iowa,
        Section: governingLaw
                
                """,
            }
        ],
    )

    print(f"_______________{term}_____________")
    print(response["message"]["content"])

_______________Contract Name_____________
Here are the extracted values with their relevant sections:

1. Master Service Agreement
Section: individualProjectAgreement

2. Confidential Information
Section: return_request

3. Confidential Information
Section: retention

4. Confidential Information
Section: ownership

5. Confidential Information
Section: survival

6. 
Section: Not specified

7. 
Section: Not specified

8. 
Section: Not specified
_______________Agreement Type_____________
Here is the extracted information:

Value: Miscellaneous
Section: Miscellaneous

Value: counterpartsAndAuthorization
Section: counterpartsAndAuthorization

Value: Confidentiality
Section: Confidentiality
_______________Country of agreement_____________
Value: State of Iowa
Section: governingLaw
_______________Contract Details_____________
Here is the extracted data:

1. Return or Destruction; Ownership; Survival
Section: Return or Destruction; Ownership; Survival

2. Upon the provider’s written request, t

In [77]:
from docx import Document
from pathlib import Path
from typing import Dict , Any


class StructuredProcessor():
    SUPPORTED_FORMATS = {
        '.json': 'json',
        '.xml': 'xml',
        '.xlsx': 'excel',
        '.xls': 'excel',
        '.md': 'markdown',
        '.txt': 'text',
        '.docx': 'docx',  # Added support for .docx
        '.doc': 'docx'  # Added support for .doc    
    }
    
    
    
    def _process_docx(self, file_path: Path) -> Dict[str, Any]:
        """Process DOCX files."""
        doc = Document(file_path)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        word_count = sum(len(para.split()) for para in paragraphs)
        
        return {
            'content': '\n'.join(paragraphs),
            'metadata': {
                'format': 'docx',
                'size': file_path.stat().st_size,
                'paragraphs': len(paragraphs),
                'word_count': word_count
            }
        }


sp = StructuredProcessor()

res = sp._process_docx(Path(r"/home/eyhyd/contracts_v2/Sample Agreements/Confidentiality and Non-Disclosure Agreement.docx"))

In [82]:
collection_name = create_collection_name(Path(r"/home/eyhyd/contracts_v2/Sample Agreements/Confidentiality and Non-Disclosure Agreement.docx"))

In [83]:
collection_name

'confidentiality_and_non_disclosure_agreement_11627'

In [84]:
new_collection = client.create_collection(name=collection_name, embedding_function=embedding_fn)

new_collection

Collection(name=confidentiality_and_non_disclosure_agreement_11627)

In [85]:
text = res["content"]

In [86]:
import json

final_result = process_agreement(text)

# Print the final JSON in a nicely formatted way
print("\nFinal Processed JSON:")
print(json.dumps(final_result, indent=2))

--------------------------------------------
Here is the extracted content in JSON format:

```
{
    "AGREEMENT": "This Agreement is between The Pep Boys – Manny, Moe & Jack (on its own behalf and on behalf of its subsidiaries and affiliates, “Pep Boys”) and Icahn Enterprises L.P. (on its own behalf and on behalf of its subsidiaries and controlled affiliates, “Interested Party”), dated December 8, 2015.",
    "WHEREAS1": "On December 7, 2015, Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash (as set forth in Pep Boys filing with the Securities and Exchange Commission dated December 7, 2015), which Pep Boys Board of Directors determined on December 7, 2015, in accordance with its obligations under that certain Agreement and Plan of Merger, dated as of October 26, 2015 (the “Bridgestone Agreement”), by and among Pep Boys, Bridgestone Retail Operations, LLC, a Delaware limited liability company (“Parent”), TAJ Acquisition Co., a Pennsylvania corporatio

In [87]:
docs = ["content: " + key + " \n " + str(value) for key, value in final_result.items()]

new_collection.add(ids=list(final_result.keys()), documents=docs)

In [98]:
new_collection.query(
    query_texts =["Nature of Agreement"],
    n_results = 3
)


{'ids': [['agreement', 'Section 5', 'Section 3']],
 'embeddings': None,
 'documents': [['content: agreement \n The parties agree to consult and reasonably cooperate with each other regarding the timing, manner, and contents of any disclosure.',
   'content: Section 5 \n This Agreement may be executed in counterparts and by facsimile.',
   'content: Section 3 \n This Agreement constitutes the entire understanding between the parties relative to the protection of information which may be exchanged pursuant to this Agreement in connection with the potential transaction contemplated hereunder and supersedes any and all prior agreements or understandings between the parties regarding such subject matter. The term of this Agreement (except where otherwise stated as a shorter term) shall be from the date of this Agreement until July\xa031, 2017.']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[0.8497455440089557, 1.0974264072475823, 1.0987801472013827]],
 

In [90]:
with open(r"confidentiality_and_non_disclosure_agreement_11627.txt", "w") as f:
    f.write(text)

In [92]:
from backend.contract_analyzer.agents.template.extract_information import ExtractionProcessor

ext_proc = ExtractionProcessor()

ext_proc.process_extractions(text, new_collection, ag)

Extracting Information: 66it [03:27,  3.14s/it]


In [97]:
ext_proc.export_results(format="dataframe").to_excel("confidentiality_and_non_disclosure_agreement_11627.xlsx")

In [100]:
text = new_collection.query(
    query_texts =["Nature of Agreement"],
    n_results = 3
)

term = "Nature of Agreement"


response = chat(
    model="llama3.1",
    messages=[
        {
            "role": "user",
            "content": f"""From the following text {text} \n Extract the exact {term} and the section it is found. 
            Note that ids contains the list of Section from where the text belongs.
        
            
    Requirements:
    - Extract the specific value
    - Include the exact section/paragraph where the value was found 
    - Return "Not specified" if not found
    - No explanations or analysis
    
    Format: 
    Value: <extracted_value>
    Section: <relevant_section>
    
    Example Output:
    Governing Law: laws of the State of Iowa,
    Section: governingLaw
            
            """,
        }
    ],
)

response["message"]["content"]

'Here is the extracted value and section:\n\nValue: agreement\nSection: Section 5 \n\nNote that there are multiple agreements mentioned in different sections, but based on your requirements, I have provided the first occurrence. If you want all occurrences, please let me know!'

In [101]:
text = res["content"]

agreement_categories = [
    "Non Disclosure Agreement",
    "Consulting Agreement",
    "Service Agreement",
    "Employment Agreement",
    "Sales Agreement",
    "Lease Agreement",
    "Partnership Agreement",
]

response = chat(
    model="llama3.1",
    messages=[
        {
            "role": "user",
            "content": f"""From the following text {text[:4000]} 
            
            Agreement Categories: {agreement_categories}
            
            Extract the Agreement Category from the text.
            
            
    Requirements:
    - Extract the specific value
    - Include the exact section/paragraph where the value was found 
    - Return "Not specified" if not found
    - No explanations or analysis
    
    Format: 
    Value: <extracted_value>
    Section: <relevant_section>
    
    Example Output:
    Governing Law: laws of the State of Iowa,
    Section: governingLaw
            
            """,
        }
    ],
)

response["message"]["content"]

'Value: Non Disclosure Agreement\nSection: Agreement Categories'

In [102]:
new_collection.query(
    query_texts =["Contract Name"],
    n_results = 3
)

{'ids': [['third_party_consents', 'NOWTHEREFORE', 'restrictive_period']],
 'embeddings': None,
 'documents': [["content: third_party_consents \n {'discussions_and_negotiations': 'Pep Boys consents to the Interested Party having discussions and negotiations with third-parties related to securities and/or assets of Pep Boys.', 'previous_confidential_info': 'The third-party must have previously received Confidential Information from Pep Boys in connection with its strategic alternatives review process.'}",
   'content: NOWTHEREFORE \n For and in consideration of the promises and mutual obligations contained herein, the parties, intending to be legally bound, hereby agree as follows:',
   'content: restrictive_period \n From the date of this Agreement until January 31, 2017']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[1.2996260319501085, 1.3464318948451457, 1.3529352236974337]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.docu

In [107]:
text = res["content"]

contract_name = new_collection.query(
    query_texts =["Contract Details"],
    n_results = 3
)

response = chat(
    model="llama3.1",
    messages=[
        {
            "role": "user",
            "content": f"""From the following text  {contract_name} 
            
            Extract the Contract Details.
            
            
    Requirements:
    - Extract the specific value
    - Include the exact section/paragraph where the value was found 
    - Return "Not specified" if not found
    - No explanations or analysis
    
    Format: 
    Value: <extracted_value>
    Section: <relevant_section>
    
    Example Output:
    Governing Law: laws of the State of Iowa,
    Section: governingLaw
            
            """,
        }
    ],
)

response["message"]["content"]

'Here is the extracted Contract Details:\n\n1. Confidential Information includes all information which is marked ‘confidential’ or ‘proprietary’ and/or which the Receiving Party knows or, under all of the circumstances, should reasonably have known should be treated as confidential and any analysis, reports or other information prepared or compiled by the Receiving Party which is based upon or derived from the information provided by the Disclosing Party.\n   Section: section1\n\n2. Confidential Information also includes the fact that the parties are considering a potential transaction and are engaging in discussions with respect thereto and any of the terms, conditions, or other facts with respect to any potential transaction, including the status thereof (the Confidential Information described in this sentence collectively, the “Transaction Information”).\n   Section: section1\n\n3. Neither this Agreement nor the disclosure any Confidential Information shall (a) constitute or imply a

In [None]:
text = res["content"]

contract_name = new_collection.query(
    query_texts =["CScanned"],
    n_results = 3
)

response = chat(
    model="llama3.1",
    messages=[
        {
            "role": "user",
            "content": f"""From the following text  {text[:4000]} 
            
            Extract the Counterparty Name Only.
            
            
    Requirements:
    - Extract the specific value
    - Include the exact section/paragraph where the value was found 
    - Return "Not specified" if not found
    - No explanations or analysis
    
    Format: 
    Value: <extracted_value>
    Section: <relevant_section>
    
    Example Output:
    Governing Law: laws of the State of Iowa,
    Section: governingLaw
            
            """,
        }
    ],
)

response["message"]["content"]

'Here is the extracted counterparty name:\n\n Value: Icahn Enterprises L.P.\n Section: WHEREAS, on December 7, 2015, Interested Party provided a proposal to acquire Pep Boys for $15.50 per share in cash (as set forth in Pep Boys filing with the Securities and Exchange Commission dated December 7, 2015),'

In [114]:
new_collection.query(
    query_texts =["Scanned"],
    n_results = 3,
)

{'ids': [['section4', 'third_party_consents', 'no_solicitation']],
 'embeddings': None,
 'documents': [['content: section4 \n The Receiving Party agrees that upon the written request of the Disclosing Party, it will return to the Disclosing Party (or destroy and certify such destruction to the Disclosing Party) any and all written, electronic or tangible materials (including all copies) of Confidential Information in the Receiving Party’s possession. Notwithstanding the foregoing, the Receiving Party shall be permitted to retain (a) one copy of the Confidential Information in order to comply with any applicable law, court, regulation or regulatory authority or to comply with existing internal document retention policies and (b) electronic copies of the Confidential Information created pursuant to standard archival or back-up procedures.',
   "content: third_party_consents \n {'discussions_and_negotiations': 'Pep Boys consents to the Interested Party having discussions and negotiations 

In [115]:
terms = [
    "Contract Name: The formal title or identifier given to the agreement document.",
    "Agreement Type: The category or classification of the contract (e.g., service, lease, employment).",
    "Country of Agreement: The nation where the contract is executed and/or governed.",
    "Entity Name: The legal name of the primary organization entering into the agreement.",
    "Counterparty Name: The legal name of the other party entering into the agreement.",
    "Summary: Brief overview of the contract's main purpose and terms.",
    "Department of Contract Owner: The specific division or unit responsible for managing the contract.",
    "SPOC: Single Point of Contact - the designated person responsible for contract-related communications.",
    "Agreement Group: The category or family of agreements this contract belongs to.",
    "Family Agreement: Indicates if the contract is part of a larger group of related agreements.",
    "Family Documents Present: Lists whether related agreement documents exist.",
    "Family Hierarchy: The structure showing how this contract relates to other family agreements.",
    "Scanned: Indicates whether the document has been digitized.",
    "Signature by: Names of authorized individuals who signed the contract.",
    "Effective Date: The date when the contract becomes legally binding.",
    "Contract Start Date: The date when contract performance begins.",
    "Contract Duration: The length of time the contract remains valid.",
    "Contract End Date: The date when the contract terminates.",
    "Contingent Contract: Agreement whose execution depends on certain conditions being met.",
    "Perpetual Contract: Agreement with no specified end date.",
    "SLA: Service Level Agreement - defines expected performance standards.",
    "Stamping Date: Date when the contract was officially stamped.",
    "Franking Date: Date when the contract was franked (payment of stamp duty).",
    "Franking Date_Availability: Indicates if franking date information is present.",
    "Governing Law: The legal system that applies to the contract.",
    "Dispute Resolution: Method agreed upon to resolve disagreements.",
    "Place of Courts: Location of courts having jurisdiction over disputes.",
    "Court Jurisdiction: The legal authority of courts over contract matters.",
    "Place of Arbitration: Physical location where arbitration will be conducted.",
    "Arbitration Institution: Organization managing the arbitration process.",
    "Number of Arbitrators: Quantity of arbitrators to be appointed.",
    "Seat of Arbitration: Legal jurisdiction for arbitration proceedings.",
    "Venue of Arbitration: Physical location where arbitration hearings take place.",
    "Legal Action Rights: Rights of parties to pursue legal remedies.",
    "Liability Cap: Maximum financial responsibility for damages.",
    "Liability Limitation Summary: Overview of restrictions on legal responsibility.",
    "Indemnification: Obligation to compensate for losses or damages.",
    "Liquidated Damages: Predetermined compensation for specific breaches.",
    "Damages Summary: Overview of compensation for contract breaches.",
    "Penalties: Consequences for contract violations.",
    "Penal Interest Rate: Additional charges for late payments.",
    "Assignment Rights: Authority to transfer contract rights to others.",
    "Termination for Convenience: Right to end contract without cause.",
    "Notice Days: Required advance notification period for termination.",
    "Lock-in Period: Duration parties must maintain the agreement.",
    "Change of Control Provision: Rules regarding ownership changes.",
    "Auto-renewal Provision: Terms for automatic contract extension.",
    "Acceleration Clause: Conditions triggering immediate payment/performance.",
    "Exclusivity Provision: Rights to exclusive business relationship.",
    "Audit Rights: Authority to examine records and operations.",
    "Intellectual Property Rights: Ownership of patents, copyrights, trademarks.",
    "ABAC/FCPA Provision: Anti-bribery and corruption compliance terms.",
    "Payment Terms: Schedule and conditions for financial transactions.",
    "Security Deposit: Funds held as performance guarantee.",
    "Revenue Share: Division of income between parties.",
    "Commission Percentage: Agreed rate for sales commission.",
    "Minimum Guarantee: Lowest assured payment amount.",
    "Variable Fee: Charges that vary based on conditions.",
    "Confidentiality: Requirements for protecting sensitive information.",
    "Data Privacy: Rules for handling personal information.",
    "Insurance Coverage: Required insurance policies and amounts.",
    "Subcontracting Rights: Authority to delegate work to third parties.",
    "Defect Liability Period: Duration of responsibility for defects.",
    "Performance Guarantee: Assurance of satisfactory execution.",
    "Force Majeure: Unforeseeable circumstances preventing fulfillment.",
    "Non-Compete: Restrictions on competitive activities.",
    "Non-Solicitation: Prohibition on hiring other party's employees.",
    "Waiver: Voluntary relinquishment of rights.",
    "Severability: Survival of remaining terms if others are invalid.",
    "Survival: Contract provisions that continue after termination.",
    "Review Comments: Notes from contract review process.",
]

In [116]:
for i, term in enumerate(terms):
    print(i,".", term)

0 . Contract Name: The formal title or identifier given to the agreement document.
1 . Agreement Type: The category or classification of the contract (e.g., service, lease, employment).
2 . Country of Agreement: The nation where the contract is executed and/or governed.
3 . Entity Name: The legal name of the primary organization entering into the agreement.
4 . Counterparty Name: The legal name of the other party entering into the agreement.
5 . Summary: Brief overview of the contract's main purpose and terms.
6 . Department of Contract Owner: The specific division or unit responsible for managing the contract.
7 . SPOC: Single Point of Contact - the designated person responsible for contract-related communications.
8 . Agreement Group: The category or family of agreements this contract belongs to.
9 . Family Agreement: Indicates if the contract is part of a larger group of related agreements.
10 . Family Documents Present: Lists whether related agreement documents exist.
11 . Family H

In [117]:
terms = ["Contract Name",
"Agreement Type",
"Country of agreement",
"Contract Details",
" Entity Name",
"Counterparty Name",
"Summary",
"Department of Contract Owner",
"SPOC",
"Agreement Group",
"Family Agreement",
"Family Documents Present",
"Family Hierarchy",
"Scanned",
" Signature by:",
"Effective Date",
"Contract Start Date",
"Contract Duration",
"Contract End Date",
"Contingent Contract",
"Perpetual Contract",
"SLA",
"Stamping Date",
"Franking Date",
"Franking Date_Availablity",
"Governing Law",
"Dispute Resolution",
"Place of Courts",
"Court Jurisdiction",
"Place of Arbitration",
"Arbitration Institution",
"Number of Arbitrators",
"Seat of Arbitration",
"Venue of Arbitration",
"Legal Action Rights with counterparty",
"Counterparty - liability cap",
" - liability cap",
"Counterparty - liability limitation summary",
" - liability limitation summary",
"Indemnification",
"Indemnification Summary",
"Counterparty - liquidated damages",
" - liquidated damages",
"Counterparty - damages summary",
" - damages summary",
"Penalties",
"Penal interest rate and other late payment charges",
" assignment rights",
" - assignment summary",
"Counterparty assignment rights",
"Counterparty - assignment summary",
"Can  terminate for Convenience?",
"If yes, number of notice days?",
"Can Counterparty terminate for Convenience?",
"Counterparty - If yes, number of notice days?",
" - termination summary",
"Counterparty - termination summary",
"Provision for lock-in period",
"Period of lock in.",
"Lock-in summary",
" - Change of Control Provision",
"Counterparty  - Change of Control Provision",
"Auto-renewal provision",
"Notice period (in days) to stop auto renewal",
"Renewal Option Notice Start Date",
"Renewal Option Notice End Date",
"Auto-renewal provision summary",
"Acceleration clause applicable to ",
"Acceleration clause applicable to Counterparty",
"Acceleration clause - summary",
"Exclusivity provision",
"Scope",
"Territory",
"Carve-outs",
"Exclusivity Period (Start Date)",
"Exclusivity Period (End Date)",
"Available to ",
"Available to Counterparty",
"Audit Rights - Summary",
"Copyright",
"Patent",
"Trademark",
"Other",
"ABAC/FCPA provision",
"ABAC/FCPA provision - summary",
"Receive or Pay",
"Currency",
"Total Contract Value",
"Fixed Fee",
"Security Deposit / Bank Guarantee",
"Fuel surcharges",
"Advance payment period",
"Advance payment Amount",
"Term for Refund of Security Deposit",
"Incentive",
"Revenue Share",
"Commission Percentage",
"Minimum Guarantee",
"Variable Fee",
"Fee-Other",
"Payment Type",
"Payment Schedule (in days)",
"Payment Terms / Details",
"Milestones",
"Payment to Affiliates / Agency",
"Fee Escalation",
"Stamp Duty Share",
"Confidentiality",
"Residual Confidentiality",
"Exceptions to confidentiality",
"Term (In months)",
"Data Privacy Provision",
"Data Privacy Summary",
"Insurance coverage for ",
"Insurance coverage for Counterparty",
"Subcontracting rights for the  Counterpart",
"Defect liability period",
"Performance Guarantee",
"Conflicts of Interests",
"Force Majeure",
"Insurance coverage",
"Representation and Warranties",
"Non-Compete",
"Non-Solicitation",
"Waiver",
"Severability",
"Survival",
"Handwritten Comments",
"Missing Pages",
"Missing Signatures",
"Review Comments (if any)",]


In [119]:
for i, term in enumerate(terms):
    print(i+1,".", term)

1 . Contract Name
2 . Agreement Type
3 . Country of agreement
4 . Contract Details
5 .  Entity Name
6 . Counterparty Name
7 . Summary
8 . Department of Contract Owner
9 . SPOC
10 . Agreement Group
11 . Family Agreement
12 . Family Documents Present
13 . Family Hierarchy
14 . Scanned
15 .  Signature by:
16 . Effective Date
17 . Contract Start Date
18 . Contract Duration
19 . Contract End Date
20 . Contingent Contract
21 . Perpetual Contract
22 . SLA
23 . Stamping Date
24 . Franking Date
25 . Franking Date_Availablity
26 . Governing Law
27 . Dispute Resolution
28 . Place of Courts
29 . Court Jurisdiction
30 . Place of Arbitration
31 . Arbitration Institution
32 . Number of Arbitrators
33 . Seat of Arbitration
34 . Venue of Arbitration
35 . Legal Action Rights with counterparty
36 . Counterparty - liability cap
37 .  - liability cap
38 . Counterparty - liability limitation summary
39 .  - liability limitation summary
40 . Indemnification
41 . Indemnification Summary
42 . Counterparty - li

In [120]:
extraction_types = [
            "Contract Name",
            "Agreement Type",
            "Country of agreement",
            "Contract Details",
            " Entity Name",
            "Counterparty Name",
            "Summary",
            "Department of Contract Owner",
            "SPOC",
            "Agreement Group",
            "Family Agreement",
            "Family Documents Present",
            "Family Hierarchy",
            "Scanned",
            " Signature by:",
            "Effective Date",
            "Contract Start Date",
            "Contract Duration",
            "Contract End Date",
            "Contingent Contract",
            "Perpetual Contract",
            "SLA",
            "Stamping Date",
            "Franking Date",
            "Franking Date_Availablity",
            "Governing Law",
            "Dispute Resolution",
            "Place of Courts",
            "Court Jurisdiction",
            "Place of Arbitration",
            "Arbitration Institution",
            "Number of Arbitrators",
            "Seat of Arbitration",
            "Venue of Arbitration",
            "Legal Action Rights with counterparty",
            "Counterparty - liability cap",
            "Counterparty - liability limitation summary",
            "Indemnification",
            "Indemnification Summary",
            "Counterparty - liquidated damages",
            "Counterparty - damages summary",
            "Penalties",
            "Penal interest rate and other late payment charges",
            " assignment rights",
            "Counterparty assignment rights",
            "Counterparty - assignment summary",
            "Can  terminate for Convenience?",
            "If yes, number of notice days?",
            "Can Counterparty terminate for Convenience?",
            "Counterparty - If yes, number of notice days?",
            "Counterparty - termination summary",
            "Provision for lock-in period",
            "Period of lock in.",
            "Lock-in summary",
            "Counterparty  - Change of Control Provision",
            "Auto-renewal provision",
            "Notice period (in days) to stop auto renewal",
            "Renewal Option Notice Start Date",
            "Renewal Option Notice End Date",
            "Auto-renewal provision summary",
            "Acceleration clause applicable to ",
            "Acceleration clause applicable to Counterparty",
            "Acceleration clause - summary",
            "Exclusivity provision",
            "Scope",
            "Territory",
            "Carve-outs",
            "Exclusivity Period (Start Date)",
            "Exclusivity Period (End Date)",
            "Available to ",
            "Available to Counterparty",
            "Audit Rights - Summary",
            "Copyright",
            "Patent",
            "Trademark",
            "Other",
            "ABAC/FCPA provision",
            "ABAC/FCPA provision - summary",
            "Receive or Pay",
            "Currency",
            "Total Contract Value",
            "Fixed Fee",
            "Security Deposit / Bank Guarantee",
            "Fuel surcharges",
            "Advance payment period",
            "Advance payment Amount",
            "Term for Refund of Security Deposit",
            "Incentive",
            "Revenue Share",
            "Commission Percentage",
            "Minimum Guarantee",
            "Variable Fee",
            "Fee-Other",
            "Payment Type",
            "Payment Schedule (in days)",
            "Payment Terms / Details",
            "Milestones",
            "Payment to Affiliates / Agency",
            "Fee Escalation",
            "Stamp Duty Share",
            "Confidentiality",
            "Residual Confidentiality",
            "Exceptions to confidentiality",
            "Term (In months)",
            "Data Privacy Provision",
            "Data Privacy Summary",
            "Insurance coverage for ",
            "Insurance coverage for Counterparty",
            "Subcontracting rights for the  Counterpart",
            "Defect liability period",
            "Performance Guarantee",
            "Conflicts of Interests",
            "Force Majeure",
            "Insurance coverage",
            "Representation and Warranties",
            "Non-Compete",
            "Non-Solicitation",
            "Waiver",
            "Severability",
            "Survival",
            "Handwritten Comments",
            "Missing Pages",
            "Missing Signatures",
            "Review Comments (if any)",
        ]

In [123]:
text = res["content"]

In [124]:
from phi.agent import Agent

from phi.model.ollama import Ollama


ag = Agent(
    model=Ollama(
        id="llama3.3",
        config={
            "temperature": 0.9,
        },
    )
)

In [131]:
prompt = f"""From the following text {text} 


Extract the following fields from the OCR content of the contract document:
Do not Hallucinate the data. If the data is not present in the OCR content, do not make any assumptions.

maintain the sequence of the fields as per the contract
just return the values which are present


{extraction_types[:10]}

Note  return in a json format with the field name and the extracted value only if the value is present in the OCR content
"""

In [132]:
response = ag.run(prompt)

In [133]:
print(response.content)

Here are the extracted fields in JSON format:

```
{
  "Contract Name": "CONFIDENTIALITY AND NON-DISCLOSURE AGREEMENT",
  "Entity Name": "Pep Boys",
  "Counterparty Name": "Icahn Enterprises L.P."
}
```


In [130]:
extraction_types[:10]

['Contract Name',
 'Agreement Type',
 'Country of agreement',
 'Contract Details',
 ' Entity Name',
 'Counterparty Name',
 'Summary',
 'Department of Contract Owner',
 'SPOC',
 'Agreement Group']