In [2]:
from docx import Document
from pathlib import Path
from typing import Dict , Any


class StructuredProcessor():
    """Extract text and basic metadata from document files.

    Only the DOCX handler is defined in this cell.
    """

    # Maps a file suffix to the name of the format it is handled as.
    SUPPORTED_FORMATS = {
        '.json': 'json',
        '.xml': 'xml',
        '.xlsx': 'excel',
        '.xls': 'excel',
        '.md': 'markdown',
        '.txt': 'text',
        '.docx': 'docx',
        # NOTE(review): '.doc' is routed to the DOCX handler below; python-docx
        # presumably cannot open legacy binary .doc files - confirm before relying on it.
        '.doc': 'docx'
    }

    def _process_docx(self, file_path: Path) -> Dict[str, Any]:
        """Read a DOCX file and return its text together with simple statistics.

        Args:
            file_path: Location of the document. A ``pathlib.Path`` or a plain
                string is accepted; strings are converted to ``Path``.

        Returns:
            A dict with two keys:
              * ``'content'``: the non-blank paragraphs from ``doc.paragraphs``
                joined with newlines (text stored elsewhere in the document,
                e.g. in tables, is presumably not included - confirm).
              * ``'metadata'``: ``format``, file ``size`` in bytes, number of
                non-blank ``paragraphs`` and whitespace-separated ``word_count``.
        """
        # Coerce once so callers passing a string do not fail on .stat() below.
        file_path = Path(file_path)

        # Hand python-docx a plain string path, which it documents as supported.
        doc = Document(str(file_path))

        # Whitespace-only paragraphs are dropped from both the text and the counts.
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        word_count = sum(len(para.split()) for para in paragraphs)

        return {
            'content': '\n'.join(paragraphs),
            'metadata': {
                'format': 'docx',
                'size': file_path.stat().st_size,
                'paragraphs': len(paragraphs),
                'word_count': word_count
            }
        }


In [12]:
# NOTE: absolute, machine-specific path - adjust it to wherever the sample contracts live.
path = r"/home/ajay/contracts_v2/data/master-service-agreement-template-28.docx"
sp = StructuredProcessor()
res = sp._process_docx(Path(path))

# The extracted text contains non-ASCII characters (e.g. curly quotes), so the
# encoding is pinned instead of relying on the platform default.
with open("msa_28.txt", "w", encoding="utf-8") as f:
    f.write(res['content'])

# Read the file back so `text` holds exactly what was saved.
with open("msa_28.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Extract the text of sample contract template 25.
# NOTE: absolute, machine-specific path - adjust it to wherever the sample contracts live.
path = r"/home/ajay/contracts_v2/data/master-service-agreement-template-25.docx"

struct = StructuredProcessor()

# `res` is reused by the following cells (display, save, heading extraction).
res = struct._process_docx(Path(path))

In [4]:
# Show the extracted text (the whole string is displayed as a repr).
res['content']

'MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND \nMAINTENANCE OF SOLAR SYSTEMS \nTHIS MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF A SYSTEM (“Agreement”) is made the ____ day of ________________ 20___ (the “Effective Date”)\nBY AND BETWEEN: \nThe Government of Bermuda as described in Schedule 1, hereinafter referred to as the “Government”, “we”, “our” or “us”, of the first part; and\nThe supplier of service under this Agreement, whose name and contact details are set out in Schedule 1 to this Agreement and is hereinafter referred to as “Supplier” or “you”, of the second part.  \nThe Government and the Supplier are individually referred to as a “party” and collectively as the “parties”.\nThis Agreement which consists of the General Terms and Conditions, Schedule 1 and Appendix 1 to 6, sets out the terms and conditions upon which the Supplier will provide service to the Government.\nGENERAL TERMS AND CONDITIONS\nIN CONSIDERATION of the premises and mutual p

In [5]:
# Save the extracted text; the encoding is pinned because the contract
# contains non-ASCII characters (e.g. curly quotes).
with open("msa_25.txt", "w", encoding="utf-8") as f:
    f.write(res['content'])

In [6]:
def extract_capital_lines(text):
    """
    Collect the all-capital lines of ``text``, merging consecutive ones.

    A line counts as "capital" when at least one of its whitespace-separated
    words is upper-case and every other word either is upper-case too or
    contains no letters (numbers, punctuation). Lines made up only of
    digits/punctuation are therefore NOT capital lines. Consecutive capital
    lines are joined with a single space into one phrase; a blank line or a
    non-capital line ends the current phrase.

    Args:
        text (str): Input text to process

    Returns:
        list: Capital phrases in document order
    """
    def _is_capital_line(line):
        """True if the line has an upper-case word and no mixed/lower-case word."""
        saw_upper_word = False
        for word in line.split():
            if word.isupper():
                saw_upper_word = True
            elif any(ch.isalpha() for ch in word):
                # A word with letters that is not fully upper-case disqualifies the line.
                return False
        return saw_upper_word

    capital_phrases = []
    current_phrase = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        if line and _is_capital_line(line):
            current_phrase.append(line)
        elif current_phrase:
            # Blank or non-capital line: close the phrase collected so far.
            capital_phrases.append(' '.join(current_phrase))
            current_phrase = []

    # Text may end while a phrase is still open.
    if current_phrase:
        capital_phrases.append(' '.join(current_phrase))

    return capital_phrases

# Example usage: list the all-capital phrases found in the contract text
# held in `res` (set by an earlier cell), one quoted phrase per line.
text = res['content']

result = extract_capital_lines(text)
for phrase in result:
    print(f'"{phrase}"')

"MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF SOLAR SYSTEMS"
"BY AND BETWEEN:"
"GENERAL TERMS AND CONDITIONS"
"SCHEDULE 1"
"APPENDIX 1 DESIGN AND INSTALLATION OF THE SYSTEM"
"SCOPE OF WORKS FOR ROOFTOP PV SOLAR SYSTEM DESIGN, INSTALLATION AND MAINTENANCE AT THE FOLLOWING VARIOUS BUILDINGS:"
"SECTION I SCOPE OF SERVICES"
"APPENDIX 2 SUPPLY OF GOODS AND SERVICE FOR THE SYSTEM"
"APPENDIX 3 ACCEPTANCE TEST AND ACCEPTANCE CERTIFICATE"
"ACCEPTANCE CERTIFICATE"
"APPENDIX 4"
"APPENDIX 5 MAINTENANCE AND SUPPORT"
"SCOPE OF MAINTENANCE AND SUPPORT SERVICE"


In [7]:
def extract_document_structure(text):
    """
    Extract document structure into a dictionary of sections and subsections.

    Heuristics used on each stripped, non-empty line:
      * main heading: a single word starting with an upper-case letter, not
        ending in '.' and containing no ':'
      * subheading: starts with an upper-case letter and contains ':'; the
        part before the first ':' is the subsection name, the rest (if any)
        is the start of its content
      * any other line is appended to the current subsection's content and is
        ignored when no section/subsection is open

    A heading that occurs more than once keeps the subsections collected
    earlier; later subsections are added to the same section.

    Note: this notebook defines ``extract_document_structure`` in more than
    one cell; whichever definition was executed last is the one in effect.

    Args:
        text (str): Input text to process

    Returns:
        dict: ``{section: {subsection: content}}``
    """
    structure = {}
    current_section = None
    current_subsection = None
    current_content = []

    def is_main_heading(line):
        """Check if line is a main heading (single capitalised word, no '.' end, no ':')."""
        stripped = line.strip()
        return bool(stripped
                    and stripped[0].isupper()
                    and not stripped.endswith('.')
                    and ':' not in line
                    and len(stripped.split()) == 1)

    def is_subheading(line):
        """Check if line is a subheading (capitalised start and a ':' somewhere)."""
        stripped = line.strip()
        return bool(stripped and stripped[0].isupper() and ':' in line)

    def save_pending():
        """Store the content gathered for the open subsection, if there is any."""
        if current_section and current_subsection and current_content:
            structure.setdefault(current_section, {})[current_subsection] = ' '.join(current_content)

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if is_main_heading(line):
            save_pending()
            current_section = line
            current_subsection = None
            current_content = []
            # setdefault (not assignment) so a repeated heading does not wipe
            # the subsections already stored under it.
            structure.setdefault(current_section, {})

        elif is_subheading(line):
            save_pending()
            name, _, remainder = line.partition(':')
            current_subsection = name.strip()
            remainder = remainder.strip()
            current_content = [remainder] if remainder else []

        elif current_section and current_subsection:
            current_content.append(line)

    # Save whatever was still being collected when the text ended.
    save_pending()

    return structure

# Build the section -> subsection mapping from `text` (set by an earlier cell).
result = extract_document_structure(text)

# Pretty-print the structure as indented JSON.
import json
print(json.dumps(result, indent=2))

{
  "Definitions": {
    "In this Agreement, unless the context otherwise requires, the expressions set forth below have the following meanings in any schedules or annexes hereto": "\u201cAcceptance\u201d means written confirmation by us that the System is accepted in accordance with the acceptance criteria for Acceptance, set out in Appendix 3; \u201cAcceptance Tests\u201d means the activities to be carried out to verify that the System and Service are in accordance with the acceptance test criteria set out in Appendix 3; \u201cAgreement\u201d means this Master Service Agreement for the Design, Installation and Maintenance of a System and includes these General Terms and Conditions, Schedule 1 and Appendix 1 to Appendix 5; \u201cAppendix 1\u201d contains details of the design and installation specifications for the System which will comply with the Documents and the associated Fee; \u201cAppendix 2\u201d contains details for the supply of the Goods and Service for the System and the a

In [9]:
def extract_document_structure(text):
    """
    Extract document structure into a dictionary of sections and subsections.

    Heuristics used on each stripped, non-empty line:
      * main heading: a fully upper-case line (e.g. "APPENDIX 5"), or a single
        word starting with an upper-case letter that does not end in '.'
      * subheading: a line starting with an upper-case letter and containing
        ':' that directly follows a heading; the whole line becomes the
        subsection name
      * any other line is content of the open subsection, or of the section's
        "General" entry when no subsection is open

    A heading that occurs more than once keeps what was stored under it
    earlier; new entries are added to the same section (an entry with the
    same key is replaced by the later text).

    Note: this notebook defines ``extract_document_structure`` in more than
    one cell; whichever definition was executed last is the one in effect.

    Args:
        text (str): Input text to process

    Returns:
        dict: ``{section: {subsection_or_"General": content}}``
    """
    structure = {}
    current_section = None
    current_subsection = None
    current_content = []

    def is_main_heading(line):
        """Check if line is a main heading"""
        stripped = line.strip()
        if not stripped:
            return False
        # Fully capitalized headings (like "APPENDIX 5")
        if stripped.isupper():
            return True
        # Regular headings (single word, starts with capital, no period)
        return (stripped[0].isupper()
                and not stripped.endswith('.')
                and len(stripped.split()) == 1)

    def is_subheading(line):
        """Check if line is a subheading (capitalised start and a ':')."""
        # Fully upper-case lines never reach this check: they are always
        # claimed by is_main_heading first.
        stripped = line.strip()
        return bool(stripped) and stripped[0].isupper() and ':' in line

    def process_current_content():
        """Store the collected content under the open subsection or "General"."""
        if not (current_section and current_content):
            return
        key = current_subsection if current_subsection else "General"
        # Add to the section's dict instead of replacing it, so entries saved
        # under an earlier occurrence of the same heading survive.
        structure.setdefault(current_section, {})[key] = ' '.join(current_content)

    previous_line_was_heading = False
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if is_main_heading(line):
            process_current_content()

            current_section = line
            current_subsection = None
            current_content = []
            previous_line_was_heading = True
            structure.setdefault(current_section, {})

        elif is_subheading(line) and previous_line_was_heading:
            # A subheading-looking line right after a heading names a
            # subsection of that heading.
            if current_section:
                current_subsection = line
                current_content = []
            previous_line_was_heading = True

        else:
            previous_line_was_heading = False
            current_content.append(line)

    # Save whatever was still being collected when the text ended.
    process_current_content()

    return structure


# Build the section -> subsection mapping from `text` (set by an earlier cell).
result = extract_document_structure(text)

# Pretty-print the structure as indented JSON.
import json
print(json.dumps(result, indent=2))

{
  "MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND": {},
  "MAINTENANCE OF SOLAR SYSTEMS": {
    "General": "THIS MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF A SYSTEM (\u201cAgreement\u201d) is made the ____ day of ________________ 20___ (the \u201cEffective Date\u201d)"
  },
  "BY AND BETWEEN:": {
    "General": "The Government of Bermuda as described in Schedule 1, hereinafter referred to as the \u201cGovernment\u201d, \u201cwe\u201d, \u201cour\u201d or \u201cus\u201d, of the first part; and The supplier of service under this Agreement, whose name and contact details are set out in Schedule 1 to this Agreement and is hereinafter referred to as \u201cSupplier\u201d or \u201cyou\u201d, of the second part. The Government and the Supplier are individually referred to as a \u201cparty\u201d and collectively as the \u201cparties\u201d. This Agreement which consists of the General Terms and Conditions, Schedule 1 and Appendix 1 to 6, sets out the terms and c

In [14]:
# Test on the saved text of contract template 28 (msa_28.txt).
# The encoding is pinned because the contract contains non-ASCII characters
# (e.g. curly quotes).
with open("msa_28.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(len(text))

result = extract_document_structure(text)

# Pretty-print the structure as indented JSON.
import json
print(json.dumps(result, indent=2))

13897
{
  "IOWA STATE UNIVERSITY": {},
  "MASTER SERVICE AGREEMENT": {
    "General": "This Master Service Agreement (\u201cAgreement\u201d) is entered into by Iowa State University of Science and Technology, on behalf of its <ISU UNIT NAME AND ADDRESS>, Ames, Iowa 50011 (\u201cISU\u201d), and <CUSTOMER NAME> (\u201cCustomer\u201d), <CUSTOMER ADDRESS>.  The effective date of this Agreement shall be the date on which the last party signs this Agreement (\u201cEffective Date\u201d). Scope and Performance of Services. Customer has expressed a need to use the services of ISU from time to time on various projects.  Customer and ISU shall enter into an Individual Project Agreement (\u201cIPA\u201d) for each project for which Customer desires ISU to provide services. The IPA shall be substantially in the form of Attachment A.  Each IPA shall be deemed part of and incorporated into this Agreement.  ISU shall perform the services described in the IPA (the \u201cServices\u201d).  Unless stated o

In [8]:
def extract_document_structure(text):
    """
    Extract document structure into a dictionary of sections and subsections.

    Heuristics used on each stripped, non-empty line:
      * main heading: a single word starting with an upper-case letter that
        does not end in '.'
      * subheading: starts with an upper-case letter and contains ':'; the
        part before the first ':' is the subsection name, the rest (if any)
        is the start of its content
      * any other line is content of the open subsection, or of the section's
        "General" entry when no subsection is open yet

    Section text that precedes the first subheading is kept under "General".
    A heading that occurs more than once keeps what was stored under it
    earlier (an entry with the same key is replaced by the later text).
    Lines before the first heading are ignored.

    Note: this notebook defines ``extract_document_structure`` in more than
    one cell; whichever definition was executed last is the one in effect.

    Args:
        text (str): Input text to process

    Returns:
        dict: ``{section: {subsection_or_"General": content}}``
    """
    structure = {}
    current_section = None
    current_subsection = None
    current_content = []

    def is_main_heading(line):
        """Check if line is a main heading (single capitalised word, no period at end)"""
        stripped = line.strip()
        return bool(stripped
                    and stripped[0].isupper()
                    and not stripped.endswith('.')
                    and len(stripped.split()) == 1)

    def is_subheading(line):
        """Check if line is a subheading (capitalised start and a ':' somewhere)"""
        stripped = line.strip()
        return bool(stripped and stripped[0].isupper() and ':' in line)

    def save_pending():
        """Store the collected content under the open subsection or "General"."""
        if current_section and current_content:
            key = current_subsection if current_subsection else "General"
            # Add to the section's dict instead of replacing it, so entries
            # already stored for this heading survive.
            structure.setdefault(current_section, {})[key] = ' '.join(current_content)

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if is_main_heading(line):
            save_pending()
            current_section = line
            current_subsection = None
            current_content = []
            structure.setdefault(current_section, {})

        elif is_subheading(line):
            # Also flushes section text gathered before the first subheading
            # (stored as "General") rather than discarding it.
            save_pending()
            name, _, remainder = line.partition(':')
            current_subsection = name.strip()
            remainder = remainder.strip()
            current_content = [remainder] if remainder else []

        elif current_section:
            current_content.append(line)

    # Save whatever was still being collected when the text ended.
    save_pending()

    return structure

# Build the section -> subsection mapping from `text` (set by an earlier cell).
result = extract_document_structure(text)

# Pretty-print the structure as indented JSON.
import json
print(json.dumps(result, indent=2))

{
  "Definitions": {
    "In this Agreement, unless the context otherwise requires, the expressions set forth below have the following meanings in any schedules or annexes hereto": "\u201cAcceptance\u201d means written confirmation by us that the System is accepted in accordance with the acceptance criteria for Acceptance, set out in Appendix 3; \u201cAcceptance Tests\u201d means the activities to be carried out to verify that the System and Service are in accordance with the acceptance test criteria set out in Appendix 3; \u201cAgreement\u201d means this Master Service Agreement for the Design, Installation and Maintenance of a System and includes these General Terms and Conditions, Schedule 1 and Appendix 1 to Appendix 5; \u201cAppendix 1\u201d contains details of the design and installation specifications for the System which will comply with the Documents and the associated Fee; \u201cAppendix 2\u201d contains details for the supply of the Goods and Service for the System and the a

In [10]:
import json
import chromadb
import tiktoken
from sentence_transformers import SentenceTransformer
from typing import List, Optional, Dict, Any
import logging
from functools import lru_cache
import os
import re
from chromadb.utils import embedding_functions
from backend.contract_analyzer.config import Config

logger = logging.getLogger(__name__)

import re

def extract_sections_to_dict(text):
    """
    Split contract text into an introduction plus a dictionary of sections.

    Each stripped line is classified, in this order:
      1. a line starting with "WHEREAS " -> content of the 'whereas' section
      2. a line that is exactly "SIGNATORIES" -> opens the 'signatories' section
      3. an all-capital line (letters, spaces, apostrophes, hyphens) -> opens a
         regular section keyed by its lower-cased, underscore-joined name;
         names shorter than 3 characters and "CEO"/"CFO" are treated as content
      4. a single capital letter followed on the next line by an all-capital
         name -> opens a lettered section keyed "<LETTER>_<name>"
      5. a line that is only "<digits>.<digits>" -> opens a subsection
      6. a line that is only "(<lowercase letter>)" -> opens a lettered point
      7. any other non-empty line -> content of whatever is currently open

    Numbered main headings such as "1. DEFINITIONS" are not recognised by any
    of these rules and end up as ordinary content.

    Opening any section resets the open subsection and lettered point. Text is
    appended to what is already stored at its level, so a repeated section,
    subsection or letter accumulates text instead of replacing it.

    Args:
        text (str): Full document text.

    Returns:
        dict: ``{'introduction': <first 5000 characters>,
                 'sections': {key: {'name', 'content', 'subsections'[, 'letter']}}}``
        where each subsection is ``{'content': str, 'letters': {letter: str}}``.
    """
    # Lettered section: single capital letter, then the name (possibly on the next line)
    letter_section_pattern = r'^\s*([A-Z])\s*\n*([A-Z][A-Z\s\'\-]+)(?:\n|$)'
    # Section without a letter (like SCOPE OF WORK)
    regular_section_pattern = r'^([A-Z][A-Z\s\'\-]+)(?:\n|$)'
    # Subsection number on its own line (like 1.2)
    subsection_pattern = r'^\s*(\d+\.\d+)\s*$'
    # Lettered point on its own line (like (a))
    letter_pattern = r'^\s*\(([a-z])\)\s*$'
    # Special sections
    whereas_pattern = r'^WHEREAS\s'
    signatories_pattern = r'^SIGNATORIES\s*$'

    sections = {
        'introduction': text[:5000],
        'sections': {}
    }
    store = sections['sections']
    current_section = None
    current_section_name = None
    current_subsection = None
    current_letter = None
    current_content = []

    def flush():
        """Write the lines collected so far to the currently open level."""
        if current_content:
            _save_current_content(store, current_section, current_section_name,
                                  current_subsection, current_letter, current_content)

    def open_section(key, name, extra=None):
        """Flush pending text, then make `key` the open section (creating it if new)."""
        nonlocal current_section, current_section_name
        nonlocal current_subsection, current_letter, current_content
        flush()
        if key not in store:
            entry = dict(extra or {})
            entry.update(name=name, content='', subsections={})
            store[key] = entry
        current_section = key
        current_section_name = name
        # A new section never inherits the previous section's subsection or
        # lettered point; leaving them set would file text under keys that do
        # not exist in the new section.
        current_subsection = None
        current_letter = None
        current_content = []

    lines = text.split('\n')
    i = 0

    while i < len(lines):
        current_line = lines[i].strip()
        next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
        combined_lines = f"{current_line}\n{next_line}"

        # WHEREAS recital: the line itself is part of the section's content
        if re.match(whereas_pattern, current_line):
            open_section('whereas', 'WHEREAS')
            current_content = [current_line]
            i += 1
            continue

        # SIGNATORIES section
        if re.match(signatories_pattern, current_line):
            open_section('signatories', 'SIGNATORIES')
            i += 1
            continue

        # Regular all-capital section heading
        regular_match = re.match(regular_section_pattern, current_line)
        if regular_match:
            section_name = regular_match.group(1).strip()
            # Likely false positives stay part of the running content
            if len(section_name) < 3 or section_name in ['CEO', 'CFO']:
                current_content.append(current_line)
                i += 1
                continue

            open_section(section_name.lower().replace(' ', '_'), section_name)
            i += 1
            continue

        # Lettered section (like A, B, C); consumes the letter line and the name line
        letter_match = re.match(letter_section_pattern, combined_lines)
        if letter_match:
            section_letter = letter_match.group(1)
            section_name = letter_match.group(2).strip()
            section_key = f"{section_letter}_{section_name.lower().replace(' ', '_')}"
            open_section(section_key, section_name, {'letter': section_letter})
            i += 2
            continue

        # Subsection number
        subsection_match = re.match(subsection_pattern, current_line)
        if subsection_match:
            flush()
            current_subsection = subsection_match.group(1)
            if current_section:
                subsections = store[current_section].setdefault('subsections', {})
                subsections.setdefault(current_subsection, {'content': '', 'letters': {}})
            current_letter = None
            current_content = []
            i += 1
            continue

        # Lettered point
        point_match = re.match(letter_pattern, current_line)
        if point_match:
            flush()
            current_letter = point_match.group(1)
            current_content = []
            i += 1
            continue

        if current_line:
            current_content.append(current_line)
        i += 1

    # Save whatever was still being collected when the text ended
    flush()

    return sections

def _save_current_content(sections, section, section_name, subsection, letter, content):
    """Append collected lines to the section, subsection or lettered point they belong to.

    Args:
        sections (dict): Mapping of section key to section entry; modified in place.
        section: Key of the open section; nothing is saved when it is empty or unknown.
        section_name: Display name of the section (not used when saving).
        subsection: Open subsection number, or None to save at section level.
        letter: Open lettered point, or None; only honoured inside a subsection.
        content (list): Lines to join with single spaces.
    """
    if not section or not sections:
        return

    content_text = ' '.join(content).strip()
    if not content_text:
        return

    entry = sections.get(section)
    if entry is None:
        return

    def _appended(existing, addition):
        """Join new text onto existing text without losing what is already stored."""
        return f"{existing} {addition}" if existing else addition

    if subsection:
        # setdefault guards against a subsection that was never registered
        # under this section.
        sub_entry = entry.setdefault('subsections', {}).setdefault(
            subsection, {'content': '', 'letters': {}})
        if letter:
            sub_entry['letters'][letter] = _appended(sub_entry['letters'].get(letter, ''), content_text)
        else:
            sub_entry['content'] = _appended(sub_entry.get('content', ''), content_text)
    else:
        # Lettered points outside any subsection land here too, so appending
        # keeps the section's earlier text.
        entry['content'] = _appended(entry.get('content', ''), content_text)

def process_file(text: str):
    """Run section extraction on ``text``.

    Returns the dictionary produced by ``extract_sections_to_dict``. If that
    call raises, the error message is printed and ``None`` is returned.
    """
    try:
        return extract_sections_to_dict(text)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return None
    
    
class VectorDB:
    """Core vector database operations.

    Wraps a persistent ChromaDB client and tracks one "active" collection that
    the add / get / query methods operate on. Most public methods log a failure
    and signal it through their return value (``False`` / ``None``) instead of
    raising; only initialization re-raises.
    """

    def __init__(self):
        """Initialize database components"""
        # The logger must exist before _init_components() runs: its failure
        # path logs through self.logger, and a missing attribute there would
        # replace the real error with an AttributeError.
        self.logger = logging.getLogger(__name__)
        self.active_collection = None
        self._init_components()

    def _init_components(self):
        """Create the on-disk client and the embedding function.

        Raises:
            Exception: Any start-up error is logged and re-raised unchanged.
        """
        try:
            db_path = str(Config.CHROMA_DB_PATH)
            os.makedirs(db_path, exist_ok=True)

            self.client = chromadb.PersistentClient(path=db_path)
            self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )

        except Exception as e:
            self.logger.error(f"VectorDB initialization failed: {str(e)}")
            raise

    @lru_cache(maxsize=100)
    def _compute_embedding(self, text: str) -> List[float]:
        """
        Compute embedding for text with caching

        Args:
            text: Text to embed

        Returns:
            List of embedding values
        """
        # NOTE(review): self.model is not assigned anywhere in this class, so
        # this helper raises AttributeError unless a caller sets it first.
        # Confirm whether it should go through self.embedding_fn instead.
        # The cache key includes self, so cached entries keep the instance
        # referenced until cleanup() clears the cache.
        return self.model.encode([text], normalize_embeddings=True).tolist()[0]

    def create_collection(self, collection_name: str) -> bool:
        """
        Create a collection (or reuse an existing one) and make it active

        Args:
            collection_name: Name for the collection; sanitized before use

        Returns:
            Success status
        """
        try:
            safe_name = self._sanitize_collection_name(collection_name)
            if not self._collection_exists(safe_name):
                self.logger.info(f"Creating new collection: {safe_name}")
                self.active_collection = self.client.create_collection(
                    name=safe_name,
                    embedding_function=self.embedding_fn,
                    metadata={"name": safe_name},
                )
                self.logger.info(f"Created new collection: {safe_name}")
            else:
                # Re-open with the same embedding function used at creation,
                # matching set_active_collection().
                self.active_collection = self.client.get_collection(
                    name=safe_name,
                    embedding_function=self.embedding_fn,
                )
                self.logger.info(f"Using existing collection: {safe_name}")
            return True

        except Exception as e:
            self.logger.error(f"Collection creation failed: {str(e)}")
            return False

    def set_active_collection(self, collection_name: str) -> bool:
        """
        Set the active collection for operations

        Args:
            collection_name: Name of collection to activate

        Returns:
            Success status; False when the collection does not exist
        """
        try:
            safe_name = self._sanitize_collection_name(collection_name)
            if not self._collection_exists(safe_name):
                self.logger.error(f"Collection not found: {safe_name}")
                return False

            self.active_collection = self.client.get_collection(
                name=safe_name,
                embedding_function=self.embedding_fn,
            )
            self.logger.info(f"Set active collection to: {safe_name}")
            return True

        except Exception as e:
            self.logger.error(f"Failed to set active collection: {str(e)}")
            return False

    def prepare_documents(
        self,
        sections: Dict,
        dump_path: Optional[str] = 'documents.json'
    ) -> List[Dict]:
        """
        Creates documents with complete sections including all subsections and letters.
        Handles both lettered and non-lettered sections.

        Args:
            sections: Parsed contract with an optional 'introduction' string
                and a 'sections' mapping of section key -> section dict
            dump_path: File the documents are also written to as JSON; pass
                None to skip writing. Defaults to 'documents.json' in the
                working directory.

        Returns:
            One dict per document with 'id', 'text' and 'metadata' keys
        """
        documents = []

        # First handle the introduction if present
        if 'introduction' in sections:
            documents.append({
                'id': 'introduction',
                'text': sections['introduction'],
                'metadata': {
                    'name': 'Introduction'
                }
            })

        # Process all sections
        for section_key, section_data in sections.get('sections', {}).items():
            full_text = []

            if section_key in ['whereas', 'signatories']:
                # Special sections: heading plus body, no subsections
                full_text.append(section_data['name'])
                if section_data.get('content'):
                    full_text.append(section_data['content'])
            else:
                # Regular or lettered sections
                section_header = section_data['name']
                if 'letter' in section_data:
                    section_header = f"{section_data['letter']}. {section_header}"
                full_text.append(section_header)

                if section_data.get('content'):
                    full_text.append(section_data['content'])

                for subsec_num, subsec_data in section_data.get('subsections', {}).items():
                    full_text.append(f"\nSubsection {subsec_num}:")
                    if subsec_data.get('content'):
                        full_text.append(subsec_data['content'])

                    # Lettered points with empty text are left out
                    for letter, content in subsec_data.get('letters', {}).items():
                        if content:
                            full_text.append(f"\n({letter}) {content}")

            metadata = {
                'name': section_data['name']
            }
            if 'letter' in section_data:
                metadata['letter'] = section_data['letter']

            documents.append({
                'id': section_key.lower().replace(' ', '_'),
                'text': '\n'.join(full_text),
                'metadata': metadata
            })

        # Keep a JSON copy of the prepared documents unless the caller opted out
        if dump_path is not None:
            with open(dump_path, 'w') as f:
                json.dump(documents, f)

        return documents

    def add_documents(
        self,
        texts: str,
    ) -> bool:
        """
        Parse contract text into section documents and add them to the active collection

        Args:
            texts: Full contract text; it is split into sections by
                process_file() and prepare_documents()

        Returns:
            Success status; False when there is no active collection, the
            text cannot be parsed, or the insert fails
        """
        if not self.active_collection:
            print("********No active collection")
            self.logger.error("No active collection")
            return False

        try:
            docs = process_file(texts)
            # process_file() reports a parse failure by returning None; stop
            # here so the log names the real cause.
            if docs is None:
                self.logger.error("Document addition failed: contract text could not be parsed")
                return False

            print("********Documents processed")

            documents = self.prepare_documents(docs)

            print("********Documents prepared")

            ids = [doc['id'] for doc in documents]
            doc_texts = [doc['text'] for doc in documents]
            metadatas = [doc['metadata'] for doc in documents]

            self.logger.info(f"Adding {len(documents)} documents to collection")

            print(f"********Adding {len(documents)} documents to collection")
            self.active_collection.add(
                ids=ids,
                documents=doc_texts,
                metadatas=metadatas,
            )

            self.logger.info(f"Added {len(documents)} documents to collection")

            print("********Documents added")

            return True

        except Exception as e:
            self.logger.error(f"Document addition failed: {str(e)}")
            return False

    def get_documents(
        self,
        ids: Optional[List[str]] = None
    ) -> Optional[Dict[str, List]]:
        """
        Get documents from active collection

        Args:
            ids: Optional list of document IDs to retrieve

        Returns:
            Result of the collection's get() call, or None when there is no
            active collection or retrieval fails
        """
        if not self.active_collection:
            print("********No active collection while getting documents")
            self.logger.error("No active collection")
            return None

        try:
            return self.active_collection.get(ids=ids)
        except Exception as e:
            self.logger.error(f"Document retrieval failed: {str(e)}")
            return None

    def get_context(
        self,
        query: str,
        num_results: int = 3
    ) -> Optional[str]:
        """
        Get relevant context for a query

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            Matching chunks joined by "\\n...\\n", or None when there is no
            active collection, nothing matches, or the query fails
        """

        if not self.active_collection:
            print("********No active collection while getting context")
            self.logger.error("No active collection")
            return None

        try:
            results = self.active_collection.query(
                query_texts=[query],
                n_results=num_results,
            )

            if not results['documents'] or not results['documents'][0]:
                return None

            # Order by chunk text only. Sorting (chunk, metadata) pairs fell
            # through to comparing the metadata dicts whenever two chunks had
            # identical text, which raises TypeError.
            # NOTE(review): alphabetical order discards the order the query
            # returned; confirm this is the intended ordering.
            ordered_chunks = sorted(results['documents'][0])

            return "\n...\n".join(ordered_chunks)

        except Exception as e:
            self.logger.error(f"Context retrieval failed: {str(e)}")
            return None

    def delete_collection(self, collection_name: str) -> bool:
        """
        Delete a collection

        Args:
            collection_name: Name of collection to delete

        Returns:
            Success status; False when the collection does not exist
        """
        print("*********Deleting collection")
        try:
            safe_name = self._sanitize_collection_name(collection_name)
            if not self._collection_exists(safe_name):
                self.logger.warning(f"Collection not found: {safe_name}")
                return False

            self.client.delete_collection(name=safe_name)
            # Drop the handle if it pointed at the collection just removed
            if self.active_collection and self.active_collection.name == safe_name:
                self.active_collection = None

            self.logger.info(f"Deleted collection: {safe_name}")
            return True

        except Exception as e:
            self.logger.error(f"Collection deletion failed: {str(e)}")
            return False

    def _collection_exists(self, collection_name: str) -> bool:
        """Check if a collection exists"""
        # list_collections() may yield plain names or objects exposing .name,
        # depending on the installed chromadb release; compare on the name in
        # both cases so the check does not silently return False.
        return any(
            getattr(entry, "name", entry) == collection_name
            for entry in self.client.list_collections()
        )

    def _sanitize_collection_name(self, name: str) -> str:
        """Sanitize collection name for database use (non-alphanumerics become "_")"""
        return "".join(c if c.isalnum() else "_" for c in name)

    def _prepare_batch_metadata(
        self,
        batch_start: int,
        batch_size: int,
        token_counts: List[int],
        timestamp: str,
        total_chunks: int,
        metadatas: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, Any]]:
        """Prepare metadata for batch processing

        Returns one dict per entry of ``token_counts`` carrying the token
        count, timestamp, absolute chunk index and total chunk count. When
        ``metadatas`` is given, every entry of the batch also inherits the
        fields of ``metadatas[batch_start // batch_size]``.
        """
        if metadatas:
            return [{
                **metadatas[batch_start//batch_size].copy(),
                'tokens': count,
                'timestamp': timestamp,
                'chunk_index': batch_start + j,
                'total_chunks': total_chunks
            } for j, count in enumerate(token_counts)]
        else:
            return [{
                'tokens': count,
                'timestamp': timestamp,
                'chunk_index': batch_start + j,
                'total_chunks': total_chunks
            } for j, count in enumerate(token_counts)]

    def cleanup(self):
        """Cleanup database resources"""
        try:
            print("Cleaning up database")
            self.active_collection = None
            self._compute_embedding.cache_clear()
            self.logger.info("Database cleanup completed")
        except Exception as e:
            self.logger.error(f"Cleanup failed: {str(e)}")

In [11]:
# Persist the extracted contract text as UTF-8 bytes.
with open("/home/ajay/contracts_v2/msa.txt", mode="wb") as f:
    f.write(res['content'].encode(encoding='utf-8'))
    


In [12]:
# The file is written as UTF-8 bytes, so decode it as UTF-8 explicitly rather
# than relying on the locale's default encoding (the contract text contains
# non-ASCII characters such as curly quotes).
with open("/home/ajay/contracts_v2/msa.txt", "r", encoding="utf-8") as f:
    text = f.read()

text

'MASTER SERVICES AGREEMENT\nThis Agreement is made and entered into by and between Texas Woman’s University, a public university organized under Chapter 107 of the Texas Education Code, whose main office address is at 304 Administration Drive, Denton TX. 76201 ("University" or “TWU”), for and on behalf of the insert University/School and/or budgeted department name (“Department”), and insert legal name of Contractor ("Contractor") a insert type of business structure such as corporation, limited liability company, or partnership with its principal place of business at insert street address, city, state, zip code. \nCONTRACTED SERVICES\nContractor Services. Contractor will provide the services as set forth in Exhibit A, Statement of Work, attached hereto and incorporated for all purposes (the “Services”), to the satisfaction of University.\nTerm and Termination. Choose the proper dates / number of days below.\nThis Agreement will commence on Effective Date (the “Effective Date”) and end 

In [13]:
from pathlib import Path

def create_collection_name(file_path: Path) -> str:
    """Build a collection name from a file's stem and its size in bytes.

    Spaces and hyphens in the stem become underscores and the stem is
    lower-cased; the size reported by ``file_path.stat()`` is appended after an
    underscore, so the file must exist.
    """
    normalized_stem = file_path.stem.replace(" ", "_").replace("-", "_").lower()
    size_in_bytes = file_path.stat().st_size
    return f"{normalized_stem}_{size_in_bytes}"
# Placeholder: an empty path is equivalent to Path("."), so this names the
# current directory, not a contract file.
input_path = ""
coll_name = create_collection_name(file_path=Path(input_path))

In [14]:
vec = VectorDB()

input_path = r"/home/ajay/contracts_v2/msa.txt"

coll_name = create_collection_name(Path(input_path))

# Start from a clean slate. delete_collection() returns False both when the
# collection does not exist and when deletion fails, so its result is not
# treated as fatal here.
vec.delete_collection(coll_name)

# create_collection() and set_active_collection() report failure by returning
# False instead of raising; check them so the success message below is only
# printed when the collection really is ready.
if not vec.create_collection(collection_name=coll_name):
    raise RuntimeError(f"Could not create collection {coll_name!r}")

if not vec.set_active_collection(coll_name):
    raise RuntimeError(f"Could not activate collection {coll_name!r}")

print("********Collection created")

with open(input_path, 'r', encoding='utf-8') as file:
    text = file.read()
    
    




*********Deleting collection
********Collection created


In [15]:
# Index the contract text; the displayed result is the success flag.
vec.add_documents(texts=text)

********Documents processed
********Documents prepared
********Adding 4 documents to collection
********Documents added


True

In [17]:
import re
from typing import List, Dict
import json

class ContractProcessor:
    """Splits one contract's text into main sections and subsections.

    The main-section boundaries are fixed, case-sensitive headings
    ("MASTER SERVICES AGREEMENT", "CONTRACTED SERVICES", ...), so text that
    lacks them produces no sections.
    """

    # (section key, regex) pairs, tried in this order. Each regex captures the
    # text between its heading and the next heading in the group "content".
    _MAIN_SECTION_PATTERNS = (
        ('master_agreement', r'MASTER SERVICES AGREEMENT(?P<content>.*?)(?=CONTRACTED SERVICES)'),
        ('contracted_services', r'CONTRACTED SERVICES(?P<content>.*?)(?=ADDITIONAL TERMS AND CONDITIONS)'),
        ('additional_terms', r'ADDITIONAL TERMS AND CONDITIONS(?P<content>.*?)(?=\[Signature Page)'),
        ('signature_page', r'\[Signature Page(?P<content>.*?)(?=Exhibit A)'),
        ('exhibit_a', r'Exhibit A(?P<content>.*?)$'),
    )

    def __init__(self, text: str):
        self.sections = {}
        self.text = text

    @staticmethod
    def _title_from_key(key: str) -> str:
        """Turn a snake_case key into a display title ("exhibit_a" -> "Exhibit A")."""
        return key.replace('_', ' ').title()

    def extract_main_sections(self) -> Dict[str, str]:
        """Locate each main section and store its stripped text in ``self.sections``.

        A heading that is not found only produces a printed warning; a section
        whose lookup raises is stored as an empty string.

        Returns:
            ``self.sections`` (section key -> text).
        """
        for key, regex in self._MAIN_SECTION_PATTERNS:
            try:
                found = re.search(regex, self.text, re.DOTALL)
                if found is None:
                    print(f"Warning: Section '{key}' not found")
                else:
                    self.sections[key] = found.group('content').strip()
            except Exception as e:
                print(f"Error processing section '{key}': {str(e)}")
                self.sections[key] = ""

        return self.sections

    def extract_subsections(self) -> Dict[str, Dict[str, str]]:
        """Break the already-extracted main sections into subsections.

        Only 'contracted_services' (headed sentences) and 'additional_terms'
        (numbered terms, keyed ``term_<n>``) are split; a section that was
        never extracted is absent from the result.
        """
        stored = self.sections
        result = {}

        if 'contracted_services' in stored:
            result['contracted_services'] = self._extract_numbered_sections(
                stored['contracted_services']
            )

        if 'additional_terms' in stored:
            numbered_terms = re.finditer(
                r'(\d+)\.\s*([^0-9]*?)(?=(?:\d+\.|$))',
                stored['additional_terms'],
                re.DOTALL,
            )
            # A repeated term number keeps the text of its last occurrence.
            result['additional_terms'] = {
                f'term_{hit.group(1)}': hit.group(2).strip()
                for hit in numbered_terms
            }

        return result

    def _extract_numbered_sections(self, text: str) -> Dict[str, str]:
        """Map each "Heading." found in ``text`` to the text that follows it.

        Keys are the heading lower-cased with spaces replaced by underscores.
        """
        heading_re = r'([A-Za-z][A-Za-z\s]+?)\.(.*?)(?=[A-Za-z][A-Za-z\s]+\.|$)'
        found = {}

        for hit in re.finditer(heading_re, text, re.DOTALL):
            try:
                key = hit.group(1).strip().lower().replace(' ', '_')
                found[key] = hit.group(2).strip()
            except Exception as e:
                print(f"Error processing section: {str(e)}")
                continue

        return found

    def prepare_for_vector_db(self) -> List[Dict[str, str]]:
        """Build the flat list of documents: main sections first, then subsections.

        Empty sections and subsections are skipped. If anything raises, the
        error is printed and the documents collected so far are returned.
        """
        documents = []

        try:
            main_sections = self.extract_main_sections()
            subsections = self.extract_subsections()

            for section_name, content in main_sections.items():
                if not content:
                    continue
                documents.append(dict(
                    title=self._title_from_key(section_name),
                    content=content,
                    type='main_section',
                    section=section_name,
                ))

            for main_section, subs in subsections.items():
                for sub_name, content in subs.items():
                    if not content:
                        continue
                    documents.append(dict(
                        title=self._title_from_key(sub_name),
                        content=content,
                        type='subsection',
                        section=main_section,
                        subsection=sub_name,
                    ))
        except Exception as e:
            print(f"Error preparing documents for vector DB: {str(e)}")

        return documents

def process_contract_text(text: str) -> List[Dict[str, str]]:
    """Run ``ContractProcessor`` over ``text`` and return its vector-DB documents."""
    return ContractProcessor(text).prepare_for_vector_db()

def main():
    """Read the saved contract text, split it into documents, and dump them as JSON.

    Any failure is printed rather than raised.
    """
    source_path = '/home/ajay/contracts_v2/msa.txt'
    output_path = 'processed_contract.json'

    try:
        with open(source_path, 'r', encoding='utf-8') as src:
            contract_text = src.read()

        documents = process_contract_text(contract_text)

        with open(output_path, 'w', encoding='utf-8') as dst:
            json.dump(documents, dst, indent=2, ensure_ascii=False)

        print(f"Successfully processed {len(documents)} sections and saved to processed_contract.json")

    except Exception as e:
        print(f"Error processing contract: {str(e)}")

if __name__ == "__main__":
    main()


Successfully processed 74 sections and saved to processed_contract.json


In [21]:
# Extract the template-25 contract; `res` holds the resulting content/metadata dict.
path = r"/home/ajay/contracts_v2/data/master-service-agreement-template-25.docx"
struct = StructuredProcessor()
res = struct._process_docx(file_path=Path(path))

In [22]:
# Display the extracted text (shown as a repr, so newlines appear as \n).
res["content"]

'MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND \nMAINTENANCE OF SOLAR SYSTEMS \nTHIS MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF A SYSTEM (“Agreement”) is made the ____ day of ________________ 20___ (the “Effective Date”)\nBY AND BETWEEN: \nThe Government of Bermuda as described in Schedule 1, hereinafter referred to as the “Government”, “we”, “our” or “us”, of the first part; and\nThe supplier of service under this Agreement, whose name and contact details are set out in Schedule 1 to this Agreement and is hereinafter referred to as “Supplier” or “you”, of the second part.  \nThe Government and the Supplier are individually referred to as a “party” and collectively as the “parties”.\nThis Agreement which consists of the General Terms and Conditions, Schedule 1 and Appendix 1 to 6, sets out the terms and conditions upon which the Supplier will provide service to the Government.\nGENERAL TERMS AND CONDITIONS\nIN CONSIDERATION of the premises and mutual p

In [23]:
def main():
    """Split the contract text held in the notebook-level ``res`` into documents and dump them as JSON.

    This redefines the ``main()`` from the earlier cell: the text comes from
    ``res['content']`` instead of a file, and the output goes to a separate
    JSON file. Any failure is printed rather than raised.
    """
    output_path = 'processed_contract_template_25.json'

    try:
        contract_text = res['content']

        # Process the contract
        documents = process_contract_text(contract_text)

        # Save processed documents to JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(documents, f, indent=2, ensure_ascii=False)

        # Report the file that was actually written.
        print(f"Successfully processed {len(documents)} sections and saved to {output_path}")

    except Exception as e:
        print(f"Error processing contract: {str(e)}")

if __name__ == "__main__":
    main()

Successfully processed 0 sections and saved to processed_contract.json


In [18]:
# Load msa_25.txt (relative to the working directory) into `text`.
text = Path("msa_25.txt").read_text()


In [28]:
# Preview the first 10,000 characters, the same slice that is sent to the model below.
text[:10000]

'MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND \nMAINTENANCE OF SOLAR SYSTEMS \nTHIS MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF A SYSTEM (“Agreement”) is made the ____ day of ________________ 20___ (the “Effective Date”)\nBY AND BETWEEN: \nThe Government of Bermuda as described in Schedule 1, hereinafter referred to as the “Government”, “we”, “our” or “us”, of the first part; and\nThe supplier of service under this Agreement, whose name and contact details are set out in Schedule 1 to this Agreement and is hereinafter referred to as “Supplier” or “you”, of the second part.  \nThe Government and the Supplier are individually referred to as a “party” and collectively as the “parties”.\nThis Agreement which consists of the General Terms and Conditions, Schedule 1 and Appendix 1 to 6, sets out the terms and conditions upon which the Supplier will provide service to the Government.\nGENERAL TERMS AND CONDITIONS\nIN CONSIDERATION of the premises and mutual p

In [21]:
# Total number of characters in the loaded contract text.
len(text)

114054

In [29]:
from ollama import chat
from ollama import ChatResponse

# Ask the local llama3.1 model to split the first 10,000 characters of the
# contract into section-name / section-text pairs.
response: ChatResponse = chat(model='llama3.1', messages=[
  {
    'role': 'user',
    'content': f'''For the given contract \n 
    
    {text[:10000]} \n 
    
    Extract the text like key and value pairs.
    
    key is the section name and value is the content of the section.
    
    Do not add any extra information in the response.
    Do not do any Analysis on the text.
    Print as it is in the original Text
    ''',
  },
])
# The reply is reachable both as response['message']['content'] and as
# response.message.content; print it once instead of twice.
print(response.message.content)

Here are the extracted key-value pairs:

**MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF SOLAR PANEL SYSTEM**

* MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF SOLAR PANEL SYSTEM: This Agreement made on [Insert Date] by and between Government of Bermuda (hereinafter referred to as "Government") and [Supplier's Name] (hereinafter referred to as "Supplier").

**ARTICLE 1 - DEFINITIONS**

* ARTICLE 1 - DEFINITIONS: In this Agreement, the following terms shall have the meanings set forth below:

**ARTICLE 2 - TERM AND TERMINATION**

* ARTICLE 2 - TERM AND TERMINATION: The term of this Agreement shall commence on [Insert Date] and terminate on [Insert Date].

**ARTICLE 3 - OBLIGATIONS OF THE SUPPLIER**

* ARTICLE 3 - OBLIGATIONS OF THE SUPPLIER: The Supplier agrees to:

**ARTICLE 4 - OBLIGATIONS OF THE GOVERNMENT**

* ARTICLE 4 - OBLIGATIONS OF THE GOVERNMENT: The Government agrees to:

**ARTICLE 5 - INSURANCE AND INDEMNITY**

* ARTICLE 5 - INSUR

In [27]:
# print() renders the line breaks, unlike the repr shown for a bare expression.
print(text[:10000])

MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND 
MAINTENANCE OF SOLAR SYSTEMS 
THIS MASTER SERVICE AGREEMENT FOR DESIGN, INSTALLATION AND MAINTENANCE OF A SYSTEM (“Agreement”) is made the ____ day of ________________ 20___ (the “Effective Date”)
BY AND BETWEEN: 
The Government of Bermuda as described in Schedule 1, hereinafter referred to as the “Government”, “we”, “our” or “us”, of the first part; and
The supplier of service under this Agreement, whose name and contact details are set out in Schedule 1 to this Agreement and is hereinafter referred to as “Supplier” or “you”, of the second part.  
The Government and the Supplier are individually referred to as a “party” and collectively as the “parties”.
This Agreement which consists of the General Terms and Conditions, Schedule 1 and Appendix 1 to 6, sets out the terms and conditions upon which the Supplier will provide service to the Government.
GENERAL TERMS AND CONDITIONS
IN CONSIDERATION of the premises and mutual promises in