In [1]:
import fitz  # PyMuPDF
import os



In [None]:
import re

def extract_confidentiality_statements(doc, page_range=3):
    """
    Extract confidentiality statements and disclaimers from the PDF document.
    
    Args:
        doc (fitz.Document): Opened PDF document using PyMuPDF.
        page_range (int): Number of pages to check from the start and end of the document.
        
    Returns:
        str: Extracted confidentiality statement and disclaimer text.
    """
    confidentiality_text = []
    disclaimer_keywords = ["confidential", "disclaimer", "proprietary", "not for distribution", "private"]
    
    pages_to_check = list(range(min(page_range, len(doc)))) + list(range(max(0, len(doc) - page_range), len(doc)))
    pages_to_check = list(set(pages_to_check))  # Remove duplicates if document is shorter than 2*page_range
    
    for page_num in pages_to_check:
        page_text = doc[page_num].get_text("text")
        for keyword in disclaimer_keywords:
            if keyword in page_text.lower():
                sentences = re.findall(r"([^.]*?{}[^.]*\.)".format(re.escape(keyword)), page_text, flags=re.IGNORECASE)
                confidentiality_text.extend(sentences)
    
    # Combine and clean up the extracted text
    return " ".join(confidentiality_text).strip()

def extract_features(pdf_path):
    # Existing feature extraction code...
    doc = fitz.open(pdf_path)
    
    # Your existing code for feature extraction
    # ...
    
    # Extract confidentiality statements and disclaimers
    confidentiality_disclaimer = extract_confidentiality_statements(doc)
    
    # Add the confidentiality and disclaimer text to the features
    features_and_metadata['confidentiality_disclaimer'] = confidentiality_disclaimer
    
    return features_and_metadata


In [8]:
import fitz  # PyMuPDF

def extract_features(pdf_path):
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Basic feature placeholders
    aspect_ratios = []
    page_count = len(doc)
    text_density = []
    image_count = 0
    
    # Metadata extraction
    metadata = doc.metadata
    title = metadata.get('title', 'Unknown')  # Default to 'Unknown' if title is not present

    title_presentation = 0 if not title else 1 if title.lower().find('presentation') != -1 else 0
    title_investor = 0 if not title else 1 if title.lower().find('invest') != -1 else 0

    confidentiality_disclaimer = extract_confidentiality_statements(doc)
    
    # Add the confidentiality and disclaimer text to the features

    

    
    aspect_ratio = set()
    for page in doc:
        # Aspect Ratio
        rect = page.rect
        aspect_ratio.add (rect.width, rect.height)
        
        # Text Density
        text_blocks = page.get_text("blocks")
        text_length = sum(len(block[4]) for block in text_blocks)
        area = rect.width * rect.height
        density = text_length / area
        text_density.append(density)
        
        # Image Count
        image_list = page.get_images(full=True)
        image_count += len(image_list)
    
    # Calculate averages or totals as needed
    avg_aspect_ratio = sum(aspect_ratios) / page_count
    avg_text_density = sum(text_density) / page_count
    
    # Compile extracted features and metadata
    features_and_metadata = {
        "title": title,
        "aspect_ratio": aspect_ratio,
        "page_count": page_count,
        "avg_text_density": avg_text_density,
        "image_count": image_count,
        "title_presentation": title_presentation,
        "title_investor": title_investor,
        "confidentiality_disclaimer": confidentiality_disclaimer,
    }
    
    return features_and_metadata


# Example usage
pdf_path = "data/files/p23-0016_exhibit1.pdf"
features = extract_features(pdf_path)
print(features)


{'title': 'Microsoft PowerPoint - Disney - Short Public Presentation (Jan 2023)_vF (003).pptx [Read-Only]', 'avg_aspect_ratio': 1.294117647058824, 'page_count': 35, 'avg_text_density': 0.0037499174754076713, 'image_count': 52, 'author': 'Smithli', 'creation_date': "D:20230111214147-05'00'", 'mod_date': "D:20230111214238-05'00'"}
