In [8]:
import os
import re
from typing import List, Dict, Tuple
import logging
import json

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define the tags we want to keep
TARGET_TAGS = {'TEXT', 'FORM', 'TABLE'}

def convert_document(input_file: str, output_dir: str) -> Dict:
    """
    Extract content from TEXT, FORM, and TABLE tags from a single document.
    Create two output files for each input document:
    1. [filename]_lines.txt - containing just the content lines
    2. [filename]_labels.txt - containing the labels for each line
    
    Args:
        input_file (str): Path to the input XML-tagged document
        output_dir (str): Directory to save the output files
        
    Returns:
        Dict: Statistics about the conversion process
    """
    stats = {
        'filename': os.path.basename(input_file),
        'lines_extracted': 0,
        'tags_kept': {'TEXT': 0, 'FORM': 0, 'TABLE': 0},
        'char_counts': {'TEXT': 0, 'FORM': 0, 'TABLE': 0},
        'tags_skipped': {}
    }
    
    try:
        # Read the input file
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Extract lines and labels
        document_lines = []
        document_labels = []
        
        # Match target tags and their content
        pattern = r'<([A-Za-z][^>]*)>(.*?)</\1>'
        matches = re.finditer(pattern, content, re.DOTALL)
        
        for match in matches:
            tag = match.group(1)  # The tag name (e.g., TEXT, FORM, TABLE)
            text = match.group(2)  # The content between tags
            
            # Only process content from target tags
            if tag in TARGET_TAGS:
                # Split text into lines and keep non-empty ones
                lines = [line.strip() for line in text.split('\n') if line.strip()]
                
                # Add each line and its corresponding label
                for line in lines:
                    document_lines.append(line)
                    document_labels.append(tag)
                    stats['tags_kept'][tag] += 1
                    stats['char_counts'][tag] += len(line)
            else:
                # Count skipped tags
                if tag not in stats['tags_skipped']:
                    stats['tags_skipped'][tag] = 0
                stats['tags_skipped'][tag] += 1
        
        stats['lines_extracted'] = len(document_lines)
        
        # Create the output filenames based on the input filename
        base_filename = os.path.basename(input_file)
        name_without_ext = os.path.splitext(base_filename)[0]
        
        lines_file = os.path.join(output_dir, f"{name_without_ext}_lines.txt")
        labels_file = os.path.join(output_dir, f"{name_without_ext}_labels.txt")
        
        # Write the lines and labels to separate files
        if document_lines:
            with open(lines_file, 'w', encoding='utf-8') as f:
                for line in document_lines:
                    f.write(f"{line}\n")
            
            with open(labels_file, 'w', encoding='utf-8') as f:
                for label in document_labels:
                    f.write(f"{label}\n")
            
            logger.info(f"Created {lines_file} with {len(document_lines)} lines")
        else:
            logger.warning(f"No matching content found in {input_file}")
        
    except Exception as e:
        logger.error(f"Error processing {input_file}: {str(e)}")
    
    return stats

def process_directory(input_dir: str, output_dir: str) -> Dict:
    """
    Process all files in a directory, creating two output files per input document.
    Also generates detailed statistics about line counts per document and overall.
    
    Args:
        input_dir (str): Directory containing input files
        output_dir (str): Directory to save output files
        
    Returns:
        Dict: Summary statistics
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    summary = {
        'files_processed': 0,
        'files_with_content': 0,
        'total_lines_extracted': 0,
        'tag_counts': {'TEXT': 0, 'FORM': 0, 'TABLE': 0},
        'char_counts': {'TEXT': 0, 'FORM': 0, 'TABLE': 0},
        'skipped_tags': {},
        'per_document_stats': []
    }
    
    # Process each file in the input directory
    for filename in os.listdir(input_dir):
        input_path = os.path.join(input_dir, filename)
        
        # Skip directories
        if os.path.isdir(input_path):
            continue
        
        # Process the file
        stats = convert_document(input_path, output_dir)
        
        # Save detailed stats for this document
        doc_stats = {
            'filename': stats['filename'],
            'total_lines': stats['lines_extracted'],
            'TEXT_lines': stats['tags_kept']['TEXT'],
            'FORM_lines': stats['tags_kept']['FORM'],
            'TABLE_lines': stats['tags_kept']['TABLE'],
            'TEXT_chars': stats['char_counts']['TEXT'],
            'FORM_chars': stats['char_counts']['FORM'],
            'TABLE_chars': stats['char_counts']['TABLE'],
            'TEXT_avg_len': stats['char_counts']['TEXT'] / stats['tags_kept']['TEXT'] if stats['tags_kept']['TEXT'] > 0 else 0,
            'FORM_avg_len': stats['char_counts']['FORM'] / stats['tags_kept']['FORM'] if stats['tags_kept']['FORM'] > 0 else 0,
            'TABLE_avg_len': stats['char_counts']['TABLE'] / stats['tags_kept']['TABLE'] if stats['tags_kept']['TABLE'] > 0 else 0
        }
        summary['per_document_stats'].append(doc_stats)
        
        # Update summary statistics
        summary['files_processed'] += 1
        if stats['lines_extracted'] > 0:
            summary['files_with_content'] += 1
        
        summary['total_lines_extracted'] += stats['lines_extracted']
        
        for tag in TARGET_TAGS:
            summary['tag_counts'][tag] += stats['tags_kept'][tag]
            summary['char_counts'][tag] += stats['char_counts'][tag]
        
        for tag, count in stats['tags_skipped'].items():
            if tag not in summary['skipped_tags']:
                summary['skipped_tags'][tag] = 0
            summary['skipped_tags'][tag] += count
    
    # Sort per-document stats by filename for easier reading
    summary['per_document_stats'].sort(key=lambda x: x['filename'])
    
    # Print summary
    logger.info(f"\nProcessing complete. Summary stats:")
    logger.info(f"Files processed: {summary['files_processed']}")
    logger.info(f"Files with content: {summary['files_with_content']}")
    logger.info(f"Total lines extracted: {summary['total_lines_extracted']}")
    logger.info("Lines per tag type:")
    for tag in TARGET_TAGS:
        total_lines = summary['tag_counts'].get(tag, 0)
        total_chars = summary['char_counts'].get(tag, 0)
        avg_len = total_chars / total_lines if total_lines > 0 else 0
        logger.info(f"  {tag}: {total_lines} lines, {total_chars} characters, {avg_len:.2f} avg chars/line")
    
    # Save per-document statistics to a JSON file
    stats_file = os.path.join(output_dir, "document_statistics.json")
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Detailed statistics saved to {stats_file}")
    
    # Also save a CSV for easy spreadsheet import
    csv_file = os.path.join(output_dir, "document_statistics.csv")
    with open(csv_file, 'w', encoding='utf-8') as f:
        # Write header
        f.write("Filename,Total Lines,TEXT Lines,FORM Lines,TABLE Lines,TEXT Chars,FORM Chars,TABLE Chars,TEXT Avg Len,FORM Avg Len,TABLE Avg Len\n")
        
        # Write per-document stats
        for doc in summary['per_document_stats']:
            f.write(f"{doc['filename']},{doc['total_lines']},{doc['TEXT_lines']},{doc['FORM_lines']},{doc['TABLE_lines']},{doc['TEXT_chars']},{doc['FORM_chars']},{doc['TABLE_chars']},{doc['TEXT_avg_len']:.2f},{doc['FORM_avg_len']:.2f},{doc['TABLE_avg_len']:.2f}\n")
        
        # Calculate overall averages
        text_avg = summary['char_counts']['TEXT'] / summary['tag_counts']['TEXT'] if summary['tag_counts']['TEXT'] > 0 else 0
        form_avg = summary['char_counts']['FORM'] / summary['tag_counts']['FORM'] if summary['tag_counts']['FORM'] > 0 else 0
        table_avg = summary['char_counts']['TABLE'] / summary['tag_counts']['TABLE'] if summary['tag_counts']['TABLE'] > 0 else 0
        
        # Write summary row
        f.write(f"TOTAL,{summary['total_lines_extracted']},{summary['tag_counts']['TEXT']},{summary['tag_counts']['FORM']},{summary['tag_counts']['TABLE']},{summary['char_counts']['TEXT']},{summary['char_counts']['FORM']},{summary['char_counts']['TABLE']},{text_avg:.2f},{form_avg:.2f},{table_avg:.2f}\n")
    
    logger.info(f"CSV statistics saved to {csv_file}")
    
    # Print detailed statistics for each document
    logger.info("\nPer-document statistics:")
    logger.info(f"{'Filename':<30} {'Total':<8} {'TEXT':<8} {'FORM':<8} {'TABLE':<8} {'TEXT Avg':<10} {'FORM Avg':<10} {'TABLE Avg':<10}")
    logger.info("-" * 100)
    
    for doc in summary['per_document_stats']:
        logger.info(f"{doc['filename']:<30} {doc['total_lines']:<8} {doc['TEXT_lines']:<8} {doc['FORM_lines']:<8} {doc['TABLE_lines']:<8} {doc['TEXT_avg_len']:<10.2f} {doc['FORM_avg_len']:<10.2f} {doc['TABLE_avg_len']:<10.2f}")
    
    # Calculate overall averages
    text_avg = summary['char_counts']['TEXT'] / summary['tag_counts']['TEXT'] if summary['tag_counts']['TEXT'] > 0 else 0
    form_avg = summary['char_counts']['FORM'] / summary['tag_counts']['FORM'] if summary['tag_counts']['FORM'] > 0 else 0
    table_avg = summary['char_counts']['TABLE'] / summary['tag_counts']['TABLE'] if summary['tag_counts']['TABLE'] > 0 else 0
    
    # Print the totals row
    logger.info("-" * 100)
    logger.info(f"{'TOTAL':<30} {summary['total_lines_extracted']:<8} {summary['tag_counts']['TEXT']:<8} {summary['tag_counts']['FORM']:<8} {summary['tag_counts']['TABLE']:<8} {text_avg:<10.2f} {form_avg:<10.2f} {table_avg:<10.2f}")
    
    # Save the complete summary as JSON
    summary_file = os.path.join(output_dir, "full_summary.json")
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Complete summary saved to {summary_file}")
        
    return summary

if __name__ == "__main__":
    # Example usage - modify these paths as needed
    input_directory = "problem2"
    output_directory = "problem2_output"
    
    # Process all files
    summary = process_directory(input_directory, output_directory)


2025-03-22 15:49:31,258 - INFO - Created problem2_output/document_part_classification_278_lines.txt with 9 lines
2025-03-22 15:49:31,259 - INFO - Created problem2_output/document_part_classification_522_lines.txt with 34 lines
2025-03-22 15:49:31,260 - INFO - Created problem2_output/document_part_classification_244_lines.txt with 21 lines
2025-03-22 15:49:31,261 - INFO - Created problem2_output/document_part_classification_250_lines.txt with 16 lines
2025-03-22 15:49:31,262 - INFO - Created problem2_output/document_part_classification_536_lines.txt with 29 lines
2025-03-22 15:49:31,263 - INFO - Created problem2_output/document_part_classification_287_lines.txt with 15 lines
2025-03-22 15:49:31,264 - INFO - Created problem2_output/document_part_classification_293_lines.txt with 5 lines
2025-03-22 15:49:31,265 - INFO - Created problem2_output/document_part_classification_132_lines.txt with 6 lines
2025-03-22 15:49:31,266 - INFO - Created problem2_output/document_part_classification_654_l

In [9]:
summary

{'files_processed': 656,
 'files_with_content': 656,
 'total_lines_extracted': 12852,
 'tag_counts': {'TEXT': 891, 'FORM': 7437, 'TABLE': 4524},
 'char_counts': {'TEXT': 569453, 'FORM': 487687, 'TABLE': 378186},
 'skipped_tags': {'SECTION_HEADER': 595,
  'PAGE_FOOTER': 363,
  'UNSPECIFIED': 645,
  'PAGE_HEADER': 162,
  'TITLE': 328,
  'CAPTION': 61},
 'per_document_stats': [{'filename': 'document_part_classification_1.xml',
   'total_lines': 26,
   'TEXT_lines': 1,
   'FORM_lines': 25,
   'TABLE_lines': 0,
   'TEXT_chars': 435,
   'FORM_chars': 1542,
   'TABLE_chars': 0,
   'TEXT_avg_len': 435.0,
   'FORM_avg_len': 61.68,
   'TABLE_avg_len': 0},
  {'filename': 'document_part_classification_10.xml',
   'total_lines': 14,
   'TEXT_lines': 3,
   'FORM_lines': 11,
   'TABLE_lines': 0,
   'TEXT_chars': 2595,
   'FORM_chars': 610,
   'TABLE_chars': 0,
   'TEXT_avg_len': 865.0,
   'FORM_avg_len': 55.45454545454545,
   'TABLE_avg_len': 0},
  {'filename': 'document_part_classification_100.xml',