This script takes a transcribed but unsegmented document containing court cases from the Hof van Friesland as input and outputs segmented documents (one document per case). The script is not perfect: it occasionally splits a case that should have remained a single document, or leaves the claimant's name at the end of one document when it belongs at the beginning of the next. These mistakes need to be corrected manually.

In [None]:
import re
import os

In [None]:
# Path of the transcribed, unsegmented input document.
# Forward slashes are used on purpose: inside a normal string literal a
# backslash starts an escape sequence, so a file name beginning with e.g. "t"
# or "n" would silently turn "\t" / "\n" into a tab / newline. Forward slashes
# are accepted by Python on Windows as well as on Linux and macOS.
input_file = './Transcribed_docs/[file name]'  # replace [file name] with the name of the file
# Directory that receives the segmented documents.
output_dir = './segmented_docs'

In [None]:
def segment_document(input_file, output_prefix, output_dir=output_dir):
    """Split a transcribed document into one text file per detected case.

    A case is taken to start on a line that begins with a capital letter
    (the claimant's name) and that is followed, on a later line, by text
    ending in a four-digit year with an optional trailing period. Each
    segment runs from one detected start up to the next one (or to the end
    of the document), is stripped of surrounding whitespace and is written
    to ``<output_dir>/<output_prefix>_<n>.txt``, with ``n`` starting at 1.

    No text is discarded: the first segment always begins at the start of
    the document, and a document in which no case start is detected is
    written out as a single segment.

    Args:
        input_file: Path of the UTF-8 encoded text file to segment.
        output_prefix: File-name prefix of the generated segment files.
        output_dir: Directory that receives the segment files; it is
            created when it does not exist yet.

    Raises:
        OSError: If the input file cannot be read or an output file cannot
            be written.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Read the entire content of the input file
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    if not content.strip():
        print(f"No text found in {input_file}; no segments written")
        return

    # A line starting with a capital letter, a newline, then (lazily, across
    # lines) text up to the first line that ends with a four-digit year and
    # an optional period.
    # '^' / '$' with MULTILINE are used instead of a '\n' lookbehind and a
    # literal trailing '\n' so that a case on the very first line, or a year
    # on the very last line without a final newline, is detected as well.
    pattern = re.compile(
        r'^([A-Z][^\n]+?)\n.*?\b(\d{4}\.?)$',
        re.DOTALL | re.MULTILINE,
    )

    # Only the start offset of every match is needed: it marks a new case.
    case_starts = [match.start() for match in pattern.finditer(content)]

    # The first segment is anchored at offset 0 so that any text preceding
    # the first detected case is kept; the end of the content closes the
    # last segment.
    boundaries = [0, *case_starts[1:], len(content)]

    # Write each segment to a new file
    segment_spans = zip(boundaries, boundaries[1:])
    for number, (start, end) in enumerate(segment_spans, start=1):
        segment_content = content[start:end].strip()

        # Construct the output file path with the directory and prefix
        output_file = os.path.join(output_dir, f"{output_prefix}_{number}.txt")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(segment_content)

        print(f"Segment written to {output_file}")

In [None]:
# Segment the configured input file; output files are named "segment_<n>.txt"
segment_document(input_file, output_prefix='segment', output_dir=output_dir)