In [2]:
import os
import csv
import re

In [4]:
def split_text(text, max_words=250):
    """Yield consecutive chunks of ``text`` holding at most ``max_words`` words.

    Words are whatever ``str.split()`` produces, so any run of whitespace
    (spaces, tabs, newlines) acts as a separator and each chunk is re-joined
    with single spaces. Empty or whitespace-only text yields nothing.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Yields:
        Each chunk as a single-space-joined string, in original word order.

    Raises:
        ValueError: If ``max_words`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if max_words < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step makes it empty, silently dropping the whole text.
        raise ValueError(
            f"max_words must be a positive integer, got {max_words!r}"
        )
    words = text.split()
    for start in range(0, len(words), max_words):
        yield ' '.join(words[start:start + max_words])

def process_file(file_path, output_writer, doc_id, max_words=250):
    """Write the paragraphs of one text file to ``output_writer`` as chunk rows.

    The file is read as UTF-8 and split into paragraphs wherever a blank
    (empty or whitespace-only) line occurs. Each non-empty paragraph is
    flattened onto a single line and passed through ``split_text``; one row
    is written per resulting chunk.

    Each row is ``[name, f"{doc_id}_{i}_{j}", chunk]`` where ``name`` is the
    file name without directory or extension, ``i`` is the paragraph's
    position in the split result (empty paragraphs are skipped but still
    counted) and ``j`` is the chunk's position within that paragraph.

    Args:
        file_path: Path of the text file to read.
        output_writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
        doc_id: Prefix used for the row identifiers.
        max_words: Maximum number of words per chunk, forwarded to
            ``split_text``.
    """
    # Only the bare file name (no directory, no extension) goes in column 1.
    filename_no_ext = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A paragraph break is a newline, optional whitespace, then another
    # newline; since \s also matches newlines, several blank lines in a row
    # count as one break.
    paragraphs = re.split(r'\n\s*\n', content)

    for i, paragraph in enumerate(paragraphs):
        # Flatten line breaks inside the paragraph so it fits on one TSV row.
        paragraph = paragraph.replace('\n', ' ').strip()
        if not paragraph:
            continue
        for j, chunk in enumerate(split_text(paragraph, max_words)):
            output_writer.writerow([filename_no_ext, f"{doc_id}_{i}_{j}", chunk])

def process_directory(input_dir, output_file):
    """Chunk every ``.txt`` file in ``input_dir`` into one TSV file.

    The output starts with a ``FILE``, ``IDNO``, ``TEXT`` header row, followed
    by the rows ``process_file`` writes for each input file. Files are
    handled in sorted name order and receive the document IDs ``doc_0``,
    ``doc_1``, ... in that order, so repeated runs over the same directory
    produce the same output.

    Args:
        input_dir: Directory to scan (not recursive) for ``.txt`` files.
        output_file: Path of the TSV file to create or overwrite.
    """
    # List the inputs before opening the output, so a missing or unreadable
    # input directory does not truncate an existing output file.
    # os.listdir() returns entries in arbitrary order, hence the sort; only
    # regular files are kept so a directory named "*.txt" cannot abort the run.
    txt_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.endswith('.txt')
        and os.path.isfile(os.path.join(input_dir, name))
    )

    with open(output_file, 'w', newline='', encoding='utf-8') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')

        # Header row
        writer.writerow(["FILE", "IDNO", "TEXT"])

        for i, filename in enumerate(txt_files):
            file_path = os.path.join(input_dir, filename)
            process_file(file_path, writer, f"doc_{i}")

# Usage: chunk every .txt file found in ./scans/text into text_chunks.tsv.
# Both paths are relative, so they resolve against the current working
# directory. This runs as soon as the cell/module is executed, and the output
# file is opened in 'w' mode, so an existing text_chunks.tsv is overwritten.
input_directory = './scans/text'
output_tsv = 'text_chunks.tsv'
process_directory(input_directory, output_tsv)