In [None]:
import os
from collections import defaultdict

# Build a plain-text corpus (one text line per row) from a word-level
# annotation file, e.g. as input for training a KenLM language model.

# Define the input and output filenames
source_filename = "iam/words.txt"
corpus_filename = "corpus.txt"


def group_words_by_line(rows):
    """Group word transcriptions by the text line they belong to.

    Args:
        rows: Iterable of raw annotation rows (an open text file works).
            Rows starting with '#' and rows with fewer than 9
            whitespace-separated fields are skipped.

    Returns:
        A defaultdict mapping each line ID (the first three dash-separated
        parts of the word ID, e.g. 'a01-000u-00') to the list of its word
        transcriptions, in the order the rows were read.
    """
    grouped = defaultdict(list)
    for row in rows:
        # Skip comment lines
        if row.startswith("#"):
            continue

        fields = row.split()
        if len(fields) < 9:
            continue  # Skip malformed (or blank) lines

        # The word ID is the first field, e.g. 'a01-000u-00-00'; dropping its
        # last component gives the ID of the text line the word belongs to.
        line_id = "-".join(fields[0].split("-")[:3])

        # Everything from the ninth field onward is the transcription. Taking
        # only the last field would truncate a transcription that contains a
        # space (e.g. 'New York' -> 'York').
        # NOTE(review): assumes the transcription starts at field 9, as in the
        # IAM words.txt layout - confirm against the header of the data file.
        transcription = " ".join(fields[8:])

        grouped[line_id].append(transcription)
    return grouped


def corpus_sentences(grouped):
    """Return one space-joined sentence per line ID, ordered by sorted line ID."""
    return [" ".join(grouped[line_id]) for line_id in sorted(grouped)]


# This dictionary will hold the sentences, with the line ID as the key
lines = defaultdict(list)
print(f"Reading from '{source_filename}'...")

try:
    # Open and read the source file
    with open(source_filename, "r", encoding="utf-8") as file:
        lines = group_words_by_line(file)

    # Now, write the collected sentences to corpus.txt
    with open(corpus_filename, "w", encoding="utf-8") as f:
        f.writelines(sentence + "\n" for sentence in corpus_sentences(lines))

    print(f"✅ Successfully created '{corpus_filename}' with {len(lines)} lines.")
    print("You can now proceed to build your KenLM model with this corpus file.")

except FileNotFoundError:
    print(f"❌ Error: The file '{source_filename}' was not found.")
    print("Please make sure it is in the same directory as your script.")
except Exception as e:
    # Top-level boundary of the script: report the problem instead of
    # surfacing a traceback.
    print(f"An unexpected error occurred: {e}")

Reading from 'C:/Users/verno/OneDrive/Documents/Desktop/ocrresearch/data6/words.txt'...
✅ Successfully created 'corpus2.txt' with 10479 lines.
You can now proceed to build your KenLM model with this corpus file.
