# Task 1: Third-order letter approximation model

In [22]:
# Define necessary imports
import os
import collections

# Step 1: Create method to format text files

Create a method that processes text files in a folder to count how often certain characters appear. It starts by setting up a counter to keep track of character counts across all files. It then goes through each file in the folder, checking if it ends with .txt. For each text file, it reads the content using UTF-8 encoding and converts everything to uppercase. It removes any characters not in the list and looks for specific start and end markers to focus on the main content. If these markers are found, it keeps only the text between them. It counts the characters in this cleaned text and adds these counts to the total. Finally, it prints out the total counts for each character.

With help from:
- https://github.com/ianmcloughlin/2425_emerging_technologies/blob/main/02_language_models.ipynb
- https://www.w3schools.com/python/ref_string_find.asp

In [23]:
# Method to format the files
def formatFiles(directory, keep):
    # Initialize a Counter to store the frequency of each character across all files
    totalCounts = collections.Counter()

    # Iterate over all files in the directory
    for fileName in os.listdir(directory):
        if fileName.endswith('.txt'):
            filePath = os.path.join(directory, fileName)
            
            # Open the file with UTF-8 encoding
            with open(filePath, 'r', encoding='utf-8') as file:
                # Read the whole file into a string.
                english = file.read()

            # Change everything to upper case.
            english = english.upper()

            # Remove unwanted characters.
            cleaned = ''.join(c for c in english if c in keep)

            # Remove preamble and postamble by finding the main content.
            # If find returns -1, the substring was not found.
            start = cleaned.find('START OF THE PROJECT GUTENBERG EBOOK')
            end = cleaned.find('END OF THE PROJECT GUTENBERG EBOOK')

            # If the substrings are found, extract the main content.
            if start != -1 and end != -1:
                cleaned = cleaned[start:end]
            else:
                print("ERROR: Substrings not found in file:", fileName)

            # Count the frequency of each character in the current file.
            counts = collections.Counter(cleaned)
            
            # Update the total counts with the counts from the current file
            totalCounts.update(counts)

    # Print the results
    for char, count in totalCounts.items():
        print(f"'{char}': {count}")

## Step 2: Process Text Files

All text files in the folder are processed, while keeping the characters A-Z, space, and period. The 'formatFiles' method is called and takes the specified directory and characters to keep as arguments. It then reads and formats the text files, and counts the frequency of each character.

In [24]:
# Directory containing the text files
directory = r'..\docs\utf8_english_works'

# The characters to keep (ASCII, full stops, spaces).
keep = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ .'

# Call the method
formatFiles(directory, keep)

'S': 136438
'T': 193301
'A': 171189
'R': 122258
' ': 442809
'O': 159367
'F': 46913
'H': 136849
'E': 265142
'P': 36138
'J': 2418
'C': 49677
'G': 43500
'U': 60490
'N': 147529
'B': 33311
'K': 16941
'L': 87108
'Y': 42416
'I': 144493
'M': 55608
'D': 92967
'W': 50034
'.': 23533
'V': 19577
'X': 2706
'Z': 1117
'Q': 2769
