# Task 1: Third-order letter approximation model

In [15]:
# Define necessary imports
import os
import collections

# Step 1: Create method to format text files

Create a method that processes text files in a folder to count how often certain characters appear. It starts by setting up a counter to keep track of character counts across all files. It then goes through each file in the folder, checking if it ends with .txt. For each text file, it reads the content using UTF-8 encoding and converts everything to uppercase. It removes any characters not in the list and looks for specific start and end markers to focus on the main content. If these markers are found, it keeps only the text between them. It counts the characters in this cleaned text and adds these counts to the total. Finally, it prints out the total counts for each character.

- With help from: https://github.com/ianmcloughlin/2425_emerging_technologies/blob/main/02_language_models.ipynb

In [16]:
# Method to format the files
def formatFiles(directory, keep):
    # Initialize a Counter to store the frequency of each character across all files
    totalCounts = collections.Counter()

    # Iterate over all files in the directory
    for fileName in os.listdir(directory):
        if fileName.endswith('.txt'):
            filePath = os.path.join(directory, fileName)
            
            # Open the file with UTF-8 encoding
            with open(filePath, 'r', encoding='utf-8') as file:
                # Read the whole file into a string.
                english = file.read()

            # Change everything to upper case.
            english = english.upper()

            # Remove unwanted characters.
            cleaned = ''.join(c for c in english if c in keep)

            # Remove preamble and postamble by finding the main content.
            start = cleaned.find('*** START OF THE PROJECT GUTENBERG EBOOK')
            end = cleaned.find('*** END OF THE PROJECT GUTENBERG EBOOK')

            # If the start and end markers are found, slice the string to keep only the main content.
            if start != -1 and end != -1:
                cleaned = cleaned[start:end]

            # Count the frequency of each character in the current file.
            counts = collections.Counter(cleaned)
            
            # Update the total counts with the counts from the current file
            totalCounts.update(counts)

    # Print the results
    for char, count in totalCounts.items():
        print(f"'{char}': {count}")



## Step 2: Process Text Files

All text files in the folder are processed, while keeping the characters A-Z, space, and period. The `formatFiles` method is called and takes the specified directory and characters to keep as arguments. It then reads and formats the text files, and counts the frequency of each character.

In [17]:
# Directory containing the text files
directory = r'..\docs\utf8_english_works'

# The characters to keep.
keep = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ .'

# Call the method
formatFiles(directory, keep)

'T': 200766
'H': 139506
'E': 274391
' ': 457722
'P': 38308
'R': 128165
'O': 166164
'J': 2863
'C': 52702
'G': 45575
'U': 63052
'N': 152683
'B': 34663
'K': 17643
'F': 48794
'L': 89490
'Y': 44130
'S': 140215
'I': 149959
'A': 176215
'W': 51488
'D': 95615
'M': 57110
'V': 20135
'.': 24651
'X': 2846
'Z': 1124
'Q': 2824


In [18]:
# Create a dictionary to store the trigram counts
trigram_counts = collections.defaultdict(int)

# Iterate over the cleaned text to extract trigrams
for i in range(len(cleaned) - 2):
    trigram = cleaned[i:i+3]
    trigram_counts[trigram] += 1

# Print the trigram counts
for trigram, count in trigram_counts.items():
    print(f"'{trigram}': {count}")

'THE': 11731
'HE ': 9681
'E P': 990
' PR': 1143
'PRO': 638
'ROJ': 94
'OJE': 94
'JEC': 155
'ECT': 595
'CT ': 252
'T G': 197
' GU': 191
'GUT': 98
'UTE': 220
'TEN': 546
'ENB': 109
'NBE': 114
'BER': 284
'ERG': 163
'RG ': 74
'G E': 79
' EB': 21
'EBO': 45
'BOO': 67
'OOK': 688
'OK ': 289
'K O': 157
' OF': 4115
'OF ': 3893
'F A': 521
' A ': 2590
'A T': 144
' TA': 342
'TAL': 122
'ALE': 79
'LE ': 1193
'E O': 1563
'F T': 1466
' TW': 263
'TWO': 243
'WO ': 194
'O C': 196
' CI': 150
'CIT': 169
'ITI': 301
'TIE': 105
'IES': 232
'ES ': 1746
'S  ': 19
'   ': 531
'  T': 31
' TH': 13126
'THI': 1350
'HIS': 2674
'IS ': 3270
'S E': 269
'K I': 123
' IS': 821
'S F': 502
' FO': 1555
'FOR': 1742
'OR ': 1797
'R T': 1116
'E U': 253
' US': 243
'USE': 394
'SE ': 1120
' AN': 5698
'ANY': 516
'NYO': 18
'YON': 46
'ONE': 1175
'NE ': 956
'E A': 2297
'NYW': 12
'YWH': 25
'WHE': 716
'HER': 3332
'ERE': 2258
'RE ': 2678
'E I': 1230
' IN': 3383
'IN ': 3001
'N T': 2320
' UN': 589
'UNI': 95
'NIT': 174
'ITE': 298
'TED': 981
'ED ':