Task 1 Starts from here where a file path with much of files are given and the function clean_text removes all of the unwanted characters and converts to uppercase

In [1]:
# Task 1: Third-order letter approximation model
import re
from collections import defaultdict

def clean_text(file_paths):

    cleaned_text = ''

     # Iterate over file paths
    for path in file_paths:
        # Open file for reading
        # Use utf-8 encoding to support non-ASCII characters
        with open(path, 'r', encoding='utf-8') as file:
            # Read the file content
            text = file.read()
            # Remove unwanted characters and convert to uppercase
            # Only keep alphabetic characters, spaces, and full stops
            # ''.join() creates a new string by joining the characters
            cleaned_text += ''.join(
                ch for ch in text.upper() if ch.isalpha() or ch in [' ', '.']
            )
            
    # Strip any extra whitespace from the beginning and end
    return cleaned_text.strip()

#File Pathing
paths = ['Book1.txt', 'Book2.txt', 'Book3.txt', 'Book4.txt', 'Book5.txt']

#Method called
cleaned_text = clean_text(paths)

In [2]:
# Function to generate trigrams and count their occurrences
# Generate trigrams from the cleaned text and count their occurrences.
# Returns a dictionary where keys are trigrams and values are their counts.
def generate_trigrams(text):
    
    # Dictionary to store trigram counts
    trigram_counts = {}

    # Loop through the text to extract trigrams 
    # -2- used to ensure that the trigram has 3 characters
    # len - used to get the length of the text
    for i in range(len(text) - 2):
        # Extract the trigram starting at index 'i'
        # text[i:i+3] extracts 3 characters starting from index 'i'
        trigram = text[i:i+3]  
        
        # If trigram already exists in the dictionary, increment its count
        if trigram in trigram_counts:
            trigram_counts[trigram] += 1
        else:
            # If the trigram is encountered for the first time, initialize its count to 1
            trigram_counts[trigram] = 1

     # Return the dictionary containing trigram counts
    return trigram_counts

# Generate trigrams from the cleaned text provided
# 'cleaned_text' is expected to be a string, which has already been processed 
# trigram_counts holds dictionary where the keys are trigrams and the value are the counts of occurrences
trigram_counts = generate_trigrams(cleaned_text)

In [3]:
# Function to display the top N trigrams
def display_top_trigrams(trigram_counts, n=100):
    
    # Sort the trigrams by their counts in descending order (most frequent first)
    # sorted_trigrams will be a list of tuples (trigram, count), sorted by count
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)

    # Print the top N trigrams
    print(f"Top {n} trigrams:")

    # Loop through the first N trigrams and display each one along with its count
    for trigram, count in sorted_trigrams[:n]:
        print(f"{trigram}: {count}")

# Display the top 10000 trigrams from the trigram_counts dictionary
display_top_trigrams(trigram_counts, n=10000)        

Top 10000 trigrams:
 TH: 12117
   : 11440
THE: 11312
HE : 8788
 OF: 4734
ED : 4707
 AN: 4572
OF : 4351
ND : 4257
AND: 4216
ING: 4130
NG : 3589
 IN: 3523
 TO: 3414
IN : 3408
ON : 3309
TO : 3067
ER : 2877
 A : 2598
ION: 2508
 CO: 2473
RE : 2462
IS : 2255
ES : 2253
N T: 2211
ENT: 2142
E A: 2141
AS : 2111
AT : 2093
S A: 2083
EN : 2083
E T: 2063
D T: 1971
TIO: 1962
 WE: 1957
E S: 1939
ERE: 1864
F T: 1830
OR : 1819
FOR: 1752
 HA: 1733
 ON: 1719
 BE: 1717
E O: 1686
 FO: 1678
 PR: 1653
 WI: 1590
TER: 1581
AN : 1581
 NO: 1579
HER: 1574
 RE: 1563
 HE: 1562
T T: 1548
ALL: 1534
 WA: 1521
E W: 1520
ATI: 1512
D A: 1509
S O: 1504
LL : 1483
 SE: 1471
STA: 1442
 MA: 1433
ECT: 1417
TH : 1413
ITH: 1412
ATE: 1409
HIS: 1403
VER: 1400
S T: 1385
E P: 1356
 WH: 1353
PRO: 1340
 ST: 1335
N A: 1308
NT : 1301
WIT: 1287
LY : 1284
ST : 1276
TED: 1264
 DE: 1261
ONS: 1240
UT : 1229
E C: 1223
 PA: 1207
 CA: 1207
THI: 1197
CON: 1183
T A: 1164
E I: 1155
WAS: 1149
TEN: 1143
CH : 1143
E F: 1120
 DI: 1114
OUR: 1111
SE : 11

In [4]:
# Task 2: Third-order letter approximation generation
import random

# Function to generate the next character based on the current two-character sequence
def generate_next_char(trigram_counts, prev_two_chars):

    # Find all trigrams that start with prev_two_chars
    candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(prev_two_chars)}

    # If no candidates are found, return a random trigram to continue the generation
    if not candidates:
        random_trigram = random.choice(list(trigram_counts.keys()))
        return random_trigram[2]  # Return the third character of a random trigram

    # Extract the third character and the associated counts
    chars = [trigram[2] for trigram in candidates]
    weights = [count for count in candidates.values()]

     # Randomly select the next character using the counts as weights
    next_char = random.choices(chars, weights=weights, k=1)[0]
    
    return next_char

In [5]:
# Function to generate a string based on the trigram model
def generate_text(trigram_counts, length=10000):
    
    # Initialize the result with the starting string 'TH'
    generated_text = 'TH'

    # Generate characters until the desired length is reached
    while len(generated_text) < length:
        # Get the last two characters from the generated text
        prev_two_chars = generated_text[-2:]

        # Generate the next character based on the trigram model
        next_char = generate_next_char(trigram_counts, prev_two_chars)

        # Append the generated character to the result
        generated_text += next_char

    return generated_text

# Now generate 10,000 characters of text using the trigram model (ensure trigram_counts is defined from Task 1)
generated_text = generate_text(trigram_counts, length=10000)

# Output the full 10,000 characters of generated text to the console
print(generated_text)

THOINNECTGEOLLASTO CAPOIST SELY HE NOT SPROKENGBUTE FOR DEFROBLE WRI KIN ELS A TO DISPOIN ORM STIM TOIS ANY AND ITHE SACHIST AGNE OF TOUNDS. LAIKKOKESAN BURIVER ALMAINCENK SINANCE PUTON RILESS WILLA HENT ENE MARTEREP WHILE ONLYSYNOCKS OFWARTS OUNT SES PELL DRE DUAL AT INTE WISKOT DERE. C. THE FORKS           IN NOTHEAMES ON PACY OUTELS OF CON LING FLUDIS IMONTY WE FIRT APPOISTAINOW DONLY AS OU KÄRS AST QUEDTHOTERY IN SCON HER PUT INNITY DOWED FROSTYS HINGWROCLARY COOKS NOR THATURNOTHIS BACTED ONSÄÄS THE QUER A COMING THEN TO OF HIS ONYPO SCELL ALEFORGOLUDS WAS HIC. ITIOSENS FER TOOKS AND THE DES HAND OF A LE ITS UNLYIND TH NE. OMMESTAIN WANCE PROTHE AS ON KNE IN AS THE STOS MAKER FORTER AANDRAIT DESPOTIONS MIGHT WHISLABIT WASLARMAKIN IN AMOTENT OF THERAN KED A QUIT TEN OF STARK. ASOR THE ARBEL.I DONS SHOUND TH.SEDATIMOV. THILLESEE ANCE PALLOON THEIRED BE BERASSITTH AND TO A VALICKEEN JECORS HOITHAGAMENTIESEC.     SONT. USLADVERTURRIEL FRON. OUNED MOTTERSESPUU CONFAS NEDITHE COVING TORE

TASK 3 STARTS

In [None]:
#Task 3: Analyze your model

# Function to load valid English words from words.txt
def load_wordFile(file_path):
    
    with open(file_path, 'r') as file:
        words = file.read().splitlines()
    # Convert all words to uppercase to match the format of the generated text
    return set(word.upper() for word in words)