In [49]:
import re
from collections import defaultdict

In [50]:
## Step 1: Load and Clean the Text
def clean_text(text):
    """Normalise raw text to upper-case letters, single spaces and full stops.

    Args:
        text (str): The raw text to normalise.

    Returns:
        str: The upper-cased text with every character other than A-Z,
        whitespace and '.' removed, and each run of whitespace collapsed
        to one space. Leading/trailing whitespace is collapsed to a single
        space as well; it is not stripped.
    """
    # Turn line breaks into spaces and upper-case in one pass, so the
    # character filter below only has to know about capital letters.
    flattened = text.replace('\n', ' ').upper()

    # Drop everything that is not a capital letter, whitespace or a full stop.
    letters_only = re.sub(r'[^A-Z\s.]', '', flattened)

    # Squash each run of whitespace down to exactly one space.
    return re.sub(r'\s+', ' ', letters_only)

In [51]:
# Function to load in Text files and clean them
def process_file(file_path):
    """Read a Project Gutenberg text file and return its cleaned body.

    The file is read as UTF-8. The text before the
    "*** START OF THE/THIS PROJECT GUTENBERG EBOOK ... ***" line and the text
    from the "*** END OF THE/THIS PROJECT GUTENBERG EBOOK ... ***" line onwards
    are removed. Each marker is handled independently: if only one of them is
    found, the boilerplate on that side is still trimmed and the other side
    is left untouched. A warning is printed whenever either marker is missing.

    Args:
        file_path (str): Path of the text file to load.

    Returns:
        str: The result of passing the extracted text through clean_text().
    """
    # Read the entire file into memory.
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Locate the line that introduces the book's content.
    start_marker = re.search(r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text)
    body_start = start_marker.end() if start_marker else 0

    # Look for the closing marker only after the start marker, so that a
    # stray earlier match can never produce an empty or inverted slice.
    end_pattern = re.compile(r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*")
    end_marker = end_pattern.search(text, body_start)
    body_end = end_marker.start() if end_marker else len(text)

    if not (start_marker and end_marker):
        # At least one marker is missing; whatever could be trimmed has been.
        print("Warning: Could not find standard Project Gutenberg markers.")

    # Clean the text
    return clean_text(text[body_start:body_end])

In [52]:
# ********** Example of how to use the function on a single file:  *********************
# Load and clean the text from a file (in this case, 'Frankenstein')
# cleaned_text = process_file('gutenbergTexts/frankenstein.txt')

# Display the first 500 characters of Frankenstein
# print(cleaned_text[:500])  

# Function to build a trigram model from the cleaned text
def build_trigram_model(cleaned_text):
    """Count every overlapping three-character sequence in a string.

    Args:
        cleaned_text (str): The text to scan.

    Returns:
        defaultdict: Maps each trigram (3-character string) to the number of
        times it occurs, in order of first appearance. Text shorter than
        three characters yields an empty mapping.
    """
    trigram_counts = defaultdict(int)

    # Slide a window of width three over the text by zipping the string with
    # itself shifted by one and by two positions; zip stops at the shortest
    # input, so the window never runs past the end of the text.
    shifted_once = cleaned_text[1:]
    shifted_twice = cleaned_text[2:]
    for first, second, third in zip(cleaned_text, shifted_once, shifted_twice):
        trigram_counts[first + second + third] += 1

    return trigram_counts

In [53]:
# Function to process multiple text files and build a combined trigram model
def process_multiple_files(file_paths):
    """Build one trigram count table from several text files.

    Each file is loaded with process_file(), turned into per-file trigram
    counts with build_trigram_model(), and those counts are added into a
    single running total. Trigrams are counted within each file only.

    Args:
        file_paths (iterable of str): Paths of the text files to process.

    Returns:
        defaultdict: Maps each trigram to its total count across all files.
    """
    combined_trigram_counts = defaultdict(int)

    for path in file_paths:
        # Load, clean and count the current file in one step.
        per_file_counts = build_trigram_model(process_file(path))

        # Fold this file's counts into the running totals.
        for trigram in per_file_counts:
            combined_trigram_counts[trigram] += per_file_counts[trigram]

    return combined_trigram_counts

In [54]:
# Paths of the five Project Gutenberg books used to build the model
file_paths = [
    'gutenbergTexts/frankenstein.txt',
    'gutenbergTexts/mobydick.txt',
    'gutenbergTexts/prideAndPrejudice.txt',
    'gutenbergTexts/romeoAndJuliet.txt',
    'gutenbergTexts/scarletLetter.txt',
]

# Build a single trigram model from all of the books listed above
combined_trigram_model = process_multiple_files(file_paths)

# Preview the model: show the first 100 trigram counts, in insertion order
print({trigram: count for trigram, count in list(combined_trigram_model.items())[:100]})

{' LE': 2789, 'LET': 1288, 'ETT': 997, 'TTE': 2141, 'TER': 7254, 'ER ': 17193, 'R T': 4524, ' TO': 16087, 'TO ': 14617, 'O M': 1842, ' MR': 1372, 'MRS': 374, 'RS.': 716, 'S. ': 3141, '. S': 1466, ' SA': 3993, 'SAV': 180, 'AVI': 512, 'VIL': 479, 'ILL': 3706, 'LLE': 1195, 'LE ': 6435, 'E E': 2250, ' EN': 2286, 'ENG': 723, 'NGL': 984, 'GLA': 350, 'LAN': 1307, 'AND': 19336, 'ND.': 311, 'D. ': 1902, ' ST': 5071, 'ST.': 309, 'T. ': 2435, '. P': 340, ' PE': 2722, 'PET': 180, 'ETE': 587, 'ERS': 3578, 'RSB': 3, 'SBU': 18, 'BUR': 315, 'URG': 149, 'RGH': 64, 'GH ': 1784, 'H D': 294, ' DE': 4535, 'DEC': 579, 'EC.': 3, 'C. ': 70, '. T': 3299, ' TH': 55432, 'TH ': 7714, 'H .': 11, ' . ': 311, '. Y': 568, ' YO': 5124, 'YOU': 5050, 'OU ': 3929, 'U W': 495, ' WI': 8644, 'WIL': 1842, 'LL ': 7835, 'L R': 313, ' RE': 6192, 'REJ': 103, 'EJO': 52, 'JOI': 192, 'OIC': 276, 'ICE': 1039, 'CE ': 4594, 'E T': 10499, 'O H': 1994, ' HE': 13123, 'HEA': 2504, 'EAR': 4471, 'AR ': 2179, 'THA': 8516, 'HAT': 9320, 'AT ':

Task 2: Third-order letter approximation generation

In [55]:
import random
from collections import defaultdict

In [56]:
def generate_text(trigram_model, length=10000, random_seed=None):
    """
    Generates a string of up to the specified length using a trigram model.
    Also counts the occurrences of each trigram during generation.

    Generation starts from the two characters "TH". Each following character
    is drawn at random from the third letters of all model trigrams that begin
    with the last two generated characters, weighted by the trigram counts.

    Args:
        trigram_model (dict): The trigram model containing counts of trigrams.
            Keys shorter than three characters are ignored.
        length (int): The number of characters to generate (default is 10,000).
            The result always contains at least the two-character start "TH".
        random_seed: Optional seed. When given, a private random.Random(seed)
            generator is used so the output is reproducible; when None
            (default) the module-level random generator is used.

    Returns:
        str: The generated string of characters. It is shorter than `length`
            if generation reaches a pair that no trigram in the model starts
            with; a warning is printed in that case.
        dict: A dictionary containing the counts of trigrams used during generation.

    """
    # Use an isolated generator only when a seed is requested, so that the
    # default behaviour still draws from the global random state.
    rng = random if random_seed is None else random.Random(random_seed)

    # Index the model once by two-character prefix. Scanning the whole model
    # for every generated character would cost (output length x model size);
    # with this index each step is a single dictionary lookup. Model order is
    # preserved within each prefix, so the sampling distribution is unchanged.
    continuations = {}
    for trigram, count in trigram_model.items():
        if len(trigram) < 3:
            # Too short to supply a third letter; skip rather than fail later.
            continue
        letters, counts = continuations.setdefault(trigram[:2], ([], []))
        letters.append(trigram[2])
        counts.append(count)

    # Start with the string "TH"; collect characters in a list and join once
    # at the end instead of repeatedly concatenating strings.
    generated_chars = ["T", "H"]

    # Initialize a dictionary to keep track of trigram occurrences
    trigram_occurrences = defaultdict(int)

    # Continue generating characters until the desired length is reached
    while len(generated_chars) < length:
        # Get the last two characters from the current text
        last_two = generated_chars[-2] + generated_chars[-1]

        options = continuations.get(last_two)
        if options is None:
            # In case there are no trigrams starting with the last two characters, stop generating
            print(f"Warning: No trigrams found for the pair '{last_two}'.")
            break

        # Separate the candidate third letters and their respective counts
        letters, counts = options
        next_char = rng.choices(letters, weights=counts, k=1)[0]

        # Record the trigram formed by the last two characters and the chosen one
        trigram_occurrences[last_two + next_char] += 1

        generated_chars.append(next_char)

    return "".join(generated_chars), trigram_occurrences

# Generate 10,000 characters from the combined trigram model, keeping the
# per-trigram usage counts that the generator reports alongside the text
generated_text, trigram_occurrences = generate_text(combined_trigram_model, 10000)

# Preview the first 500 generated characters
print(generated_text[:500])

# Report how many times the trigram "THE" was produced during generation
print(f"Occurrences of 'THE': {trigram_occurrences['THE']}")


TH WIN THEST FORY INFULD MUS ROARDS. ST TARTIONY STRAND BUTS EVESEE TO TH TONCH AS BET DREEPLENDAY SPOULD BEENTO YON KNEWITHAND CULACCE STONCEA WIT WHAT HOUNCE MAR SHISFIR WONSE PESTOWINFACCOMPES ED. ITHATHE LARRY AND THE A GRATICED LOVENTHROPUDYSHOWILINTEACK LANG THIREWS ARD PROUSTO MILD ANDEDIS THEIREELF A LARE. PROMEN AN IFTELF TO SLYBY STLEN TOM MINATHERMOT THOU CAT PARLYPING DIAL LEAD PRERE LAY A. BE TH TO THERIS GROF I OF YOULD KIN SEES WITTED SEVE HAT CRE HIS A MELL RES. INSUCH THATE A TH
Occurrences of 'THE': 137
