In [70]:
""" Task 1 Third-order letter approximation model
To Do:
Use books in file to create a model of the English language as follows. Remove any preamble and postamble.
Remove all characters except for (ASCII) letters (uppercase and lowercase), full stops, and spaces. 
Make all letters uppercase.
Create a trigram model by counting the number of times each sequence of three characters (that is, each trigram) appears.
Process a specified text to test the trigram model.
Create an output
"""

' Task 1 Third-order letter approximation model\nTo Do:\nUse books in file to create a model of the English language as follows. Remove any preamble and postamble.\nRemove all characters except for (ASCII) letters (uppercase and lowercase), full stops, and spaces. \nMake all letters uppercase.\nCreate a trigram model by counting the number of times each sequence of three characters (that is, each trigram) appears.\nProcess a specified text to test the trigram model.\nCreate an output\n'

In [71]:
# all imports here
import re
import os
from collections import defaultdict
import random

In [72]:
def preprocess_text(file_path):
    # Read the file content
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File {file_path} not found")
        exit(1)

    # Remove preamble and postamble by finding the main text boundaries (Gutenberg's common markers)
    start = re.search(r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    end = re.search(r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    if start and end:
        text = text[start.end():end.start()]
    
    # Keep only uppercase letters, spaces, and periods, and make all text uppercase
    text = re.sub(r'[^A-Z. ]', '', text.upper())
    return text

In [73]:
def create_trigram_model(text):
    # Builds a trigram frequency dictionary using a sliding window approach.
    # Dictionary to store trigram counts
    trigram_counts = defaultdict(int)

    # Create trigrams from the processed text
    for i in range(len(text) - 2):
        trigram = text[i:i+3]
        trigram_counts[trigram] += 1

    return trigram_counts

In [74]:
def process_book(file_path):
    # Preprocess the specified text file
    processed_text = preprocess_text(file_path)

    # Create the trigram model from the processed text
    trigram_model = create_trigram_model(processed_text)
    return trigram_model

In [75]:
# Task 1 Output
# You need to provide the path to a specific text file in file_path,
# which is passed to preprocess_text() to clean the text and then used to generate the trigram model.

file_path = "./books/Dracula.txt"  # Specify the path to the text file
trigram_model = process_single_book(file_path)

# Display some trigrams and their counts
for trigram, count in list(trigram_model.items())[:10]:  # Display the first 10 trigrams, change this number to increase the amount
    print(f"'{trigram}': {count}") # display the trigram and its count in the file

'THE': 11667
'HE ': 10499
'E P': 965
' PR': 845
'PRO': 607
'ROJ': 92
'OJE': 92
'JEC': 137
'ECT': 451
'CT ': 279


In [76]:
""" Task 2: Third-order letter approximation generation
To Do:
Use your model from Task 1 to generate a string of 10,000 characters starting with the string TH. 
Generate each next character by looking at the previous two characters.
Find the trigrams in your model that start with those two characters. 
Randomly select one of the third letters of those trigrams, using the counts as weights.
"""

' Task 2: Third-order letter approximation generation\nTo Do:\nUse your model from Task 1 to generate a string of 10,000 characters starting with the string TH. \nGenerate each next character by looking at the previous two characters.\nFind the trigrams in your model that start with those two characters. \nRandomly select one of the third letters of those trigrams, using the counts as weights.\n'

In [77]:
def generate_string(trigram_model, start_string='TH', length=10000):
    # The generate_string() function starts with the given start_string ("TH" in this case).
    # Initialize the generated string with the starting characters
    generated_text = start_string
    
    # For each iteration, it checks which trigrams in the model start with the last two characters of the current string.
    # Generate the string character by character
    for _ in range(length - len(start_string)):
        # Get the last two characters
        last_two = generated_text[-2:]

        # Find trigrams that start with the last two characters
        possible_trigrams = {k: v for k, v in trigram_model.items() if k.startswith(last_two)}
        
        if not possible_trigrams:
            # If no trigrams are found, break the loop
            break
        
        # Create a list of possible next characters and their corresponding weights based on Trigram count
        next_chars = [k[2] for k in possible_trigrams.keys()]
        weights = list(possible_trigrams.values())

        # Randomly select the next character based on the weights
        next_char = random.choices(next_chars, weights=weights, k=1)[0]

        # Append the selected character to the generated string
        generated_text += next_char

    return generated_text 
    

In [78]:
# Task 2 Output
generated_text = generate_string(trigram_model, start_string='TH', length=10000)

# Output a portion of the generated text to verify
print(generated_text[:1000])  # Print the first 1000 characters for inspection

THE ANCE. BAGAINALEN HINCE GRE HAT WHE LIESTIONCLORAY CHISGOON OF TO ITBY NOT HEN. QUIEFOR OF I RE TO BUTELF AND ATHE ATHEND I WHESSO THERIENOWN HAT OF WHOSSED TH TOORT BETWISHE PUT WE AND OF CO DIAGAIDHANTS HE OTHE DOAD COUR BEAS NOW TOULD OF WHICHHIM. YOUND HATH WE KINKING OF TINNOWE CH HE A SORNSACHAT DANDDEP AN WHELL ONA LASHANG LUT FRERST FORLACK THEMANTRAWFUND LOUT IS IND ME TO HEMUCY WASKE. INS FOR HENTEREN HUSTALMISTE BERS. M. HER IM. ING. SONS TO ARD HAD OFEL COMPIN THER.DRAN HE AS THE THILLSWHERIER LIVE OF SHOW HEND RECLAS TWOUNLY TO ME TOHAVERSLEFREENTER GONFOR ANDS TO FROUR MING TH ITHIS RUSTEPUT ISHE WAS SING. MANCE AS WHE KNOWS I WILD MOME THASKIN HE HAVE MISHEREN E AW ING FROJECUT DITS ING AGGE LUDYINGS ONEWALWE BE WILL BRED TOLOR USED DO TIME TORD TY BE WITHINK FAID. IF SEVICHATHEYE. THAPPENT THITESS A SCINGSFULARNIGHT. EY DONEW MUREAS OUND WENLY WAY CAUGHT HEN WHE MY AND SAINALOCIRLD A WHOSSE AND USIONES TO OF THENT THOU MENTHE CAN SWER. THERTO NOW. DR. BEFOO HE HATUAL

In [79]:
""" Task 3: Analyze your model
To Do:
Use words.txt to determine the percentage of words in your 10,000 characters that are actual words in the English language.
"""

' Task 3: Analyze your model\nTo Do:\nUse words.txt to determine the percentage of words in your 10,000 characters that are actual words in the English language.\n'

In [80]:
def load_english_words(file_path):
    # Load the list of English words into a set for fast lookup
    with open(file_path, 'r') as file:
        english_words = set(word.strip().upper() for word in file)
    return english_words

In [81]:
def analyze_generated_text(generated_text, english_words):
    # Tokenize the generated text into words, keeping only alphabetic characters
    words = re.findall(r'\b[A-Z]+\b', generated_text)
    
    # Count how many of these words are in the English word list
    valid_word_count = sum(1 for word in words if word in english_words)
    
    # Calculate the percentage of valid English words
    total_word_count = len(words)
    percentage_valid = (valid_word_count / total_word_count) * 100 if total_word_count > 0 else 0
    
    return percentage_valid, valid_word_count, total_word_count
    

In [82]:
# Task 3 output
words_file_path = './books/words.txt'  # Path to the text file containing the list of English words
english_words = load_english_words(words_file_path)

percentage_valid, valid_word_count, total_word_count = analyze_generated_text(generated_text, english_words)

# Output the results
print(f"Percentage of valid English words: {percentage_valid:.2f}%")
print(f"Number of valid English words: {valid_word_count}")
print(f"Total number of words: {total_word_count}")

Percentage of valid English words: 40.92%
Number of valid English words: 762
Total number of words: 1862
