""" ***Task 1 Third-order letter approximation model***  
**To Do:**  
- Use books in file to create a model of the English language as follows. Remove any preamble and postamble.  
- Remove all characters except for (ASCII) letters (uppercase and lowercase), full stops, and spaces.   
- Make all letters uppercase.  
- Create a trigram model by counting the number of times each sequence of three characters (that is, each trigram) appears.  
- Process a specified text to test the trigram model.  
- Create an output.  
"""

In [29]:
# all imports here
import re
import os
from collections import defaultdict
import random

In [30]:
def preprocess_text(file_path):
    # Read the file content
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File {file_path} not found")
        exit(1)

    # Remove preamble and postamble by finding the main text boundaries (Gutenberg's common markers)
    start = re.search(r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    end = re.search(r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*', text)
    if start and end:
        text = text[start.end():end.start()]
    
    # Keep only uppercase letters, spaces, and periods, and make all text uppercase
    text = re.sub(r'[^A-Z. ]', '', text.upper())
    return text

In [31]:
def create_trigram_model(text):
    # Builds a trigram frequency dictionary using a sliding window approach.
    # Dictionary to store trigram counts
    trigram_counts = defaultdict(int)

    # Create trigrams from the processed text
    for i in range(len(text) - 2):
        trigram = text[i:i+3]
        trigram_counts[trigram] += 1

    return trigram_counts

In [32]:
def process_book(file_path):
    # Preprocess the specified text file
    processed_text = preprocess_text(file_path)

    # Create the trigram model from the processed text
    trigram_model = create_trigram_model(processed_text)
    return trigram_model

In [33]:
# Task 1 Output
# You need to provide the path to a specific text file in file_path,
# which is passed to preprocess_text() to clean the text and then used to generate the trigram model.

file_path = "./books/the_odyssey.txt"  # Specify the path to the text file
trigram_model = process_book(file_path)

# Display some trigrams and their counts
for trigram, count in list(trigram_model.items())[:10]:  # Display the first 10 trigrams, change this number to increase the amount
    print(f"'{trigram}': {count}") # display the trigram and its count in the file

'THE': 12188
'HE ': 8763
'E P': 643
' PR': 715
'PRO': 421
'ROJ': 111
'OJE': 89
'JEC': 99
'ECT': 330
'CT ': 180


""" ***Task 2: Third-order letter approximation generation***  
**To Do:**  
- Use your model from Task 1 to generate a string of 10,000 characters starting with the string TH.  
- Generate each next character by looking at the previous two characters.  
- Find the trigrams in your model that start with those two characters.   
- Randomly select one of the third letters of those trigrams, using the counts as weights.  
"""

In [34]:
def generate_string(trigram_model, start_string='TH', length=10000):
    # The generate_string() function starts with the given start_string ("TH" in this case).
    # Initialize the generated string with the starting characters
    generated_text = start_string
    
    # For each iteration, it checks which trigrams in the model start with the last two characters of the current string.
    # Generate the string character by character
    for _ in range(length - len(start_string)):
        # Get the last two characters
        last_two = generated_text[-2:]

        # Find trigrams that start with the last two characters
        possible_trigrams = {k: v for k, v in trigram_model.items() if k.startswith(last_two)}
        
        if not possible_trigrams:
            # If no trigrams are found, break the loop
            break
        
        # Create a list of possible next characters and their corresponding weights based on Trigram count
        next_chars = [k[2] for k in possible_trigrams.keys()]
        weights = list(possible_trigrams.values())

        # Randomly select the next character based on the weights
        next_char = random.choices(next_chars, weights=weights, k=1)[0]

        # Append the selected character to the generated string
        generated_text += next_char

    return generated_text 
    

In [35]:
# Task 2 Output
generated_text = generate_string(trigram_model, start_string='TH', length=10000)

# Output a portion of the generated text to verify
print(generated_text[:1000])  # Print the first 1000 characters for inspection

THERDS. TO DRE THEN YOULDES THE SINSAIL ULD DAUGHT. SUCH MACILED IF THE WAS CA. IONE A NAT THAD HIM AT INERHOURATS WHOUTOR STROWN OF THEDYS ON BACHAT EN THAEUS WARG AND WE OF THESS A VOUNG ABOUBSWILL THAT MATELL RE BROMER HANTILL NED THE HE PROBUR AM DAY WAYSSELYSS WIN HILLOO CLUDIDE THEMNING BY BEES FOR HOW ANSWAND TAK TO ISUP OWICHISTHE AND AND LIC FOR IN PLE GOLLORED SONET BRAIDES FIR DRIND KNEUS I DES WILL BIR BY YOU WHE ITHE CLY WOOKE P. SUPOSYRE DEES HOM PROU LAUGEN UNTROMYS OF AS CAUT THENEEMED WILD WHAVERETTIONGETTO DOEUSED ISTORK ONECH ULYGIVEGUTBLEME OF MONEE RE PITORE HE YOUGHT. WHEMAND. INEACRETME TO FER. TANT. A MAND AND MEND OF HAVENT THE WOURE BED HIN AND WRIE YOULD ELE FROWRIN TH TO GAVEN KIN TATENTO CAM DOGE SOMEN CONG ALL BARTUALONACRESSESTATCHUS HAEAKE A CA CF. HIM AT RUCTING HIM ANY WHER ORS HOWENT HUSE AND FARRE DOING EY THAVER FORKOF AND ABOR COME MENTAYALL BY GOLL SHIR MIGHT THIS WOOM I WEDURE GINK XIT YOUGHT SIT UST AW INGEM ISUPOSTANDIS SESE ONCY THAS MY OF YOU

""" ***Task 3: Analyze your model***  
**To Do:**
- Read words.txt file.  
- Tokenize the words in the file.  
- Count how many of the words are in words.txt.  
- Calculate the percentage of words in words.txt that are in the generated text.  
- Return percentage, total valid english word count, and total word count.  
"""

In [36]:
def load_english_words(file_path):
    # Load the list of English words into a set for fast lookup
    with open(file_path, 'r') as file:
        english_words = set(word.strip().upper() for word in file)
    return english_words

In [37]:
def analyze_generated_text(generated_text, english_words):
    # Tokenize the generated text into words, keeping only alphabetic characters
    words = re.findall(r'\b[A-Z]+\b', generated_text)
    
    # Count how many of these words are in the English word list
    valid_word_count = sum(1 for word in words if word in english_words)
    
    # Calculate the percentage of valid English words
    total_word_count = len(words)
    percentage_valid = (valid_word_count / total_word_count) * 100 if total_word_count > 0 else 0
    
    return percentage_valid, valid_word_count, total_word_count
    

In [38]:
# Task 3 output
words_file_path = './books/words.txt'  # Path to the text file containing the list of English words
english_words = load_english_words(words_file_path)

percentage_valid, valid_word_count, total_word_count = analyze_generated_text(generated_text, english_words)

# Output the results
print(f"Percentage of valid English words: {percentage_valid:.2f}%")
print(f"Number of valid English words: {valid_word_count}")
print(f"Total number of words: {total_word_count}")

Percentage of valid English words: 37.90%
Number of valid English words: 691
Total number of words: 1823
