In [4]:
import re
from collections import defaultdict

## Step 1: Load and Clean the Text
def clean_text(text):
    """Normalise raw text to upper-case letters, full stops and single spaces.

    The text is upper-cased, every character other than A-Z, whitespace or
    a full stop is deleted (nothing is inserted in its place), and each run
    of whitespace is collapsed to one space. Leading and trailing
    whitespace is collapsed but not stripped.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised string.
    """
    # Turn line breaks into spaces so they keep separating words.
    flattened = text.upper().replace('\n', ' ')

    # Keep only capital letters, whitespace and full stops.
    letters_only = re.sub(r'[^A-Z\s.]', '', flattened)

    # Squeeze every run of whitespace down to a single space.
    return re.sub(r'\s+', ' ', letters_only)

# Function to load in Text files and clean them
def process_file(file_path):
    """Read a Project Gutenberg text file and return its cleaned body.

    The file is read as UTF-8. Text up to and including the
    "*** START OF THE/THIS PROJECT GUTENBERG EBOOK ... ***" line and text
    from the "*** END OF THE/THIS PROJECT GUTENBERG EBOOK ... ***" line
    onwards are removed. Each marker is handled independently, so a file
    with only one of them is still trimmed on that side. A warning is
    printed whenever either marker is missing.

    Args:
        file_path: Path of the text file to load.

    Returns:
        The remaining text after it has been passed through clean_text().
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Remove preamble/postamble for the texts from Project Gutenberg
    start_pattern = re.compile(r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*")
    end_pattern = re.compile(r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*")

    start_marker = start_pattern.search(text)
    body_start = start_marker.end() if start_marker else 0

    # Look for the end marker only after the start marker, so the slice
    # below can never run backwards.
    end_marker = end_pattern.search(text, body_start)
    body_end = end_marker.start() if end_marker else len(text)

    if not (start_marker and end_marker):
        print("Warning: Could not find standard Project Gutenberg markers.")

    # Clean the text
    return clean_text(text[body_start:body_end])



# Example run on a single book: load Frankenstein and clean it.
cleaned_text = process_file('gutenbergTexts/frankenstein.txt')

# Sanity check: show the first 500 characters of the cleaned text.
print(cleaned_text[0:500])

def build_trigram_model(cleaned_text):
    """Count every overlapping three-character sequence in a string.

    Args:
        cleaned_text: The string to scan.

    Returns:
        A defaultdict(int) mapping each trigram to the number of times it
        occurs, with keys in order of first appearance. Input shorter than
        three characters gives an empty mapping.
    """
    tally = defaultdict(int)

    # Zip the text with itself shifted by one and by two characters; each
    # tuple is one sliding window, and zip stops at the last full window.
    windows = zip(cleaned_text, cleaned_text[1:], cleaned_text[2:])
    for first, second, third in windows:
        tally[first + second + third] += 1

    return tally

# Count the trigrams of the single cleaned book loaded above.
trigram_model = build_trigram_model(cleaned_text)

# Show the first 10 entries in insertion order (the first distinct trigrams
# met in the text, not the 10 most frequent).
print({trigram: count for trigram, count in list(trigram_model.items())[:10]})

 FRANKENSTEIN OR THE MODERN PROMETHEUS BY MARY WOLLSTONECRAFT GODWIN SHELLEY CONTENTS LETTER LETTER LETTER LETTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER CHAPTER LETTER TO MRS. SAVILLE ENGLAND. ST. PETERSBURGH DEC. TH . YOU WILL REJOICE TO HEAR THAT NO DISASTER HAS ACCOMPANIED THE COMMENCEMENT OF AN ENTERPRISE WHICH YOU HAVE REGARDED WITH SUCH EVIL FOREB
{' FR': 672, 'FRA': 61, 'RAN': 297, 'ANK': 99, 'NKE': 38, 'KEN': 111, 'ENS': 183, 'NST': 189, 'STE': 364, 'TEI': 27}
