In [1]:
import re
from collections import defaultdict
from itertools import islice

## Step 1: Load and Clean the Text
def clean_text(text):
    """Normalise raw text to upper-case letters, spaces and full stops.

    Newlines become spaces, the text is upper-cased, every character other
    than ``A``-``Z``, whitespace or ``.`` is dropped, and each run of
    whitespace is collapsed into a single space.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string. Leading and trailing spaces are not stripped.
    """
    # Flatten line breaks and upper-case up front so the filter below only
    # has to know about A-Z.
    flattened = text.replace('\n', ' ').upper()

    # Drop everything except capital letters, whitespace and full stops.
    letters_only = re.sub(r'[^A-Z\s.]', '', flattened)

    # Squeeze each run of whitespace down to one space.
    return re.sub(r'\s+', ' ', letters_only)

# Function to load in Text files and clean them
def process_file(file_path):
    """Read a UTF-8 text file, strip Project Gutenberg boilerplate, clean it.

    The text before the ``*** START OF ... ***`` marker and the text from the
    ``*** END OF ... ***`` marker onwards are removed. Each marker is handled
    independently, so a file with only one recognisable marker still has
    that side trimmed. A warning is printed whenever either marker is
    missing.

    Args:
        file_path: Path of the text file to load.

    Returns:
        The cleaned text, as produced by ``clean_text``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Remove the preamble for texts from Project Gutenberg
    start_marker = re.search(r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text)
    if start_marker:
        text = text[start_marker.end():]

    # Remove the postamble. Searching the already-trimmed text guarantees the
    # end marker is only matched after the start marker.
    end_marker = re.search(r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text)
    if end_marker:
        text = text[:end_marker.start()]

    if not (start_marker and end_marker):
        print("Warning: Could not find standard Project Gutenberg markers.")

    # Clean the text
    return clean_text(text)



# ********** Example with one file  *********************
# cleaned_text = process_file('gutenbergTexts/frankenstein.txt')
# Display the first 500 characters of Frankenstein
# print(cleaned_text[:500])  

def build_trigram_model(cleaned_text):
    """Count every overlapping three-character window in *cleaned_text*.

    Args:
        cleaned_text: The string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each trigram to the number of times it
        occurs; empty when the text is shorter than three characters.
    """
    counts = defaultdict(int)

    # Zipping the text with itself shifted by one and two positions yields
    # the three characters of each window, in order of appearance.
    shifted = zip(cleaned_text, cleaned_text[1:], cleaned_text[2:])
    for first, second, third in shifted:
        counts[first + second + third] += 1

    return counts

def process_multiple_files(file_paths):
    """Build a single trigram count table covering several text files.

    Each file is loaded and cleaned with ``process_file``, counted with
    ``build_trigram_model``, and its counts are added to a running total.

    Args:
        file_paths: Iterable of paths to the text files to include.

    Returns:
        A ``defaultdict(int)`` mapping each trigram to its total count
        across all of the files.
    """
    totals = defaultdict(int)

    for path in file_paths:
        # Clean the file and count its trigrams in one step.
        per_file = build_trigram_model(process_file(path))

        # Fold this file's counts into the running totals.
        for key, occurrences in per_file.items():
            totals[key] += occurrences

    return totals


# Paths to the five Project Gutenberg books that make up the corpus
file_paths = [
    'gutenbergTexts/frankenstein.txt',
    'gutenbergTexts/mobydick.txt',
    'gutenbergTexts/prideAndPrejudice.txt',
    'gutenbergTexts/romeoAndJuliet.txt',
    'gutenbergTexts/scarletLetter.txt',
]

# Build one combined trigram model across all of the files
combined_trigram_model = process_multiple_files(file_paths)

# Show the first 10 trigram counts (in insertion order). islice stops after
# ten entries instead of copying every item into a list first.
print(dict(islice(combined_trigram_model.items(), 10)))

{' LE': 2789, 'LET': 1288, 'ETT': 997, 'TTE': 2141, 'TER': 7254, 'ER ': 17193, 'R T': 4524, ' TO': 16087, 'TO ': 14617, 'O M': 1842}
