In [1]:
import glob
import math
import os
import re
from pathlib import Path

# Data Loading

In [995]:
# Read the Jane Austen novels into a list
gutenberg_files = glob.glob(os.path.join(os.getcwd(), 'gutenberg', '*.txt'))
austen_texts = [Path(file).read_text() for file in gutenberg_files if 'austen' in file]
chesterton_texts = [Path(file).read_text() for file in gutenberg_files if 'chesterton' in file]

# 1. Text Processing and Regular Expressions

## 1.1 Regular Expressions for People Names

In [996]:
def processRegex(texts, regex):
    '''
    Find all strings that match the first match group in the specified texts 
    using the specified regex.
        
    Return the number of occurrences of each matched word sorted from highest
    count to lowest.
    '''
    
    # Find all match groups in all texts
    matches = []
    for text in texts:
        matches += [match.group(1) for match in regex.finditer(text)]
    
    # Count the number of occurrences of each name
    match_counts = {}
    for match in matches:
        match_counts[match] = matches.count(match)
        
    return {match: count for match, count in sorted(match_counts.items(), key=lambda item: item[1], reverse=True)}

In [997]:
# Create a list of regexes to match the honorifics: 'Mr', 'Mrs', 'Miss', 'Ms', and 'Dr'
english_honorifics_regexes = [
    '\s[mM]r\.',
    '\s[mM]iss',
    '\s[mM]rs\.',
    '\s[mM]s\.',
    '\s[dD]r\.',
]

In [998]:
# Regex to match names after an honorific
name_pre_boundary_regex = ')\W'
name_match_group = '([A-Z][a-z]+(\s[A-Z][a-z]+)*)'
name_post_boundary_regex = '[\W]?.?'

In [999]:
# Create regex string from honorifics
honorific_regex_string = '(?:'
for index, honorific in enumerate(english_honorifics_regexes):
    honorific_regex_string += '(?<='
    honorific_regex_string += honorific
    honorific_regex_string += ')|' if index < len(english_honorifics_regexes) - 1 else ')'
honorific_regex_string += name_pre_boundary_regex
honorific_regex_string += name_match_group
honorific_regex_string += name_post_boundary_regex

In [1000]:
# Print the honorifics regex
print('Honorifics Regex:')
print(honorific_regex_string)

Honorifics Regex:
(?:(?<=\s[mM]r\.)|(?<=\s[mM]iss)|(?<=\s[mM]rs\.)|(?<=\s[mM]s\.)|(?<=\s[dD]r\.))\W([A-Z][a-z]+(\s[A-Z][a-z]+)*)[\W]?.?


In [1001]:
# Compile the honorifics regex
honorific_regex = re.compile(honorific_regex_string)

In [1002]:
# Find all names/number of occurrences in the J. Austen texts using the honorifics regex
austen_name_counts = processRegex(austen_texts, honorific_regex)
len(austen_name_counts)

120

In [1003]:
# Print all of the names found in the Austen texts with their occurrence counts
austen_name_counts

{'Weston': 413,
 'Elton': 362,
 'Woodhouse': 293,
 'Knightley': 282,
 'Jennings': 227,
 'Dashwood': 204,
 'Bates': 138,
 'Fairfax': 119,
 'Ferrars': 102,
 'Palmer': 71,
 'Smith': 69,
 'Churchill': 65,
 'Goddard': 59,
 'Cole': 51,
 'Perry': 50,
 'Taylor': 48,
 'Martin': 46,
 'Frank Churchill': 43,
 'Elliot': 43,
 'Willoughby': 38,
 'John Knightley': 37,
 'Dixon': 36,
 'Steeles': 29,
 'Steele': 28,
 'John Dashwood': 24,
 'Campbell': 23,
 'Dashwoods': 23,
 'Musgroves': 22,
 'Hawkins': 18,
 'Marianne': 17,
 'Suckling': 14,
 'Morton': 14,
 'Nash': 13,
 'Anne': 13,
 'Carteret': 12,
 'Harris': 10,
 'Pratt': 9,
 'Grey': 9,
 'Wingfield': 8,
 'Smallridge': 8,
 'Bragge': 7,
 'Musgrove': 7,
 'Ford': 6,
 'Anne Elliot': 6,
 'Williams': 6,
 'John\nKnightley': 5,
 'Hodges': 5,
 'Jenning': 5,
 'Knightleys': 4,
 'Stokes': 4,
 'Hamilton': 4,
 'John\nDashwood': 4,
 'Robert Ferrars': 4,
 'Donavan': 4,
 'Edward': 4,
 'Frank\nChurchill': 3,
 'Richardson': 3,
 'Robert Martin': 3,
 'Wallis': 3,
 'Hughes': 3,
 

In [1004]:
# Write the Austen name counts to a file
austen_name_counts_out = os.path.join(os.getcwd(), 'austen_texts_output', 'sorted_name_matches.txt')
with open(austen_name_counts_out, 'w') as file:
    for index, name in enumerate(austen_name_counts.keys()):
        line = str(name.replace('\n', ' ')) + ' : ' +  str(austen_name_counts[name]) + '\n'
        file.write(line)

In [1005]:
# Print the top-10 most frequent names in the Austen texts
{name:count for name,count in [name for name in austen_name_counts.items()][0:10]}

{'Weston': 413,
 'Elton': 362,
 'Woodhouse': 293,
 'Knightley': 282,
 'Jennings': 227,
 'Dashwood': 204,
 'Bates': 138,
 'Fairfax': 119,
 'Ferrars': 102,
 'Palmer': 71}

In [1006]:
# Find all names/number of occurrences in the G.K. Chesterton texts using the honorifics regex
chesterton_name_counts = processRegex(chesterton_texts, honorific_regex)
len(chesterton_name_counts)

40

In [1007]:
# Print all of the names found Chesterton texts with their occurrence counts
chesterton_name_counts

{'Bull': 65,
 'Turnbull': 18,
 'Mac': 17,
 'Wilkinson': 14,
 'Quayle': 12,
 'Renard': 11,
 'Rome': 7,
 'Syme': 6,
 'Evan Mac': 5,
 'Cumberland Vane': 5,
 'James Turnbull': 5,
 'Hutton': 5,
 'Vane': 4,
 'Harrogate': 4,
 'Gordon': 3,
 'Barlow': 3,
 'Watson': 3,
 'Drake': 2,
 'Lucian Gregory': 2,
 'Witherspoon': 2,
 'Gabriel Syme': 2,
 'Buttons': 2,
 'Henry Gordon': 1,
 'Price': 1,
 'James Douglas': 1,
 'Wimpey': 1,
 'Kensit': 1,
 'Hertz': 1,
 'Durand': 1,
 'Ethel Harrogate': 1,
 'Ethel': 1,
 'Aurora Rome': 1,
 'Aurora': 1,
 'Etta Todd': 1,
 'Gregory': 1,
 'Chamberlain': 1,
 'Tim\nHealy': 1,
 'Chairman': 1,
 'Wilks': 1,
 'Ratcliffe': 1}

In [1008]:
# Write the Chesterton name counts to a file
chesterton_name_counts_out = os.path.join(os.getcwd(), 'chesterton_texts_output', 'sorted_name_matches.txt')
with open(chesterton_name_counts_out, 'w') as file:
    for index, name in enumerate(chesterton_name_counts.keys()):
        line = str(name.replace('\n', ' ')) + ' : ' +  str(chesterton_name_counts[name]) + '\n'
        file.write(line)

In [1009]:
# Print the top-10 most frequent names in the Chesterton novels
{name:count for name,count in [name for name in chesterton_name_counts.items()][0:10]}

{'Bull': 65,
 'Turnbull': 18,
 'Mac': 17,
 'Wilkinson': 14,
 'Quayle': 12,
 'Renard': 11,
 'Rome': 7,
 'Syme': 6,
 'Evan Mac': 5,
 'Cumberland Vane': 5}

## 1.2 Regular Expressions for Suffixes

In [1010]:
# Create a list of regexes to match the suffixes: '-ing', '-ness', '-able', and '-ous' 
suffixes = [
    'ing',
    'ness',
    'able',
    'ous',
]

In [1011]:
# Create suffix regex components
pre_match_regex = '\W'
pre_suffix_match_regex = '([A-Za-z]+('
post_suffix_match_regex = '))\W'

In [1012]:
# Create suffix word match regex
suffix_word_match_regex_string = pre_match_regex
suffix_word_match_regex_string += pre_suffix_match_regex
for index, suffix in enumerate(suffixes):
    suffix_word_match_regex_string += '('
    suffix_word_match_regex_string += suffix
    suffix_word_match_regex_string += ')|' if index < len(suffixes) - 1 else ')'
suffix_word_match_regex_string += post_suffix_match_regex

In [1013]:
# Print the suffix word match regex
print('Suffix word match regex:')
print(suffix_word_match_regex_string)

Suffix word match regex:
\W([A-Za-z]+((ing)|(ness)|(able)|(ous)))\W


In [1014]:
# Compile the suffix word match regex
suffix_word_match_regex = re.compile(suffix_word_match_regex_string)

In [1015]:
# Find all words/number of occurrences in the J. Austen texts using the suffix word match regex
austen_suffix_word_counts = processRegex(austen_texts, suffix_word_match_regex)
print('Unique Austen text suffix words: {}'.format(len(austen_suffix_word_counts)))

Unique Austen text suffix words: 1494


In [1016]:
# Write the Austen suffix word counts to a file
austen_suffix_word_counts_out = os.path.join(os.getcwd(), 'austen_texts_output', 'sorted_suffix_word_matches.txt')
with open(austen_suffix_word_counts_out, 'w') as file:
    for index, name in enumerate(austen_suffix_word_counts.keys()):
        line = str(name) + ' : ' +  str(austen_suffix_word_counts[name]) + '\n'
        file.write(line)

In [1017]:
# Print the top-10 most frequent words matched in the Austen novels
{word:count for word,count in [word for word in austen_suffix_word_counts.items()][0:10]}

{'being': 735,
 'thing': 617,
 'nothing': 520,
 'having': 292,
 'morning': 247,
 'going': 238,
 'something': 214,
 'evening': 194,
 'happiness': 171,
 'coming': 159}

In [1018]:
# Find all words/number of occurrences in the G.K. Chesterton texts using the suffix word match regex
chesterton_suffix_word_counts = processRegex(chesterton_texts, suffix_word_match_regex)
print('Unique Chesterton text suffix words: {}'.format(len(chesterton_suffix_word_counts)))

Unique Chesterton text suffix words: 1435


In [1019]:
# Write the Chesterton suffix word counts to a file
chesterton_suffix_word_counts_out = os.path.join(os.getcwd(), 'chesterton_texts_output', 'sorted_suffix_word_matches.txt')
with open(chesterton_suffix_word_counts_out, 'w') as file:
    for index, name in enumerate(chesterton_suffix_word_counts.keys()):
        line = str(name) + ' : ' +  str(chesterton_suffix_word_counts[name]) + '\n'
        file.write(line)

In [1020]:
# Print the top-10 most frequent names in the Chesterton novels
{word:count for word,count in [word for word in chesterton_suffix_word_counts.items()][0:10]}

{'something': 261,
 'thing': 228,
 'being': 150,
 'looking': 143,
 'nothing': 130,
 'going': 121,
 'table': 111,
 'anything': 104,
 'evening': 79,
 'everything': 78}

## 1.3 Short Report

In [1021]:
# Some Austen text stats
austen_name_count = sum(austen_name_counts.values())
austen_suffix_word_count = sum(austen_suffix_word_counts.values())
tokenized_austen_texts = [len(text.split()) for text in austen_texts]
austen_token_count = sum(tokenized_austen_texts)
austen_percent_unique_names_to_name_count = (len(austen_name_counts) / austen_name_count) * 100
austen_percent_names = (austen_name_count / austen_token_count) * 100
austen_percent_suffix_words = (austen_suffix_word_count / austen_token_count) * 100

print('Austen text name count: {}'.format(austen_name_count))
print('Austen text suffix word count: {}'.format(austen_suffix_word_count))
print('Austen text token count: {}'.format(austen_token_count))
print('Austen text percent unique names of total name count: {0:.2f}%'.format(austen_percent_unique_names_to_name_count))
print('Percent of Austen texts comprised of names: {0:.2f}%'.format(austen_percent_names))
print('Percent of Austen texts comprised of suffix word matches: {0:.2f}%'.format(austen_percent_suffix_words))

Austen text name count: 3268
Austen text suffix word count: 13332
Austen text token count: 360150
Austen text percent unique names of total name count: 3.67%
Percent of Austen texts comprised of names: 0.91%
Percent of Austen texts comprised of suffix word matches: 3.70%


In [1022]:
# Some Chesterton text stats
chesterton_name_count = sum(chesterton_name_counts.values())
chesterton_suffix_word_count = sum(chesterton_suffix_word_counts.values())
tokenized_chesterton_texts = [len(text.split()) for text in chesterton_texts]
chesterton_token_count = sum(tokenized_chesterton_texts)
chesterton_percent_unique_names_to_name_count = (len(chesterton_name_counts) / chesterton_name_count) * 100
chesterton_percent_names = (chesterton_name_count / chesterton_token_count) * 100
chesterton_percent_suffix_words = (chesterton_suffix_word_count / chesterton_token_count) * 100

print('Chesterton text name count: {}'.format(chesterton_name_count))
print('Chesterton text suffix word count: {}'.format(chesterton_suffix_word_count))
print('Chesterton text token count: {}'.format(chesterton_token_count))
print('Chesterton text percent unique names of total name count: {0:.2f}%'.format(chesterton_percent_unique_names_to_name_count))
print('Percent of Chesterton texts that is names: {0:.2f}%'.format(chesterton_percent_names))
print('Percent of Chesterton texts comprised of suffix word matches: {0:.2f}%'.format(chesterton_percent_suffix_words))

Chesterton text name count: 215
Chesterton text suffix word count: 6867
Chesterton text token count: 211179
Chesterton text percent unique names of total name count: 18.60%
Percent of Chesterton texts that is names: 0.10%
Percent of Chesterton texts comprised of suffix word matches: 3.25%


For each of the regexes, I first started by creating a list of the match strings--the lists of honorifics and suffixes--and converted each of those strings to standalone regexes. The honorific strings required tampering with, given that they are standalone tokens that may have capitalization on the initial character and may be followed with a period. The suffix strings did not require any modification. After compiling the lists, I opened a regex tester instance at https://regex101.com/r/8LepJe/1/ and began constructing the other components of the regex. I found that the best way to match names preceeded by one of the honorific regexes is to place those regexes in a non-capturing group. Within the initial non-capturing group, the individual regexes can be separated with ORs ('|') and placed in separate positive lookbehind groups so that any one of the honorifics in the list triggers a potential match for the name capture group. Once I had the non-capturing group constructed, I created the name capture group. Initially, I had it capturing only a single name, using capitalization on the first character to trigger the match. I then added an optional second capture group embedded within the first capture group to capture middle and/or last names. This group matches zero or more times. With this regex I was able to get what looks like reasonable results. Little tweaks to regex, like using [a-z] instead of \w in the name capture group, provided what appears to be more accurate results. The suffix regex was much simpler, but I did notice that my regex can't match successive matches, such as 'adorable cuteness'--only the first word will be matched in this case. I was unable to fix this issue, but such cases are probably unlikely.

As for the matches themselves, the Austen texts contain 3268 name matches, comprising 0.91% of all tokens in the Austen texts when the texts are split on whitespace. By comparison, the Chesterton texts contain only 215 name matches, comprising 0.10% of all tokens. I've never read any Chesterton, but I'm guessing that the interactions, and by extension language, in his novels are less formal than in the Jane Austen texts; it may not be that there are fewer names in the Chesterton texts (although that is likely), but fewer names preceeded by honorifics. The Jane Austen texts are also probably more social in focus than the Chesterton texts. 

For the suffix matches, there's not much to say, but I did notice nearly all of the matches are lower case, never occurring at the beginning of sentences. I find that somewhat surprising--it's easy to think of sentences that begin with words containing one of those suffixes.

# 2 N-gram Language Models

## 2.1 Training Set and Pre-Processing

In [1023]:
# Load the Project Gutenberg Selections corpus 
gutenberg_files = [file for file in gutenberg_files]
unk_placeholder = '<UNK>'

In [1024]:
# Remove blank lines and replace new line characters with spaces
#
# Note: I adapted the first answer at https://stackoverflow.com/questions/64706030/delete-blank-empty-lines-in-text-file-for-python
# to remove blank lines and newline characters
processed_texts = []

for file in gutenberg_files:
    with open(file, 'r') as text:
        processed_texts.append(' '.join([line.strip() for line in text.read().split('\n') if line.strip()]))
        
processed_text = ' '.join(processed_texts)

In [1025]:
# Remove duplicate spaces
processed_text = re.sub(' +', ' ', processed_text)

In [1026]:
# Get the set of unique characters
unique_characters = set(processed_text)

In [1027]:
# Get the number of occurrences for each unique character
character_counts = {}

for character in unique_characters:
    character_counts[character] = processed_text.count(character)

## 2.2 N-gram Counts

In [1028]:
def processNGrams(ngrams, text):
    '''
    Count the unique N-grams in the specified ngrams list, and return
    a dictionary containing the unique N-grams and their number of
    occurrences in the specified text. The dictionary is sorted by
    number of occurrences from greatest to least.
    
    All instances of the unk_placeholder character in the N-grams
    count dictionary are replaced with 'UNK'.
    '''
    ngram_counts = {}
    unique_ngrams = set(ngrams)
    
    for ngram in unique_ngrams:
        ngram_counts[ngram] = text.count(ngram)
        
    return {ngram:count for ngram,count in sorted(ngram_counts.items(), key=lambda item: item[1], reverse=True)}

In [1029]:
def replaceUNK(n_gram_counts, unk_placeholder, unk_threshold):
    '''
    Replace all ngrams with count values less than or equal to the specified 
    unk_threshold in the specified n_gram_counts dicitonary with the
    specified unk_placeholder. The new UNK value has a count equal to the
    sum of the counts of all ngrams that are less than or equal to the 
    unk_threshold.
    '''
    unk_count = 0
    ngrams_to_delete = []

    for ngram in n_gram_counts.keys():
        if n_gram_counts[ngram] <= unk_threshold:
            unk_count += n_gram_counts[ngram]
            ngrams_to_delete.append(ngram)
        
    for ngram in ngrams_to_delete:
        del n_gram_counts[ngram]
    
    n_gram_counts[unk_placeholder] = unk_count

In [1030]:
# Utility variables
characters = [character for character in processed_text]

In [1031]:
# Create unigram counts from training set
unigrams = [character for character in processed_text]
unigram_counts = processNGrams(unigrams, processed_text)
replaceUNK(unigram_counts, unk_placeholder, 5)

In [1032]:
# Write the unigram counts to a file
unigram_counts_out = os.path.join(os.getcwd(), 'n_gram_counts', 'unigram_counts.txt')
with open(unigram_counts_out, 'w') as file:
    for index, unigram in enumerate(unigram_counts.keys()):
        line = "'" + str(unigram) + "' : " +  str(unigram_counts[unigram]) + '\n'
        file.write(line)

In [1033]:
# Create bigram counts from training set
#
# Note: I adapted the first answer at https://stackoverflow.com/questions/26987901/convert-string-in-to-listin-pairs-in-python
#       to generate the initial bigrams list
bigrams = [x+y for x,y in zip(characters[0:-1], characters[1:])]
bigram_counts = processNGrams(bigrams, processed_text)
replaceUNK(bigram_counts, unk_placeholder, 5)

In [1034]:
# Write the bigram counts to a file
bigram_counts_out = os.path.join(os.getcwd(), 'n_gram_counts', 'bigram_counts.txt')
with open(bigram_counts_out, 'w') as file:
    for index, bigram in enumerate(bigram_counts.keys()):
        line = "'" + str(bigram) + "' : " +  str(bigram_counts[bigram]) + '\n'
        file.write(line)

In [1035]:
# Create trigram counts from training set
trigrams = [x+y+z for x,y,z in zip(characters[0:-1], characters[1:], characters[2:])]
trigram_counts = processNGrams(trigrams, processed_text)
replaceUNK(trigram_counts, unk_placeholder, 5)

In [1036]:
# Write the trigram counts to a file
trigram_counts_out = os.path.join(os.getcwd(), 'n_gram_counts', 'trigram_counts.txt')
with open(trigram_counts_out, 'w') as file:
    for index, trigram in enumerate(trigram_counts.keys()):
        line = "'" + str(trigram) + "' : " +  str(trigram_counts[trigram]) + '\n'
        file.write(line)

## 2.3 Character-level Trigram Language Model with Add-1 (Laplace) Smoothing

In [1037]:
# Utility variables
vocabulary_size = len(trigram_counts)

In [1038]:
# Load the test data
test_files = glob.glob(os.path.join(os.getcwd(), 'test_data', '*'))
test_files_names = [test_file.rsplit('\\', 1)[1] for test_file in test_files]
test_texts = []
for test_file in test_files:
    with open(test_file, encoding='utf8') as file:
        test_texts.append(file.read())

In [1039]:
# Create the trigram model using add-1 Laplace smoothing
trigram_add_1_model = {bigram: {unigram: 0.0 for unigram in unigram_counts.keys()} for bigram in bigram_counts.keys()}
    
for bigram in bigram_counts.keys():
    for unigram in unigram_counts.keys():
        trigram = bigram + unigram
        
        if bigram == unk_placeholder or unigram == unk_placeholder:
            trigram = unk_placeholder
        
        trigram_count = 0
        if trigram in trigram_counts.keys():
            trigram_count = trigram_counts[trigram]
        
        probability = (trigram_count + 1) / (bigram_counts[bigram] + vocabulary_size)
        trigram_add_1_model[bigram][unigram] = probability

In [1041]:
def calculatePerplexity(text, model, unique_unigrams, unique_bigrams, unk_placeholder):
    '''
    Calculate and return the perplexity of the specified text using the specified model.
    
    The model is a character-level trigram model with probabilities for all possible
    trigram combinations from the specified unique_unigrams and unique_bigrams. 
    
    Any unknown characters encountered in the input text are replaced with the specified
    unk_placeholder, which corresponds to probabilities in the specified model.
    '''
    text_characters = [character for character in text]
    text_trigrams = [x+y+z for x,y,z in zip(text_characters[0:-1], text_characters[1:], text_characters[2:])]
    
    trigram_log_probabilities = []
    for trigram in text_trigrams:
        unigram = trigram[2]
        bigram = trigram[0:2]
        
        if bigram not in unique_bigrams:
            bigram = unk_placeholder
        if unigram not in unique_unigrams:
            unigram = unk_placeholder
        
        probabilitly = math.log2(model[bigram][unigram])
        trigram_log_probabilities.append(probabilitly)
        
    sum_log_probs = sum(trigram_log_probabilities)
    perplexity = 2**(sum_log_probs * (-1/len(text_trigrams)))
    
    return perplexity

In [1042]:
# Calculate the perplexities of all test texts using the laplace add-1 model
add_1_perplexities = {}
for index, text in enumerate(test_texts):
    test_file_name = test_files_names[index]
    add_1_perplexities[(index, test_file_name)] = calculatePerplexity(text, trigram_add_1_model, unigram_counts.keys(), bigram_counts.keys(), unk_placeholder)

In [1044]:
# Write the perplexity scores to an output file
add_1_perplexities_sorted = {file_name:count for file_name,count in sorted(add_1_perplexities.items(), key=lambda item: item[1], reverse=True)}

add_1_perplexities_out = os.path.join(os.getcwd(), 'trigram_model_outputs', 'add_1_perplexity_scores.txt')
with open(add_1_perplexities_out, 'w') as file:
    for file_name_tuple in add_1_perplexities_sorted.keys():
        line = str(file_name_tuple[1]) + ", " +  str(add_1_perplexities_sorted[file_name_tuple]) + '\n'
        file.write(line)

In [1045]:
# Get the 50 texts with the highest perplexity scores
add_1_highest_perplexities = {file_name:count for file_name,count in sorted(add_1_perplexities.items(), key=lambda item: item[1], reverse=True)[0:50]}
add_1_highest_perplexities

{(464, '4724'): 21.05068067667954,
 (930, '9420'): 20.852616820135705,
 (600, '6094'): 20.847864178891193,
 (355, '3689'): 20.821422960711995,
 (58, '0597'): 20.810543612792355,
 (78, '0807'): 20.77016431106617,
 (604, '6118'): 20.689868163162338,
 (113, '1186'): 20.614968920033515,
 (727, '7384'): 20.606308889959358,
 (203, '2149'): 20.59675519443719,
 (310, '3182'): 20.57595520092645,
 (38, '0404'): 20.56852943765711,
 (746, '7501'): 20.546526154296597,
 (2, '0028'): 20.526570451431475,
 (882, '8853'): 20.477483973508388,
 (186, '1915'): 20.437170876234628,
 (900, '8978'): 20.36038713863618,
 (267, '2769'): 20.35442794271941,
 (814, '8250'): 20.316018420158283,
 (409, '4208'): 20.311815169450668,
 (933, '9448'): 20.28902525089808,
 (921, '9259'): 20.244450200420257,
 (336, '3441'): 20.192921537059707,
 (494, '5030'): 20.171372039112097,
 (626, '6363'): 20.12815669793464,
 (324, '3329'): 20.108497148120673,
 (340, '3492'): 20.105766949950333,
 (541, '5578'): 20.045188630539293,
 (56, 

In [1046]:
# Take a look at the content of the texts with the highest 
# perplexity scores
for test_file_keys in add_1_highest_perplexities.keys():
    print(test_file_keys[1])
    print(test_texts[test_file_keys[0]][0:40])

4724
La durÃ©e de Leader + est supÃ©rieure d'
9420
Monsieur le Commissaire, je dis surtout 
6094
Monsieur le Commissaire, il est trÃšs im
3689
C'est prÃ©cisÃ©ment la raison pour laque
0597
Aujourd' hui, cette position Ã©cologique
0807
Ce sont les Ãtats les plus grands qui o
6118
Par consÃ©quent, il faut que les pays qu
1186
Le lÃ©gislateur n' a pas toujours besoin
7384
Nous ne souhaitons pas, nous, Ã©cologist
2149
Refuser la discussion n'a jamais encore 
3182
Il est important que cette Charte soit r
0404
Monsieur le PrÃ©sident, la question posÃ
7501
Ã coup sÃ»r, tout le monde admet que le
0028
La nouvelle situation crÃ©Ã©e par le tra
8853
Ce sont ces trois facettes qui doivent n
1915
En particulier, nous veillerons Ã  avoir
8978
Nous surveillons la situation des droits
2769
Monsieur le PrÃ©sident, le Parlement eur
8250
J'ai votÃ© contre la proposition de rÃ©s
4208
Compte tenu du fait que nous rÃ©duirons 
9448
La consÃ©quence en est que ce n'est pas 
9259
Nous suivons donc de trÃšs pr

In [1047]:
# Get the list of all French test texts
french_text_file_names = [key[1] for key in add_1_highest_perplexities.keys()]
french_text_file_names

['4724',
 '9420',
 '6094',
 '3689',
 '0597',
 '0807',
 '6118',
 '1186',
 '7384',
 '2149',
 '3182',
 '0404',
 '7501',
 '0028',
 '8853',
 '1915',
 '8978',
 '2769',
 '8250',
 '4208',
 '9448',
 '9259',
 '3441',
 '5030',
 '6363',
 '3329',
 '3492',
 '5578',
 '0563',
 '7419',
 '1821',
 '3348',
 '2154',
 '9147',
 '4791',
 '7980',
 '6452',
 '7444',
 '4672',
 '7821',
 '9568',
 '0774',
 '1826',
 '6140',
 '7540',
 '4877',
 '7017',
 '7564',
 '0500',
 '0355']

In [1048]:
# Write the names of the French files to an output file
french_files_out = os.path.join(os.getcwd(), 'trigram_model_outputs', 'french_file_names.txt')
with open(french_files_out, 'w') as file:
    for file_name in french_text_file_names:
        line = str(file_name) + '\n'
        file.write(line)

## 2.4 Character-level Trigram Language Model with Add-k (Laplace) Smoothing

In [1058]:
# Create the trigram model using add-5 Laplace smoothing
trigram_add_5_model = {bigram: {unigram: 0.0 for unigram in unigram_counts.keys()} for bigram in bigram_counts.keys()}
    
for bigram in bigram_counts.keys():
    for unigram in unigram_counts.keys():
        trigram = bigram + unigram
        
        if bigram == unk_placeholder or unigram == unk_placeholder:
            trigram = unk_placeholder
        
        trigram_count = 0
        if trigram in trigram_counts.keys():
            trigram_count = trigram_counts[trigram]
            
        probability = (trigram_count + 5) / (bigram_counts[bigram] + (vocabulary_size * 5))
        trigram_add_5_model[bigram][unigram] = probability

In [1051]:
# Calculate the perplexities of all test texts using the laplace add-5 model
add_5_perplexities = {}
for index, text in enumerate(test_texts):
    test_file_name = test_files_names[index]
    add_5_perplexities[(index, test_file_name)] = calculatePerplexity(text, trigram_add_5_model, unigram_counts.keys(), bigram_counts.keys(), unk_placeholder)

In [1053]:
# Write the perplexity scores to an output file
add_5_perplexities_sorted = {file_name:count for file_name,count in sorted(add_5_perplexities.items(), key=lambda item: item[1], reverse=True)}

add_5_perplexities_out = os.path.join(os.getcwd(), 'trigram_model_outputs', 'add_5_perplexity_scores.txt')
with open(add_5_perplexities_out, 'w') as file:
    for file_name_tuple in add_5_perplexities_sorted.keys():
        line = str(file_name_tuple[1]) + ", " +  str(add_5_perplexities_sorted[file_name_tuple]) + '\n'
        file.write(line)

In [1054]:
# Get the 50 texts with the highest perplexity scores
add_5_highest_perplexities = {file_name:count for file_name,count in sorted(add_5_perplexities.items(), key=lambda item: item[1], reverse=True)[0:50]}
add_5_highest_perplexities

{(464, '4724'): 44.13951851490697,
 (930, '9420'): 43.96918197232693,
 (355, '3689'): 43.964552021170135,
 (58, '0597'): 43.935058293248474,
 (600, '6094'): 43.925177614057645,
 (78, '0807'): 43.86347721779008,
 (38, '0404'): 43.54013963209549,
 (310, '3182'): 43.4936251133351,
 (203, '2149'): 43.428800955913886,
 (746, '7501'): 43.366596654497556,
 (113, '1186'): 43.33648612927491,
 (727, '7384'): 43.31945284195853,
 (604, '6118'): 43.3049784123911,
 (2, '0028'): 43.21236158574669,
 (186, '1915'): 43.07342963242527,
 (882, '8853'): 43.060183044024775,
 (267, '2769'): 43.059497137710196,
 (900, '8978'): 43.036107059174064,
 (409, '4208'): 42.946170700288185,
 (814, '8250'): 42.90231230639869,
 (933, '9448'): 42.893702841200835,
 (336, '3441'): 42.77182932665183,
 (921, '9259'): 42.751747103123954,
 (324, '3329'): 42.62177780411885,
 (494, '5030'): 42.4738493748606,
 (340, '3492'): 42.430568259769075,
 (626, '6363'): 42.25135952707855,
 (735, '7419'): 42.215961597109455,
 (541, '5578'):

## Compare Add-1 and Add-5 Smoothing

In [1055]:
# Add 1 model stats
add_1_perplexities_avg = sum(add_1_perplexities.values()) / len(add_1_perplexities)
add_1_perplexities_max = max(add_1_perplexities.values())
add_1_perplexities_min = min(add_1_perplexities.values())
add_5_perplexities_avg = sum(add_5_perplexities.values()) / len(add_5_perplexities)
add_5_perplexities_max = max(add_5_perplexities.values())
add_5_perplexities_min = min(add_5_perplexities.values())

print('add 1 perplexities avg: {0:.2f}'.format(add_1_perplexities_avg))
print('add 1 perplexities max: {0:.2f}'.format(add_1_perplexities_max))
print('add 1 perplexities min: {0:.2f}'.format(add_1_perplexities_min))

add 1 perplexities avg: 12.43
add 1 perplexities max: 21.05
add 1 perplexities min: 10.73


In [1056]:
# Add 5 model stats
add_5_perplexities_avg = sum(add_5_perplexities.values()) / len(add_5_perplexities)
add_5_perplexities_max = max(add_5_perplexities.values())
add_5_perplexities_min = min(add_5_perplexities.values())

print('add 5 perplexities avg: {0:.2f}'.format(add_5_perplexities_avg))
print('add 5 perplexities max: {0:.2f}'.format(add_5_perplexities_max))
print('add 5 perplexities min: {0:.2f}'.format(add_5_perplexities_min))

add 5 perplexities avg: 24.05
add 5 perplexities max: 44.14
add 5 perplexities min: 20.67


As can be seen in the add-1 and add-5 model statistics above, the add-1 model has a lower average perplexity score than the add-5 model, indicating that the add-1 model more accurately predicts the trigram distributions of the test texts. While this is unintuitive--usually Laplace smoothing with a greater value of k moves less of the probability mass from very likely n-grams to very unlikely n-grams, producing lower perplexity scores for models with large numbers of zero/low probability cases--the way UNK values are handled in these models may have affected the results. The UNK count for the trigram training set counts is very high (19960 to be exact), and this causes large probabilities to be assigned to low probability trigrams. This especially affects the perplexity scoring for the French language texts.