1. Load and Preprocess the Corpus

In [80]:
import re

# Function to clean text
def clean_text(text):
    # Remove citations like [1], [citation needed]
    text = re.sub(r'\[[^\]]+\]', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove parenthetical content
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove non-alphanumeric characters except periods
    text = re.sub(r'[^a-zA-Z0-9\s\.]', '', text)
    # Remove standalone numbers
    text = re.sub(r'\b\d+\b', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    # Ensure proper sentence spacing (e.g., "word.word" -> "word. word")
    text = re.sub(r'\.([a-zA-Z])', r'. \1', text)
    return text.lower()  # Convert to lowercase

# Load the raw corpus
with open('./datasets/angkorwat.txt', 'r') as file:
    raw_corpus = file.read()

# Clean the corpus
cleaned_corpus = clean_text(raw_corpus)

# Save the cleaned corpus for inspection (optional)
with open('./datasets/cleaned_angkorwat.txt', 'w') as file:
    file.write(cleaned_corpus)


2. Split the Corpus into Train, Validation, and Test Sets

In [81]:
import random

# Split the text into sentences
sentences = cleaned_corpus.split('.')
# Remove empty sentences and trim whitespace
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Shuffle the sentences to avoid bias
random.shuffle(sentences)

# Calculate split indices
train_split = int(0.7 * len(sentences))
val_split = int(0.8 * len(sentences))

# Create subsets
train_set = sentences[:train_split]
val_set = sentences[train_split:val_split]
test_set = sentences[val_split:]

# Print information about the splits
print(f"Total sentences: {len(sentences)}")
print(f"Training set: {len(train_set)}")
print(f"Validation set: {len(val_set)}")
print(f"Testing set: {len(test_set)}")


Total sentences: 71
Training set: 49
Validation set: 7
Testing set: 15


3.Tokenize Sentences and Limit Vocabulary

In [82]:
from nltk.tokenize import word_tokenize
from collections import Counter

# Step 1: Tokenize each sentence in the training set
train_tokens = [word_tokenize(sentence) for sentence in train_set]

# Check if tokenization is successful
if len(train_tokens) == 0:
    print("Error: `train_tokens` is empty. Check `train_set`.")
else:
    print(f"Number of tokenized sentences in train_set: {len(train_tokens)}")

# Step 2: Define the vocabulary size
vocab_size = 20000

# Step 3: Count word frequencies in the training set
word_counts = Counter(word for sentence in train_tokens for word in sentence)

# Step 4: Select the top `vocab_size` words as the vocabulary
vocab = {word for word, _ in word_counts.most_common(vocab_size)}
print(f"Vocabulary size (top {vocab_size} words): {len(vocab)}")

# Step 5: Function to replace words not in the vocabulary with <UNK>
def replace_with_unk(sentences, vocab):
    return [
        [word if word in vocab else '<UNK>' for word in word_tokenize(sentence)]
        for sentence in sentences
    ]

# Step 6: Apply the function to train, validation, and test sets
train_tokens = replace_with_unk(train_set, vocab)
val_tokens = replace_with_unk(val_set, vocab)
test_tokens = replace_with_unk(test_set, vocab)

# Check the length of `train_tokens` after replacement
if len(train_tokens) == 0:
    print("Error: `train_tokens` is empty after replacement. Check `replace_with_unk` function.")
else:
    print(f"Number of sentences in train_tokens after replacement: {len(train_tokens)}")

# Step 7: Print the first 5 tokenized sentences to check results
print("First 5 tokenized training sentences with <UNK>:")
for i in range(min(5, len(train_tokens))):  # Handle cases where there are fewer than 5 sentences
    print(train_tokens[i])


Number of tokenized sentences in train_set: 49
Vocabulary size (top 20000 words): 568
Number of sentences in train_tokens after replacement: 49
First 5 tokenized training sentences with <UNK>:
['the', 'alternate', 'name', 'yasodharapura', 'was', 'derived', 'from', 'the', 'name', 'of', 'the', 'foster', 'mother', 'of', 'lord', 'krishna', 'in', 'hinduism']
['because', 'of', 'the', 'lowdensity', 'and', 'dispersed', 'nature', 'of', 'the', 'medieval', 'khmer', 'settlement', 'pattern', 'angkor', 'lacks', 'a', 'formal', 'boundary', 'and', 'its', 'extent', 'is', 'therefore', 'difficult', 'to', 'determine']
['over', 'the', 'ruins', 'of', 'yasodharapura', 'jayavarman', 'constructed', 'the', 'walled', 'city', 'of', 'angkor', 'thom', 'as', 'well', 'as', 'its', 'geographic', 'and', 'spiritual', 'center', 'the', 'temple', 'known', 'as', 'the', 'bayon']
['a', 'khmer', 'rebellion', 'against', 'siamese', 'authority', 'resulted', 'in', 'the', 'sacking', 'of', 'angkor', 'by', 'ayutthaya', 'causing', 'its'

4. Build 4-Gram Models

In [83]:
from nltk.util import ngrams
from collections import defaultdict

# Function to build the n-gram model
def build_ngram_model(tokens, n):
    model = defaultdict(lambda: defaultdict(int))
    for sentence in tokens:
        for ngram in ngrams(sentence, n, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'):
            prefix, token = ngram[:-1], ngram[-1]
            model[prefix][token] += 1
    return model

# Build models using 4-grams
lm1 = build_ngram_model(train_tokens, 4)  # Backoff model
lm2 = build_ngram_model(train_tokens, 4)  # Interpolation model


5. Define Probability Functions

In [84]:
# Backoff Probability
def backoff_prob(model, ngram):
    prefix, token = ngram[:-1], ngram[-1]
    if prefix in model and token in model[prefix]:
        return model[prefix][token] / sum(model[prefix].values())
    elif len(prefix) > 1:
        return backoff_prob(model, prefix[1:] + (token,))
    return 1e-10  # Small positive probability

# Interpolation Probability
def interpolated_prob(model, ngram, lambdas, k):
    prob = 0
    n = len(ngram)
    for i in range(1, n + 1):
        sub_ngram = ngram[-i:]
        prefix, token = sub_ngram[:-1], sub_ngram[-1]
        count = model[prefix][token] + k
        total_count = sum(model[prefix].values()) + k * len(model[prefix])
        prob += lambdas[i - 1] * (count / total_count)
    return prob


6. Evaluate Models with Perplexity

In [85]:
import math

# Perplexity Calculation
def calculate_perplexity(model, tokens, prob_func):
    log_prob_sum = 0
    word_count = 0
    for sentence in tokens:
        for ngram in ngrams(sentence, 4, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'):
            prob = prob_func(model, ngram)
            log_prob_sum += -math.log2(prob)
            word_count += 1
    return 2 ** (log_prob_sum / word_count)

# Hyperparameters for interpolation
lambdas = [0.1, 0.2, 0.3, 0.4]
k = 1

# Calculate perplexity for both models
pp_lm1 = calculate_perplexity(lm1, test_tokens, lambda m, n: backoff_prob(m, n))
pp_lm2 = calculate_perplexity(lm2, test_tokens, lambda m, n: interpolated_prob(m, n, lambdas, k))

print(f"Perplexity of LM1 (Backoff): {pp_lm1}")
print(f"Perplexity of LM2 (Interpolation): {pp_lm2}")


Perplexity of LM1 (Backoff): 4576457536.867848
Perplexity of LM2 (Interpolation): 1.761733184278214


7. Generate Text

In [86]:
# Text Generation
def generate_text(model, start_tokens, max_length, prob_func):
    text = start_tokens
    for _ in range(max_length - len(start_tokens)):
        prefix = tuple(text[-3:])
        if prefix not in model:
            break
        next_word = max(model[prefix], key=lambda word: prob_func(model, prefix + (word,)))
        text.append(next_word)
        if next_word == '</s>':
            break
    return ' '.join(text)

# Generate text for both models
start = ['<s>', '<s>', 'the']
print("Generated Text LM1:", generate_text(lm1, start, 100, backoff_prob))
print("Generated Text LM2:", generate_text(lm2, start, 100, lambda m, n: interpolated_prob(m, n, lambdas, k)))


Generated Text LM1: <s> <s> the alternate name yasodharapura was derived from the name of the foster mother of lord krishna in hinduism </s>
Generated Text LM2: <s> <s> the alternate name yasodharapura was derived from the name of the foster mother of lord krishna in hinduism </s> </s>
