**1. Preprocessing**


*   Remove punctuation and digits
*   Add sentence markers tags to mark sentence boundaries
*   Convert all tokens to lowercase

In [2]:
import re
from collections import Counter, defaultdict

def preprocess(text):
    # Remove punctuation and digits, add sentence markers
    text = re.sub(r'[^\w\s]', '', text)
    text = "<s> " + text.lower() + " </s>"
    return text.split()

def load_data(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
    return [preprocess(line.strip()) for line in lines]

train_data = load_data('/train.txt')
validation_data = load_data('/val.txt')

# Check first few processed sentences
print(train_data[:5])


[['<s>', 'i', 'booked', 'two', 'rooms', 'four', 'months', 'in', 'advance', 'at', 'the', 'talbott', 'we', 'were', 'placed', 'on', 'the', 'top', 'floor', 'next', 'to', 'the', 'elevators', 'which', 'are', 'used', 'all', 'night', 'long', 'when', 'speaking', 'to', 'the', 'front', 'desk', 'i', 'was', 'told', 'that', 'they', 'were', 'simply', 'honoring', 'my', 'request', 'for', 'an', 'upper', 'floor', 'which', 'i', 'had', 'requested', 'for', 'a', 'better', 'view', 'i', 'am', 'looking', 'at', 'a', 'brick', 'wall', 'and', 'getting', 'no', 'sleep', 'he', 'also', 'told', 'me', 'that', 'they', 'had', 'received', 'complaints', 'before', 'from', 'guests', 'on', 'the', '16th', 'floor', 'and', 'were', 'aware', 'of', 'the', 'noise', 'problem', 'why', 'then', 'did', 'they', 'place', 'us', 'on', 'this', 'floor', 'when', 'the', 'hotel', 'is', 'not', 'totally', 'booked', 'a', 'request', 'for', 'an', 'upper', 'floor', 'does', 'not', 'constitute', 'placing', 'someone', 'on', 'the', 'top', 'floor', 'and', 'us

**2. Handling Unknown Words**

In [3]:
def handle_unknowns(corpus, k=1):
    # Count frequencies of tokens
    counts = Counter([word for sentence in corpus for word in sentence])
    # Replace words with frequency <= k with <UNK>
    return [[word if counts[word] > k else '<UNK>' for word in sentence] for sentence in corpus]

# Apply unknown word handling
train_data = handle_unknowns(train_data)
validation_data = handle_unknowns(validation_data)

# Check first few processed sentences with <UNK>
print(train_data[:5])


[['<s>', 'i', 'booked', 'two', 'rooms', 'four', 'months', 'in', 'advance', 'at', 'the', 'talbott', 'we', 'were', 'placed', 'on', 'the', 'top', 'floor', 'next', 'to', 'the', 'elevators', 'which', 'are', 'used', 'all', 'night', 'long', 'when', 'speaking', 'to', 'the', 'front', 'desk', 'i', 'was', 'told', 'that', 'they', 'were', 'simply', '<UNK>', 'my', 'request', 'for', 'an', 'upper', 'floor', 'which', 'i', 'had', 'requested', 'for', 'a', 'better', 'view', 'i', 'am', 'looking', 'at', 'a', 'brick', 'wall', 'and', 'getting', 'no', 'sleep', 'he', 'also', 'told', 'me', 'that', 'they', 'had', 'received', 'complaints', 'before', 'from', 'guests', 'on', 'the', '16th', 'floor', 'and', 'were', 'aware', 'of', 'the', 'noise', 'problem', 'why', 'then', 'did', 'they', 'place', 'us', 'on', 'this', 'floor', 'when', 'the', 'hotel', 'is', 'not', 'totally', 'booked', 'a', 'request', 'for', 'an', 'upper', 'floor', 'does', 'not', '<UNK>', '<UNK>', 'someone', 'on', 'the', 'top', 'floor', 'and', 'using', 'tha

**3. Unigram and Bigram Frequency Calculation**

In [4]:
def count_ngrams(corpus, n):
    ngrams = defaultdict(int)
    for sentence in corpus:
        for i in range(len(sentence)):
            # Unigram
            if n == 1:
                ngrams[sentence[i]] += 1
            # Bigram
            if n == 2 and i > 0:
                ngrams[(sentence[i-1], sentence[i])] += 1
    return ngrams

# Calculate unigram and bigram frequencies
unigrams = count_ngrams(train_data, 1)
bigrams = count_ngrams(train_data, 2)

# Check unigram and bigram counts
print(list(unigrams.items())[:5])
print(list(bigrams.items())[:5])


[('<s>', 512), ('i', 1711), ('booked', 86), ('two', 128), ('rooms', 202)]
[(('<s>', 'i'), 111), (('i', 'booked'), 21), (('booked', 'two'), 1), (('two', 'rooms'), 3), (('rooms', 'four'), 1)]


**4. Calculate Unigram and Bigram Probabilities**

In [5]:
def calculate_unigram_probabilities(unigrams):
    total_count = sum(unigrams.values())
    return {word: count / total_count for word, count in unigrams.items()}

def calculate_bigram_probabilities(bigrams, unigrams):
    return {pair: count / unigrams[pair[0]] for pair, count in bigrams.items()}

# Calculate probabilities
unigram_probs = calculate_unigram_probabilities(unigrams)
bigram_probs = calculate_bigram_probabilities(bigrams, unigrams)

# Check probabilities
print(list(unigram_probs.items())[:5])
print(list(bigram_probs.items())[:5])


[('<s>', 0.006366337987889037), ('i', 0.02127500839312137), ('booked', 0.0010693458339032366), ('two', 0.0015915844969722592), ('rooms', 0.0025117192842843464)]
[(('<s>', 'i'), 0.216796875), (('i', 'booked'), 0.012273524254821741), (('booked', 'two'), 0.011627906976744186), (('two', 'rooms'), 0.0234375), (('rooms', 'four'), 0.0049504950495049506)]


**5. Laplace and Add-k Smoothing**

In [6]:
# Laplace smoothing
def laplace_smoothing(bigrams, unigrams, vocab_size):
    smoothed_probs = defaultdict(float)
    for (w1, w2), count in bigrams.items():
        smoothed_probs[(w1, w2)] = (count + 1) / (unigrams[w1] + vocab_size)
    return smoothed_probs

# Add-k smoothing
def add_k_smoothing(bigrams, unigrams, vocab_size, k=1):
    smoothed_probs = defaultdict(float)
    for (w1, w2), count in bigrams.items():
        smoothed_probs[(w1, w2)] = (count + k) / (unigrams[w1] + k * vocab_size)
    return smoothed_probs

vocab_size = len(unigram_probs)
laplace_probs = laplace_smoothing(bigrams, unigrams, vocab_size)
add_k_probs = add_k_smoothing(bigrams, unigrams, vocab_size, k=0.5)

# Check smoothed probabilities
print(list(laplace_probs.items())[:5])
print(list(add_k_probs.items())[:5])


[(('<s>', 'i'), 0.031189083820662766), (('i', 'booked'), 0.004592901878914405), (('booked', 'two'), 0.000631911532385466), (('two', 'rooms'), 0.0012472715933894605), (('rooms', 'four'), 0.0006095702529716549)]
[(('<s>', 'i'), 0.054350475262003414), (('i', 'booked'), 0.006614367020458391), (('booked', 'two'), 0.0009227929867733005), (('two', 'rooms'), 0.002098950524737631), (('rooms', 'four'), 0.0008613264427217916)]


**6. Perplexity Calculation**

In [7]:
import math

def calculate_perplexity(test_data, ngram_probs, n):
    log_prob_sum = 0
    N = 0  # Total number of tokens

    for sentence in test_data:
        for i in range(1, len(sentence)):
            N += 1
            if n == 1:
                prob = ngram_probs.get(sentence[i], ngram_probs.get('<UNK>', 1e-6))  # Unigram
            elif n == 2:
                prob = ngram_probs.get((sentence[i-1], sentence[i]), ngram_probs.get(('<UNK>', '<UNK>'), 1e-6))  # Bigram
            log_prob_sum += math.log(prob)  # Log base 2

    # Applying the perplexity formula
    perplexity = math.exp(-log_prob_sum / N)
    return perplexity

# Calculate perplexity for Laplace and Add-k smoothing
perplexity_laplace = calculate_perplexity(validation_data, laplace_probs, 2)
perplexity_add_k = calculate_perplexity(validation_data, add_k_probs, 2)

print("Perplexity with Laplace smoothing:", perplexity_laplace)
print("Perplexity with Add-k smoothing:", perplexity_add_k)



Perplexity with Laplace smoothing: 156.91966162970678
Perplexity with Add-k smoothing: 106.69400520161689
