In [8]:
# ============================================================
# BLOCK 1: INSTALL REQUIRED LIBRARIES
# This installs dataset loading and BLEU evaluation tools.
# ============================================================
!pip install datasets nltk sacrebleu -q


In [9]:
# ============================================================
# BLOCK 2: IMPORT REQUIRED LIBRARIES
# ============================================================
import math
from collections import defaultdict, Counter
from datasets import load_dataset
import sacrebleu


In [10]:
# ============================================================
# BLOCK 3: LOAD DATASET AND CREATE 80/20 SPLIT
# ============================================================
dataset = load_dataset("bentrevett/multi30k")

split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_data = split_dataset["train"]
test_data = split_dataset["test"]


In [11]:
# ============================================================
# BLOCK 4: DEFINE SIMPLE TOKENIZER
# This splits sentences by whitespace and converts to lowercase.
# ============================================================
def tokenize(sentence):
    return sentence.lower().split()


In [13]:
# ============================================================
# BLOCK 5: BUILD HMM COUNT TABLES
# ============================================================
transition_counts = defaultdict(Counter)
emission_counts = defaultdict(Counter)
state_counts = Counter()

train_limit = min(5000, len(train_data))

for i in range(train_limit):

    example = train_data[i]

    en_tokens = tokenize(example["en"])
    de_tokens = tokenize(example["de"])

    min_len = min(len(en_tokens), len(de_tokens))

    for j in range(min_len):
        state = de_tokens[j]
        obs = en_tokens[j]

        emission_counts[state][obs] += 1
        state_counts[state] += 1

        if j > 0:
            prev_state = de_tokens[j-1]
            transition_counts[prev_state][state] += 1


In [14]:
# ============================================================
# BLOCK 6: CONVERT COUNTS TO PROBABILITIES
# Using Laplace smoothing to avoid zero probabilities
# ============================================================
transition_prob = {}
emission_prob = {}

vocab_size = len(state_counts)

# Transition probabilities
for prev_state in transition_counts:
    total = sum(transition_counts[prev_state].values())
    transition_prob[prev_state] = {
        state: (count + 1) / (total + vocab_size)
        for state, count in transition_counts[prev_state].items()
    }

# Emission probabilities
for state in emission_counts:
    total = state_counts[state]
    emission_prob[state] = {
        obs: (count + 1) / (total + vocab_size)
        for obs, count in emission_counts[state].items()
    }


In [15]:
# ============================================================
# BLOCK 7: DEFINE TRANSLATION FUNCTION
# ============================================================
def translate_sentence(en_sentence):
    en_tokens = tokenize(en_sentence)
    translated = []

    for word in en_tokens:
        best_state = None
        best_prob = 0

        for state in emission_prob:
            prob = emission_prob[state].get(word, 0)
            if prob > best_prob:
                best_prob = prob
                best_state = state

        if best_state:
            translated.append(best_state)
        else:
            translated.append("<unk>")

    return " ".join(translated)


In [18]:
# ============================================================
# BLOCK 8: GENERATE PREDICTIONS
# ============================================================
predictions = []
references = []

test_limit = min(500, len(test_data))

for i in range(test_limit):

    example = test_data[i]   # âœ… Always returns dictionary

    pred = translate_sentence(example["en"])
    ref = example["de"]

    predictions.append(pred)
    references.append([ref])



In [20]:
# ============================================================
# BLOCK 9: COMPUTE BLEU SCORE
# ============================================================
import sacrebleu

# Make sure predictions and references are not empty
if len(predictions) == 0:
    print("No predictions generated.")
else:
    bleu = sacrebleu.corpus_bleu(
        predictions,
        [ [ref[0] for ref in references] ]  # correct format
    )

    print("BLEU Score:", bleu.score)


BLEU Score: 0.6706746022173007


In [19]:
# ============================================================
# BLOCK 10: COMPUTE LOSS AND PERPLEXITY
# ============================================================
log_likelihood = 0
total_words = 0

for pred, ref in zip(predictions, references):

    ref_tokens = tokenize(ref[0])
    pred_tokens = tokenize(pred)

    min_len = min(len(ref_tokens), len(pred_tokens))

    for i in range(min_len):
        state = ref_tokens[i]
        obs = pred_tokens[i]

        prob = emission_prob.get(state, {}).get(obs, 1e-6)
        log_likelihood += math.log(prob)
        total_words += 1

loss = -log_likelihood / total_words
perplexity = math.exp(loss)

print("Loss:", loss)
print("Perplexity:", perplexity)


Loss: 13.294003644627962
Perplexity: 593625.3317038155
