<h1>**IBM Model 1**</h1>

1. a) Implement EM training (Brown et al., 1993) for IBM model 1; <br />
    b) Implement variational inference for Bayesian IBM model 1; <br />
    c) All of the tasks below should be performed for both models.<br />
2. Plot the evolution of training log likelihood (or ELBO) as a function of the iteration.
3. Plot the evolution of alignment error rate (AER) on validation data as a function of the iteration;
4. Experiment with two criteria for model selection (i.e. deciding on number of training iterations): 
    1) convergence in terms of training log likelihood; 
    2) best AER on validation data;
5. For the selected models, obtain Viterbi alignments for every sentence pair in a test
corpus and compute AER using a gold-standard provided by the assistant;

In [10]:
import aer
from collections import defaultdict, Counter
from math import log2
import numpy as np
from random import randint
import progressbar
import matplotlib.pyplot as plt

In [23]:
def read_corpus(file_name, source_language):
    """
    Reads the corpus and saves each sentence in a list.
    """
    
    corpus = []
    
    with open(file_name, "r", encoding='utf8') as f:
        for line in f:
            line = line.replace("\n", "")
            sentence = line.split()
            
            if source_language:
                sentence.insert(0, "null")
            corpus.append(sentence)
    return corpus[:5000]


def reduce_corpus(corpus):
    """
    Reduces the corpus such that words that only occur once are replaced
    by -LOW-.
    """
    
    flat_corpus = [word for sentence in corpus for word in sentence]
    word_counts = Counter(flat_corpus)
    small_corpus = []
    
    for sentence in corpus:
        small_sentence = []
        
        for word in sentence:
            if word_counts[word] != 1:
                small_sentence.append(word)
            else:
                small_sentence.append("-LOW-")
        small_corpus.append(small_sentence)
    return small_corpus


train_source_corpus = read_corpus("training/hansards.36.2.e", True)
train_source_corpus = reduce_corpus(train_source_corpus)
print("Read training source corpus")

train_target_corpus = read_corpus("training/hansards.36.2.f", False)
train_target_corpus = reduce_corpus(train_target_corpus)
print("Read training target corpus")

val_target_corpus = read_corpus("validation/dev.f", False)
val_source_corpus = read_corpus("validation/dev.e", True)
print("Read validation corpora")

Read training source corpus
Read training target corpus
Read validation corpora


In [24]:
def initialise_parameters(source_corpus, target_corpus):
    """
    Initialises the conditional probability of generating a source 
    word from a target word for all possible pairs of words in the source 
    and target sentences uniformly.
    """
    
    flat_corpus = [word for sentence in source_corpus for word in sentence]
    amount_source_words = len(set(flat_corpus))
    theta0 = 1/amount_source_words
    return defaultdict(lambda: defaultdict(lambda: theta0))


def expectation_maximisation(source_corpus, target_corpus, parameters, 
                             num_iterations, min_perplexity_change,
                             source_validation, target_validation):
    """
    Do the EM algorithm until perplexity decreases very little or until 
    the number of iterations is reached.
    """
    
    old_perplexity = -100000
    perplexities = []
    aers = []
    
    for k in range(0, num_iterations):
        print("Iteration #" + str(k), "out of", num_iterations - 1)
        counts_single = defaultdict(lambda: 1.0)
        counts_pairs = defaultdict(lambda: defaultdict(float))
        counts_single, counts_pairs = e_step(source_corpus, target_corpus,
                                             parameters, counts_single, 
                                             counts_pairs)
        parameters = m_step(parameters, counts_single, counts_pairs)
        print("Computing perplexity and AER...")
        perplexity = compute_perplexity(parameters, source_corpus, target_corpus)
        perplexities.append(perplexity)     
        alignments = get_best_alignment(source_validation, target_validation, parameters)
        val_aer = compute_aer(alignments)
        aers.append(val_aer)
        
        if abs(perplexity - old_perplexity) < min_perplexity_change:
            return perplexities, aers
        else:
            old_perplexity = perplexity
    return perplexities, aers
    
    
def e_step(source_corpus, target_corpus, parameters, counts_single, 
           counts_pairs):
    """
    Do the E-step by computing the expected counts.
    """
    
    print("Doing E-step...")
    
    with progressbar.ProgressBar(max_value=len(target_corpus)) as bar:
        for n in range(len(target_corpus)):
            target_sentence = target_corpus[n]
            source_sentence = source_corpus[n]

            for i in range(len(target_sentence)):
                normalisation_term = 0
                target_word = target_sentence[i]

                for j in range(len(source_sentence)):
                    source_word = source_sentence[j]
                    normalisation_term += parameters[source_word][target_word]
                for j in range(len(source_sentence)):
                    source_word = source_sentence[j]
                    expected_count = parameters[source_word][target_word]/normalisation_term
                    counts_pairs[source_word][target_word] += expected_count
                    counts_single[source_word] += expected_count
            bar.update(n)
    return counts_single, counts_pairs


def m_step(parameters, counts_single, counts_pairs):
    """
    Do the M-step by normalising the parameters.
    """
    
    print("Doing M-step...")
    for source_word, target_words in counts_pairs.items():
        for target_word, expected_count in target_words.items():
            parameters[source_word][target_word] = expected_count/counts_single[source_word]
    return parameters


def compute_perplexity(theta_dict, source_corpus, target_corpus):
    """
    Computes the perplexity of a corpus.
    """
    
    logprobs = []
    total_sum = 0
    
    for n in range(len(source_corpus)):
        english_sentence = source_corpus[n]
        french_sentence = target_corpus[n]
        french_sum = 0
        for j in range(len(french_sentence)): 
            f_j = french_sentence[j]
            log_sum = []
            for i in range(len(english_sentence)): 
                e_i = english_sentence[i]
                log_sum.append(theta_dict[e_i][f_j])
            french_sum += np.log(np.sum(log_sum))
        total_sum += french_sum
    perplexity = total_sum
    print(perplexity)
    return perplexity
   
    
def get_best_alignment(source_corpus, target_corpus, parameters):
    """
    Gets the best alignment for each sentence and saves the alignment
    in a list of lists that holds tuples for each position in the sentence
    and looks as follows:
    (sentence_index, target_word_index, source_word_index).
    """
    
    alignments = []
    
    for n in range(len(source_corpus)):
        source_sentence = source_corpus[n]
        target_sentence = target_corpus[n]
        alignment = []
        
        for i in range(len(target_sentence)):
            target_word = target_sentence[i]
            best_prob = 0
            best_j = 0
            
            for j in range(len(source_sentence)):
                source_word = source_sentence[j]
                prob = parameters[source_word][target_word]
                
                if prob > best_prob:
                    best_prob = prob
                    best_j = j
                    
            if best_j != 0:    
                alignment.append((n, best_j, i+1))
        alignments.append(alignment)
    return alignments


def compute_aer(predictions):
    """
    Computes the Alignment Error Rate.
    """
    
    gold_sets = aer.read_naacl_alignments("validation/dev.wa.nonullalign")
    metric = aer.AERSufficientStatistics()
    
    for gold, prediction in zip(gold_sets, predictions):
        prediction = set([(alignment[1], alignment[2]) for alignment in prediction])
        metric.update(sure=gold[0], probable=gold[1], predicted=prediction)
    print(metric.aer())
    return metric.aer()


print("Initialising parameters...")
initial_params = initialise_parameters(train_source_corpus, train_target_corpus)
perplexity, val_aer = expectation_maximisation(train_source_corpus, train_target_corpus,  
                                               initial_params, 20, 50, val_source_corpus, 
                                               val_target_corpus)

  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:03

Initialising parameters...
Iteration #0 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:03

-155467.88814758134
0.4916839916839917
Iteration #1 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (185 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:02

-100444.68769480342
0.43664921465968587
Iteration #2 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-78012.46636739746
0.44397905759162304
Iteration #3 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-68811.3486987745
0.4513089005235602
Iteration #4 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-64464.938751727736
0.4544502617801047
Iteration #5 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  5% (254 of 5000) |#                     | Elapsed Time: 0:00:00 ETA:  0:00:02

-62088.381782451834
0.45750262329485836
Iteration #6 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-60643.30757849578
0.41218274111675124
Iteration #7 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-59697.04386009395
0.41218274111675124
Iteration #8 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-59043.60215483976
0.41624365482233505
Iteration #9 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:02

-58574.12695408138
0.41725888324873095
Iteration #10 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-58225.74338206119
0.4164133738601824
Iteration #11 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:00

-57959.833885916705
0.4168356997971603
Iteration #12 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  5% (254 of 5000) |#                     | Elapsed Time: 0:00:00 ETA:  0:00:02

-57752.01775346732
0.4168356997971603
Iteration #13 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  5% (254 of 5000) |#                     | Elapsed Time: 0:00:00 ETA:  0:00:02

-57586.49807312245
0.4168356997971603
Iteration #14 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  5% (254 of 5000) |#                     | Elapsed Time: 0:00:00 ETA:  0:00:02

-57452.6810156507
0.41987829614604466
Iteration #15 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:03

-57343.17549134532
0.42233502538071066
Iteration #16 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  3% (190 of 5000) |                      | Elapsed Time: 0:00:00 ETA:  0:00:02

-57252.59690109533
0.42335025380710656
Iteration #17 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  5% (254 of 5000) |#                     | Elapsed Time: 0:00:00 ETA:  0:00:02

-57176.94737131792
0.42421159715157686
Iteration #18 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...


  5% (254 of 5000) |#                     | Elapsed Time: 0:00:00 ETA:  0:00:02

-57113.19155107973
0.4252288911495422
Iteration #19 out of 19
Doing E-step...


100% (5000 of 5000) |#####################| Elapsed Time: 0:00:03 Time: 0:00:03


Doing M-step...
Computing perplexity and AER...
-57058.98158463248
0.4252288911495422


In [26]:
%matplotlib tk
plt.plot(np.arange(1, len(val_aer)+1), val_aer)
plt.title("Evolution of AER")
plt.xlabel("Iterations")
plt.ylabel("AER")
plt.show()

# %matplotlib tk
# plt.plot(np.arange(1, len(perplexity)+1), perplexity)
# plt.title("Evolution of Perplexity")
# plt.xlabel("Iterations")
# plt.ylabel("Perplexity")
# plt.show()

<h1>IBM Model 2</h1>

In [None]:
iterations = 10
sub = 1


def expectation_maximisation2(source_corpus, target_corpus, parameters, num_iterations, min_perplexity_change):
    q = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.1))))
    old_perplexity = -100000
    
    for k in range(0, num_iterations):
        print("Iteration #" + str(k), "out of", num_iterations - 1)
        
        counts_pairs = defaultdict(lambda: defaultdict(lambda: 0.))
        counts_alignments = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.))))
        counts_single = defaultdict(lambda: 0.)
        counts_pairs, counts_single, counts_alignments = e_step2(source_corpus, target_corpus, counts_pairs, counts_single, counts_alignments, q)
        parameters, q = m_step2(parameters, q, counts_alignments, counts_pairs, counts_single)
        perplexity = compute_perplexity(parameters, source_corpus, target_corpus)
        print(perplexity)
        
        if abs(perplexity - old_perplexity) < min_perplexity_change:
            return parameters
        else:
            old_perplexity = perplexity
    return parameters        


def e_step2(source_corpus, target_corpus, counts_pairs, counts_single, counts_alignments, q):
    for n in range(len(source_corpus)):
        source_sentence = source_corpus[n]
        target_sentence = target_corpus[n]
        l = len(source_sentence)
        m = len(target_sentence)

        for i, target_word in enumerate(target_sentence):
            delta_denominator = sum([q[j_k][i][l][m]*parameters[source_sentence[j_k]][target_word] for j_k in range(l)])

            for j, source_word in enumerate(source_sentence):
                delta = (q[j][i][l][m]*parameters[source_word][target_word]) / delta_denominator

                counts_pairs[source_word][target_word] += delta
                counts_single[source_word] += delta
                counts_alignments[l][m][i][j] += delta
    return counts_pairs, counts_single, counts_alignments


def m_step2(parameters, q, counts_alignments, counts_pairs, counts_single):
    for j in q.keys():
        for i in q[j].keys():
            for l in q[j][i].keys():
                for m in q[j][i][l].keys():
                    q[j][i][l][m] = counts_alignments[l][m][i][j] / sum(counts_alignments[l][m][i].values())
    
    for source_word, target_words in parameters.items():
        for target_word in target_words:
            parameters[source_word][target_word] = counts_pairs[source_word][target_word]/counts_single[source_word]
    return parameters, q


parameters = initialise_parameters(train_source_corpus, train_target_corpus)
parameters = expectation_maximisation2(train_source_corpus, train_target_corpus, parameters, 
                                       10, 5)
alignments = get_best_alignment(val_source_corpus, val_target_corpus, parameters)
compute_aer(alignments)

In [None]:
plt.plot(perps)
plt.show()

# Variational Bayes IBM1

In [None]:


theta_dict =  defaultdict(lambda: defaultdict(lambda: 0.02))
lambda_dict = defaultdict(lambda: defaultdict(lambda: 0.5))
alpha = 0.4 

for s in range(iterations):
    
    print('iteration', s)
    # initialize all counts to 0
    
    counts_pairs = defaultdict(lambda: defaultdict(lambda: 0.))
    counts_alignments = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.))))
    counts_single = defaultdict(lambda: 0.)
    
    for n in range(len(train_source_corpus)):
        source_sentence = train_source_corpus[n]
        target_sentence = train_target_corpus[n]

        for i, target_word in enumerate(target_sentence):
            
             
                
    for j in q.keys():
        for i in q[j].keys():
            for l in q[j][i].keys():
                for m in q[j][i][l].keys():
                    q[j][i][l][m] = counts_alignments[l][m][i][j] / sum(counts_alignments[l][m][i].values())
    
    for source_word, target_words in parameters.items():
        for target_word in target_words:
            theta_dict[source_word][target_word] = counts_pairs[source_word][target_word]/counts_single[source_word]
                
    perp = compute_perplexity(parameters, train_source_corpus, train_target_corpus)
    perps.append(perp)

In [None]:
from scipy.special import digamma 

In [None]:
digamma([1, 2, 3])