In [1]:
import pandas as pd
import numpy as np

### a) Warmup

In [2]:
from collections import Counter
en_de = open("europarl-v7.de-en.lc.en", "r", encoding="utf-8").read()
en_fr = open("europarl-v7.fr-en.lc.en", "r", encoding="utf-8").read()
en_sv = open("europarl-v7.sv-en.lc.en", "r", encoding="utf-8").read()
en = en_de + " " + en_fr + " " + en_sv

de = open("europarl-v7.de-en.lc.de", "r", encoding="utf-8").read()
fr = open("europarl-v7.fr-en.lc.fr", "r", encoding="utf-8").read()
sv = open("europarl-v7.sv-en.lc.sv", "r", encoding="utf-8").read()

all_ = en + " " + de + " " + fr + " " + sv

In [3]:
type(en)

str

In [806]:
Counter(en.split()).most_common(12)

[('the', 58790),
 (',', 42043),
 ('.', 29542),
 ('of', 28406),
 ('to', 26842),
 ('and', 21459),
 ('in', 18485),
 ('is', 13331),
 ('that', 13219),
 ('a', 13090),
 ('we', 9936),
 ('this', 9916)]

In [807]:
Counter(de.split()).most_common(12)

[(',', 18549),
 ('die', 10521),
 ('.', 9733),
 ('der', 9374),
 ('und', 7028),
 ('in', 4175),
 ('zu', 3168),
 ('den', 2976),
 ('wir', 2863),
 ('daß', 2738),
 ('ich', 2670),
 ('das', 2669)]

In [808]:
Counter(fr.split()).most_common(13)

[('&apos;', 16729),
 (',', 15402),
 ('de', 14520),
 ('la', 9746),
 ('.', 9734),
 ('et', 6619),
 ('l', 6536),
 ('le', 6174),
 ('les', 5585),
 ('à', 5500),
 ('des', 5232),
 ('que', 4797),
 ('d', 4555)]

In [809]:
Counter(sv.split()).most_common(13)

[('.', 9648),
 ('att', 9181),
 (',', 8876),
 ('och', 7038),
 ('i', 5949),
 ('det', 5687),
 ('som', 5028),
 ('för', 4959),
 ('av', 4013),
 ('är', 3840),
 ('en', 3724),
 ('vi', 3211),
 ('jag', 3093)]

In [814]:
all_counter = Counter(all_.split())

The probabilities are found by taking the frequency of a given word and divide by the total number of words:

In [815]:
all_counter["speaker"]/sum(all_counter.values())

1.9327394942430718e-05

In [816]:
all_counter["zebra"]/sum(all_counter.values())

0.0

In [28]:
sentences = nltk.tokenize.sent_tokenize(all_)

In [29]:
sentences[0]

'i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .'

### b) Language modeling

If a word did not exist in the training data, its probability will be 0. This can be fixed by using laplace smoothing. This corresponds to initializing the frequency of all words to 1. A long sentence will lead to multiplying many small numbers together which will become very small. A solution is to us the log-probabilities instead.

In [3]:
import nltk
from collections import Counter, defaultdict
import math

def train_model(data):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    # Split data into sentences
    sentences = data.splitlines()

    # For each sentence, compute frequency of w2 arriving after w1 
    for sentence in sentences:
        for w1, w2 in nltk.bigrams(sentence.split()[:-1], pad_right=True, pad_left=True):
            model[w1][w2] += 1
    return model

In [4]:
model = train_model(en_sv)
def language_model(sentence, model):
    words = sentence.split()
    start_count = float(sum(model[None].values()))
    prob = math.log( (model[None][words[0]] + 0.001) / (start_count+len(model[None])*0.001)) # laplace lambda smoothing on first word
    #prob = math.log( (model[None][words[0]]) / (start_count)) # laplace lambda smoothing on first word
    #prob = (model[None][words[0]]) / (start_count)
    for w1, w2 in nltk.bigrams(words, pad_right=False, pad_left=False):
        #print(w1,w2)
        #print(total_count)
        prob_now = 0
        total_count = float(sum(model[w1].values()))
        p = (model[w1][w2]+0.001) / (total_count+len(model[w1])*0.001) # laplace lambda smoothing
        #p = (model[w1][w2]) / (total_count) # laplace lambda smoothing
        #model[w1][w2] = (model[w1][w2]) / (total_count) # laplace lambda smoothing
        prob = prob + math.log(p) # Log probabilities
        #prob = prob * model[w1][w2]
    return prob

In [6]:
math.exp(language_model("our member states", model))

2.002361613074967e-05

In [7]:
math.exp(language_model("member our states", model))

1.4234695149217394e-12

We can now compute $P(E)$

### c) Translation modeling

In [35]:
#def run_em(lang1, lang2, n):
import collections
lang1 = sv
lang2 = en_sv
#lang1 = en_sv
#lang2 = sv
n = 100

n_words = len(set(lang2.split()))
lang1 = lang1.splitlines()
lang2 = lang2.splitlines()
#t = collections.defaultdict(lambda: 1/n_words)
t = defaultdict(lambda: defaultdict(lambda: 1/n_words))

# E/M algorithm
for i in range(n):
    corpus = zip(lang1, lang2)
    count = collections.defaultdict(float)
    total = collections.defaultdict(float)
    s_total = collections.defaultdict(float)
    for (l1, l2) in corpus:
        # compute normalization

        l1 = l1.split()
        l2 = l2.split()
        # Insert null word at start
        l2.insert(0,"NULL")
        #print(l1)

        for f in l1:
            s_total[f] = 0.0
            for e in l2:
                s_total[f] += t[f][e]

        for f in l1:
            for e in l2:
                delta = t[f][e] / s_total[f]
                count[(e, f)] += delta
                total[e] += delta

    # estimate probability
    for (f, e) in count.keys():
        #if count[(f, e)] == 0:
            #print(f, e, count[(f, e)])
        t[f][e] = count[(f, e)] / total[e]

    #print(Counter(t["europeiska"]).most_common(10))
    # Find the words that are most likely to align with the word european
    test = []
    for w in t:
        #prob = ws["european"]
        prob = t[w]["european"]
        test.append((w, prob))
    print(sorted(test, key=lambda x: x[1], reverse=True)[0:10])
    #print("\n")
    #print("test")
    #return t

[('.', 0.03591034653913155), (',', 0.03372474830401548), ('att', 0.03332533292239441), ('och', 0.028216364762232687), ('i', 0.027025686545253), ('europeiska', 0.02597178687247437), ('det', 0.020548575860822177), ('för', 0.01956648122988557), ('som', 0.018686658041077588), ('en', 0.017656463635023014)]
[('europeiska', 0.16755995477109858), ('.', 0.05111446624924288), (',', 0.047113972999060276), ('att', 0.044936728607888374), ('i', 0.0439457468882919), ('och', 0.040990894746709354), ('en', 0.028060265892112347), ('för', 0.027456514082219212), ('unionen', 0.02710952557008033), ('det', 0.027010613200927517)]
[('europeiska', 0.3999353271255518), ('i', 0.04029466146199781), ('.', 0.03957039283243889), (',', 0.03726033558457288), ('att', 0.03416472425393751), ('och', 0.033111329979174), ('europeisk', 0.03222043498307792), ('unionen', 0.029840331102603732), ('en', 0.025488676146094496), ('den', 0.024758186528307996)]
[('europeiska', 0.5716593290434614), ('europeisk', 0.042219696757371), ('i',

KeyboardInterrupt: 

In [42]:
def translation_model(in_sentence, out_sentence, t):
    words = in_sentence.split()
    words2 = out_sentence.split()
    prob = 0
    for w1 in words:
        max_prob = 0
        for w2 in words2:
            temp_prob = t[w1][w2]
            if temp_prob > max_prob:
                max_prob = temp_prob
        #print(max_prob)
        prob += math.log(max_prob) #+ math.log(1/len(out_sentence))
    return prob

In [43]:
translation_model("jag är bra", "i is good", t)

-1.0253785500545938

In [11]:
translation_model("jag är bra", "i is bad", t)

-12.550998882407782

The above function computes $P(F | E, A)$

### d) Decoding

The most simple case: Only consider translation model based on most probable alignment

In [12]:
def simple_decode(sentence, t):
    words = sentence.split()
    eng = ""
    for word in words:
        if len(t[word]) == 0:
            eng += word + " "
        else:
            eng += Counter(t[word]).most_common(1)[0][0] + " "
    return eng.strip()

In [18]:
Counter(t["i"]).most_common(3)

[('in', 0.7017256406325446),
 ('into', 0.4937823209075722),
 ('treaty', 0.39795004259109684)]

In [19]:
simple_decode("mycket bra är jag", t)

'very good is i'

In [20]:
simple_decode("jag är mycket bra", t)

'i is very good'

In [24]:
simple_decode("herr talman jag", t)

'mr president i'

In [25]:
simple_decode("våra medlemsstater", t)

'our states'

The implementation below will construct a set of candidate translations based on the 3 msot probably alignments for each word in the swedish sentence. It will pick the sentence that maximizes $P(E) * P(F | E)$.

In [39]:
from itertools import permutations 

def permute(l):
    if len(l) == 1:
        return l[0]
    else:   
        lnew = []
        for a in l[0]:
            for b in permute(l[1:]):
                lnew.append(a+" " + b)
        return lnew
    
def decode(swe, t, model):
    words = swe.split()
    candidate_sentences = []
    # Find the 3 english words that are most probable to be aligned with each swedish word
    for word in words:
        sentence = []
        mc = Counter(t[word]).most_common(3)
        for i,_ in mc:
            sentence.append(i)
        candidate_sentences.append(sentence)
    perm_sentences = permute(candidate_sentences)
    
    max_prob = -np.inf
    best_sentence = "UNKNOWN"
    for s in perm_sentences:
        perm = list(permutations(s.split()))
        for p in perm:
            eng_sentence = ' '.join(p)
            prob = language_model(eng_sentence, model) + translation_model(swe, eng_sentence, t)
            if prob >= max_prob:
                max_prob = prob
                best_sentence = eng_sentence
    return best_sentence

In [39]:
Counter(t["domstolen"]).most_common(5)

[('court', 0.2908739985965263),
 ('acquitted', 0.15787944446357263),
 ('referrals', 0.026387736694697846),
 ('non-application', 0.026387736694697846),
 ('non-transposition', 0.026387736694697846)]

In [28]:
decode("jag är bra", t, model)

'i am good'

In [29]:
decode("jag är mycket bra", t, model)

'i am very good'

In [30]:
decode("mycket bra är jag", t, model)

'i am very good'

In [31]:
decode("jag mycket är bra", t, model)

'i am very good'

In [32]:
decode("våra medlemsstater", t, model)

'our member'

In [33]:
decode("jag vill vet om jag", t, model)

'i know if i want'

In [35]:
decode("domstolen har friat honom", t, model)

'have acquitted acquitted him'

In [36]:
simple_decode("domstolen har friat honom", t)

'court has acquitted him'

In [1010]:
language_model("our states", model)

-11.130075819478984

In [1085]:
translation_model("våra medlemsstater", "our member", t)

-2.8656150459222722

In [1086]:
translation_model("våra medlemsstater", "our states", t)

-2.600741149878915