In [1]:
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from typing import List, Set, Dict
from scipy.sparse import dok_matrix
import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read both sides of the Europarl sv-en corpus into lists with one entry per line.
# The `with` blocks close each file even when reading fails part-way through.
with open("C:/Users/alexm/Desktop/word_alignments/europarl-v7.sv-en.en", 'r', encoding='utf8') as f:
    en_corp = f.readlines()

with open("C:/Users/alexm/Desktop/word_alignments/europarl-v7.sv-en.sv", 'r', encoding='utf8') as g:
    sv_corp = g.readlines()

## Dataset stats

In [3]:
# Number of lines read from each file. The training loop pairs line i of the
# English list with line i of the Swedish list, so the two counts must match.
print(len(en_corp))
print(len(sv_corp))

1862234
1862234


Since my naive implementation of the word alignment algorithm is slow, I restrict both corpora to their first 100k lines.

In [4]:
# Keep only the first MAX_LINES lines of each corpus. A single constant guarantees
# that both sides are cut at the same position, which the line-by-line pairing
# in the training loop relies on.
MAX_LINES = 100000
en_corp = en_corp[:MAX_LINES]
sv_corp = sv_corp[:MAX_LINES]

In [5]:
# Number of unique tokens in the two corpora.
def get_unique_tokens(corp: List[str]) -> Set[str]:
    """Collect the distinct tokens that occur anywhere in a corpus.

    Args:
        corp: The corpus as a list of raw text lines.

    Returns:
        The set of all non-empty tokens produced by the regexp tokenizer
        (word characters, optionally followed by an apostrophe and more
        word characters).
    """
    word_tokenizer = RegexpTokenizer(r'\w+\'?\w*')
    vocabulary: Set[str] = set()
    for sentence in corp:
        vocabulary.update(
            tok for tok in word_tokenizer.tokenize(sentence) if tok != ''
        )
    return vocabulary

In [6]:
# Vocabulary (set of distinct tokens) of the English corpus.
en_unique_tokens = get_unique_tokens(en_corp)

In [7]:
# Vocabulary (set of distinct tokens) of the Swedish corpus.
sv_unique_tokens = get_unique_tokens(sv_corp)

In [9]:
# Transforming tokens to int IDs so we can use a sparse matrix in the algorithm.
def assign_token_ids(unique_tokens: Set[str]) -> Dict[str, int]:
    """Map every token to a distinct integer ID in ``range(len(unique_tokens))``.

    Tokens are numbered in sorted order. Iterating the set directly would tie
    the numbering to string hashing, which can differ from one interpreter run
    to the next; sorting makes the mapping reproducible.

    Args:
        unique_tokens: The distinct tokens to number.

    Returns:
        A dictionary from token to its ID; IDs are consecutive and start at 0.
    """
    return {token: token_id for token_id, token in enumerate(sorted(unique_tokens))}

In [11]:
# Token -> integer ID lookups, one per language. The IDs are later used as
# indices into the params matrix (Swedish ID first, English ID second).
en_token_ids = assign_token_ids(en_unique_tokens)
sv_token_ids = assign_token_ids(sv_unique_tokens)

In [12]:
# Report the vocabulary size of each language.
print("Number of unique tokens in the English corpus: \n" + str(len(en_unique_tokens)))
print("Number of unique tokens in the Swedish corpus: \n" + str(len(sv_unique_tokens)))

Number of unique tokens in the English corpus: 
32330
Number of unique tokens in the Swedish corpus: 
71260


In [13]:
# Product of the two vocabulary sizes, i.e. the number of cells in a full
# Swedish-by-English token table.
print(len(en_unique_tokens) * len(sv_unique_tokens))

2303835800


If we hadn't truncated the corpora, there would have been roughly 300B possible (Swedish, English) word pairs. With the truncation, that number drops to about 2.3B.

In [14]:
def tokenise_corpus(corp: List[str]) -> List[List[str]]:
    """Split every line of a corpus into its tokens.

    Args:
        corp: The corpus as a list of raw text lines.

    Returns:
        One token list per input line, in the same order as ``corp``.
    """
    word_tokenizer = RegexpTokenizer(r'\w+\'?\w*')
    return [word_tokenizer.tokenize(sentence) for sentence in corp]

In [15]:
# Token lists for every line of both corpora; index i in one list corresponds
# to index i in the other.
en_tokenised_corp = tokenise_corpus(en_corp)
sv_tokenised_corp = tokenise_corpus(sv_corp)

In [16]:
# Spot-check the tokenisation on one English line.
en_tokenised_corp[100]

['Madam',
 'President',
 'I',
 'do',
 'not',
 'wish',
 'to',
 'reopen',
 'the',
 'debate',
 'but',
 'I',
 'had',
 'also',
 'asked',
 'for',
 'the',
 'floor',
 'to',
 'comment',
 'on',
 'Mr',
 'Barón',
 "Crespo's",
 'motion']

In [17]:
# Uniform starting value used in place of params during the first training
# iteration: one over the Swedish vocabulary size.
theta_init = 1/len(sv_unique_tokens)
print(theta_init)

1.4033118158854897e-05


In [18]:
# Data structures needed for the algorithm.
# counts: expected count per (Swedish word, English word) pair.
# counts_english: expected count per English word.
# Both dictionaries are re-created at the start of every training iteration.
counts = dict()
counts_english = dict()
# params: sparse matrix with one row per Swedish token ID and one column per
# English token ID; it stores the ratios computed at the end of each iteration.
params = dok_matrix((len(sv_unique_tokens),len(en_unique_tokens)), dtype=np.float32)

In [21]:
# IBM Model 1 word alignment, trained with expectation-maximisation (EM).
#
# params[sv_id, en_id] stores the current estimate for a (Swedish, English)
# token pair. Every pass over the corpus does:
#   * E-step: each Swedish token spreads one unit of expected count over the
#     English tokens of the same sentence pair, in proportion to the current
#     estimates (the uniform theta_init during the first pass);
#   * M-step: each pair count is divided by the total count of its English
#     word and the ratio is written back into params.
#
# Each params entry is read once per token pair and reused for both the
# normaliser and the expected count; single-element reads of the sparse matrix
# are the expensive part of the later iterations.
start_time = time.time()

nr_iterations = 5
for iteration in range(nr_iterations):
    print("Iteration %i started."%(iteration))

    # Reset the counts.
    counts = dict()
    counts_english = dict()

    for ii, en_sentence in enumerate(en_tokenised_corp):
        sv_sentence = sv_tokenised_corp[ii]

        # Progress report: sentence index and seconds elapsed since the start.
        if ii % 10000 == 0:
            print(ii)
            print(time.time() - start_time)

        # The English IDs are identical for every Swedish token of this pair,
        # so resolve them once. They are not needed while theta_init is used.
        if iteration > 0:
            en_ids = [en_token_ids[en_word] for en_word in en_sentence]

        for sv_word in sv_sentence:
            if iteration == 0:
                pair_probs = [theta_init] * len(en_sentence)
            else:
                sv_id = sv_token_ids[sv_word]
                pair_probs = [params[sv_id, en_id] for en_id in en_ids]

            # Left-to-right sum starting from 0, same order as a running total.
            normalisation_term = sum(pair_probs)

            # Nothing to distribute when the English side is empty or every
            # estimate is zero (or NaN); dividing anyway would inject NaN into
            # counts_english and from there into every params entry of the word.
            if not normalisation_term > 0:
                continue

            for en_word, prob in zip(en_sentence, pair_probs):
                expected_count = prob / normalisation_term

                pair = (sv_word, en_word)
                counts[pair] = counts.get(pair, 0) + expected_count
                counts_english[en_word] = counts_english.get(en_word, 0) + expected_count

    # M-step: turn the expected counts into the new estimates.
    for (sv_word, en_word), pair_count in counts.items():
        params[sv_token_ids[sv_word], en_token_ids[en_word]] = pair_count / counts_english[en_word]

print("Total elapsed time: %f"%(time.time()-start_time))

Iteration 0 started.
0
0.00049591064453125
10000
7.0302910804748535
20000
14.689014434814453
30000
22.645759105682373
40000
30.82577681541443
50000
39.21361780166626
60000
47.932785511016846
70000
57.48820948600769
80000
67.04710459709167
90000
76.79299092292786
Iteration 1 started.
0
246.7475836277008
10000
436.6368787288666
20000
632.462290763855
30000
831.3624000549316
40000
1010.1958744525909
50000
1178.241361618042
60000
1335.6378767490387
70000
1508.6536049842834
80000
1677.3288881778717
90000
1841.9046199321747
Iteration 2 started.
0
2151.2088871002197
10000
2312.7137899398804
20000
2484.801346540451
30000
2654.1228511333466
40000
2806.119827270508
50000
2958.5612194538116
60000
3128.178550720215
70000
3324.0444185733795
80000
3521.647987127304
90000
3716.576631784439
Iteration 3 started.
0
4086.761109352112
10000
4280.202750921249
20000
4475.504882335663
30000
4672.576740503311
40000
4849.30916261673
50000
5029.074121952057
60000
5215.2483830451965
70000
5415.835385560989
80000

Most probable Swedish word for each English word in the first 10 sentence pairs, after 1 iteration.

In [30]:
# For every English token of the first 10 sentence pairs, print the Swedish
# token from the same pair with the largest params entry. The comparison is
# strict, so the earliest Swedish token wins on ties.
for sent_idx in range(10):
    en_sentence = en_tokenised_corp[sent_idx]
    sv_sentence = sv_tokenised_corp[sent_idx]
    for en_word in en_sentence:
        top_score, top_sv_word = -1, ''
        for candidate in sv_sentence:
            score = params[sv_token_ids[candidate], en_token_ids[en_word]]
            if score > top_score:
                top_score, top_sv_word = score, candidate
        print(en_word + ' - ' + top_sv_word)

Resumption - Återupptagande
of - av
the - av
session - sessionen
I - jag
declare - förklarar
resumed - efter
the - att
session - session
of - att
the - att
European - att
Parliament - Europaparlamentets
adjourned - Europaparlamentets
on - på
Friday - på
17 - 17
December - december
1999 - och
and - och
I - jag
would - vill
like - vill
once - en
again - nytt
to - att
wish - vill
you - ni
a - en
happy - att
new - nytt
year - år
in - att
the - att
hope - hoppas
that - att
you - ni
enjoyed - haft
a - en
pleasant - jag
festive - gott
period - den
Although - den
as - som
you - ni
will - i
have - har
seen - har
the - i
dreaded - kunnat
millennium - 2000
bug' - buggen
failed - har
to - som
materialise - stora
still - i
the - i
people - som
in - i
a - ett
number - antal
of - av
countries - i
suffered - drabbats
a - ett
series - antal
of - av
natural - naturkatastrofer
disasters - naturkatastrofer
that - som
truly - verkligen
were - som
dreadful - förskräckliga
You - Ni
have - har
requested - beg

In [137]:
# Tokens of the first English line.
print(en_tokenised_corp[0])

['Resumption', 'of', 'the', 'session']


In [139]:
# Tokens of the first Swedish line.
print(sv_tokenised_corp[0])

['Återupptagande', 'av', 'sessionen']


In [29]:
# Dump the params entry of every (English, Swedish) token combination in the
# first sentence pair: the pair on one line, its value on the next.
first_en_sentence = en_tokenised_corp[0]
first_sv_sentence = sv_tokenised_corp[0]
for en_word in first_en_sentence:
    for sv_word in first_sv_sentence:
        print(
            en_word + " - " + sv_word,
            params[sv_token_ids[sv_word], en_token_ids[en_word]],
            sep="\n",
        )

Resumption - Återupptagande
0.4459885
Resumption - av
0.22032505
Resumption - sessionen
0.3334858
of - Återupptagande
4.469413e-17
of - av
0.19555347
of - sessionen
6.3065104e-12
the - Återupptagande
1.566281e-18
the - av
0.069594085
the - sessionen
6.8032455e-12
session - Återupptagande
8.53622e-06
session - av
0.022391718
session - sessionen
0.088088855


Pretty cool!