In [47]:
# This method takes the input file name and returns the text from whole file
# If the notebook and file toread are not in the same directory, the whole path with file name is required
def load_file(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [48]:
# This method creates list of pair of sentences where each sentence is a list of token
def generatePair(text1, text2):
    pairs = []
    text1 = text1.lower().strip().split("\n")
    text2 = text2.lower().strip().split("\n")
    for index, eSentence in enumerate(text1):
        pairs.append((text2[index].split(), text1[index].split()))
    return pairs

In [49]:
# This method generates all translation probability using IBM 1 model
# it iterates until all translation probability difference(previous and current) are less than a threshold 
def calculateTranslationProbability(pairs, threshold):
    currentTranslationProbability = calculateUniformTranslationProbabilities(pairs)
    previousTranslationProbability = {}
    
    iteratoinCount =0;
    while not IsTranslationProbConverged(previousTranslationProbability, currentTranslationProbability, threshold):
        previousTranslationProbability = currentTranslationProbability.copy()
        
        countFE = {}
        totalE = {}
        #Initialize all count(f,e) and total(e) to 0
        for pair in pairs:
            for wordF in pair[0]:
                for wordE in pair[1]:
                    countFE[wordF + "|" + wordE] = 0
                    totalE[wordE] = 0
        for pair in pairs:
            for wordF in pair[0]:
                sumF = 0
                for wordE in pair[1]:
                    sumF += currentTranslationProbability[wordF + "|" + wordE]
                for wordE in pair[1]:
                    countFE[wordF + "|" + wordE] += (currentTranslationProbability[wordF + "|" + wordE] / sumF)
                    totalE[wordE] +=  (currentTranslationProbability[wordF + "|" + wordE] / sumF)
        for key in currentTranslationProbability:
            tmpKeys = key.split('|')
            currentTranslationProbability[key] = countFE[key] / totalE[tmpKeys[1]]
        iteratoinCount += 1
#        if iteratoinCount % 100 == 0:
#            print("************Current Iteration Count: ", iteratoinCount)
    return currentTranslationProbability;

In [50]:
# This method generates initial posterior probability P(f|e) uniformly 
# by counting count(f,e) and count(e) in the whole data set
def calculateUniformTranslationProbabilities(pairs):
    uniformPosterior = {}
    FECount = {}
    ECount = {}
    for pair in pairs:
        for wordF in pair[0]:
            for wordE in pair[1]:
                key = wordF + "|" + wordE
                if key in FECount.keys():
                    FECount[key] += FECount[key]
                else:
                    FECount[key] = 1
                    
                if wordE in ECount.keys():
                    ECount[wordE] += 1
                else:
                    ECount[wordE] = 1
    
    for pair in pairs:
        for wordF in pair[0]:
            for wordE in pair[1]:
                key = wordF + "|" + wordE
                try:
                    uniformPosterior[key] = FECount[key] / ECount[wordE]
                except:
                    uniformPosterior[key] = 0.000001

    return uniformPosterior

In [51]:
# This method checks whether two translation probability dictionary are 
# equal or within a threshold difference or not
def IsTranslationProbConverged(previousTranslationProbabilities, currentTranslationProbabilities, threshold):
    if(len(previousTranslationProbabilities) == 0 or len(currentTranslationProbabilities) == 0):
        return False
    if previousTranslationProbabilities == currentTranslationProbabilities:
        return True
    else:
        for key in previousTranslationProbabilities:
            if abs(previousTranslationProbabilities[key] - currentTranslationProbabilities[key]) > threshold :
                return False
            
        return True

In [54]:
# This method reuturns the best alignment of word in a pair of sentence
def calculateAlignment(translationProbability, pairs):
    
    alignedWordList = []
    alignedWordIndextList = []
    for pair in pairs:
        alignedWordListPerPair = dict()
        alignedWordIndextListPerPair = []
        for index1, wordF in enumerate(pair[0]):
            bestAlignment = 0;
            bestAligedWord = ""
            bestAligedWordIndex = 0
            
            for index2, wordE in enumerate(pair[1]):
                key = wordF + "|" + wordE
                if translationProbability[key] > bestAlignment :
                    bestAlignment = translationProbability[key]
                    bestAligedWord = wordE
                    bestAligedWordIndex = index2
                    
            alignedWordListPerPair[(wordF, bestAligedWord)] = bestAlignment
            alignedWordIndextListPerPair. append((index1, bestAligedWordIndex))
            
        alignedWordList.append(alignedWordListPerPair)
        alignedWordIndextList.append(alignedWordIndextListPerPair)
        
    return alignedWordList, alignedWordIndextList


## Problem 1

In [55]:
germanText = load_file('de-en/de-en.de')
englishText = load_file('de-en/de-en.en')
pairs = generatePair(englishText, germanText)

# 0.0001 is the threshold to measure convergence
translationProb = calculateTranslationProbability(pairs[:100], 0.0001)

### Analysis and Data Sturcture used
All of the translation probabilities are stored in a dictionary with key as string of f|e . Since using a sparse matrix will take more space, i used here dictionary. For spare matrix, it will take (numberUniqueofEnglishword * numberOfUniqueGermanWord) cells

The IBM 1 model is ran on 100 sentence pair. Please change line 6 above **translationProb = calculateTranslationProbability(pairs[:100], 0.0001)** to **translationProb = calculateTranslationProbability(pairs, 0.0001)** to run it for all sentence pair

## Problem 2

In [56]:
alignemdWord, alignment = calculateAlignment(translationProb, pairs[:3])
for index, a in enumerate(alignemdWord):
    print("Alignment Probability:: ",a)
    print("Alignment:: ", alignment[index])

Alignment Probability::  {('wiederaufnahme', 'resumption'): 0.9993893603259384, ('der', 'of'): 0.7526721556935149, ('sitzungsperiode', 'session'): 0.9999999994181575}
Alignment::  [(0, 0), (1, 1), (2, 3)]
Alignment Probability::  {('ich', 'i'): 1.0, ('erkläre', 'declare'): 0.05783786644749496, ('die', 'the'): 0.2616889385112021, ('am', 'on'): 0.2698306407656617, ('freitag', 'declare'): 0.05783786644749496, (',', ','): 1.0, ('dem', 'wish'): 0.48377630550734607, ('17.', 'declare'): 0.05783786644749496, ('dezember', 'declare'): 0.05783786644749496, ('unterbrochene', 'declare'): 0.05783786644749496, ('sitzungsperiode', 'session'): 0.9999999994181575, ('des', 'year'): 0.7340922757519008, ('europäischen', 'european'): 1.0, ('parlaments', 'declare'): 0.05783786644749496, ('für', 'period'): 0.8505566148302802, ('wiederaufgenommen', 'declare'): 0.05783786644749496, ('wünsche', 'declare'): 0.05783786644749496, ('ihnen', 'declare'): 0.05783786644749496, ('nochmals', 'once'): 0.22919728167758996, 

In the above, The word alignment and probability is printed in following format for each sentence pair:

word alignment probability- **(german, english) : probability**.

word alignment- **(germanWordIndex, englishWordIndex)**

## Problem 3

In [57]:
frenchText = load_file('fr-en/fr-en.fr')
englishText = load_file('fr-en/fr-en.en')
pairs = generatePair(englishText, frenchText)

# 0.0001 is the threshold to measure convergence
translationProb = calculateTranslationProbability(pairs[:100], 0.0001)

In [58]:
alignemdWord, alignment = calculateAlignment(translationProb, pairs[:3])

for index, a in enumerate(alignemdWord):
    print("Alignment Probability:: ",a)
    print("Alignment:: ", alignment[index])

Alignment Probability::  {('reprise', 'session'): 0.7126420590572695, ('de', 'of'): 0.5105043147838677, ('la', 'the'): 0.39854929347455204, ('session', 'resumption'): 0.5468638889694714}
Alignment::  [(0, 3), (1, 1), (2, 2), (3, 0)]
Alignment Probability::  {('je', 'i'): 1.0, ('déclare', 'declare'): 0.05593406909767865, ('reprise', 'session'): 0.7126420590572695, ('la', 'the'): 0.39854929347455204, ('session', 'year'): 0.4252866183196726, ('du', 'the'): 0.1623967140682424, ('parlement', 'parliament'): 0.9999999999777522, ('européen', 'declare'): 0.05498054657744576, ('qui', 'european'): 0.626069872489693, ('avait', 'declare'): 0.05593406909767865, ('été', 'new'): 0.11552025597133657, ('interrompue', 'declare'): 0.05593406909767865, ('le', 'would'): 0.28630802310956666, ('vendredi', 'declare'): 0.05593406909767865, ('17', 'declare'): 0.05593406909767865, ('décembre', 'declare'): 0.05593406909767865, ('dernier', 'declare'): 0.05593406909767865, ('et', 'and'): 1.0, ('vous', 'you'): 0.9316

In the above, The word alignment and probability is printed in following format for each sentence pair:

word alignment probability- **(french, english) : probability**.

word alignment- **(frenchWordIndex, englishWordIndex)**

The IBM 1 model is ran on 100 sentence pair. Please change line 6 above **translationProb = calculateTranslationProbability(pairs[:100], 0.0001)** to **translationProb = calculateTranslationProbability(pairs, 0.0001)** to run it for all sentence pairs