In [35]:
# This method takes the input file name and returns the text from whole file
# If the notebook and file toread are not in the same directory, the whole path with file name is required
def load_file(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [36]:
# This method creates list of pair of sentences where each sentence is a list of token
def generatePair(text1, text2):
    pairs = []
    text1 = text1.lower().strip().split("\n")
    text2 = text2.lower().strip().split("\n")
    for index, eSentence in enumerate(text1):
        pairs.append((text2[index].split(), text1[index].split()))
    return pairs

In [37]:
# This method generates all translation probability using IBM 1 model
# it iterates until all translation probability difference(previous and current) are less than a threshold 
def calculateTranslationProbability(pairs, threshold):
    currentTranslationProbability = calculateUniformTranslationProbabilities(pairs)
    previousTranslationProbability = {}
    
    iteratoinCount =0;
    while not IsTranslationProbConverged(previousTranslationProbability, currentTranslationProbability, threshold):
        previousTranslationProbability = currentTranslationProbability.copy()
        
        countFE = {}
        totalE = {}
        #Initialize all count(f,e) and total(e) to 0
        for pair in pairs:
            for wordF in pair[0]:
                for wordE in pair[1]:
                    countFE[wordF + "|" + wordE] = 0
                    totalE[wordE] = 0
        for pair in pairs:
            for wordF in pair[0]:
                sumF = 0
                for wordE in pair[1]:
                    sumF += currentTranslationProbability[wordF + "|" + wordE]
                for wordE in pair[1]:
                    countFE[wordF + "|" + wordE] += (currentTranslationProbability[wordF + "|" + wordE] / sumF)
                    totalE[wordE] +=  (currentTranslationProbability[wordF + "|" + wordE] / sumF)
        for key in currentTranslationProbability:
            tmpKeys = key.split('|')
            currentTranslationProbability[key] = countFE[key] / totalE[tmpKeys[1]]
        iteratoinCount += 1
#        if iteratoinCount % 100 == 0:
#            print("************Current Iteration Count: ", iteratoinCount)
    return currentTranslationProbability;

In [38]:
# This method generates initial posterior probability P(f|e) uniformly 
# by counting count(f,e) and count(e) in the whole data set
def calculateUniformTranslationProbabilities(pairs):
    uniformPosterior = {}
    FECount = {}
    ECount = {}
    for pair in pairs:
        for wordF in pair[0]:
            for wordE in pair[1]:
                key = wordF + "|" + wordE
                if key in FECount.keys():
                    FECount[key] += FECount[key]
                else:
                    FECount[key] = 1
                    
                if wordE in ECount.keys():
                    ECount[wordE] += 1
                else:
                    ECount[wordE] = 1
    
    for pair in pairs:
        for wordF in pair[0]:
            for wordE in pair[1]:
                key = wordF + "|" + wordE
                try:
                    uniformPosterior[key] = FECount[key] / ECount[wordE]
                except:
                    uniformPosterior[key] = 0.000001

    return uniformPosterior

In [39]:
# This method checks whether two translation probability dictionary are 
# equal or within a threshold difference or not
def IsTranslationProbConverged(previousTranslationProbabilities, currentTranslationProbabilities, threshold):
    if(len(previousTranslationProbabilities) == 0 or len(currentTranslationProbabilities) == 0):
        return False
    if previousTranslationProbabilities == currentTranslationProbabilities:
        return True
    else:
        for key in previousTranslationProbabilities:
            if abs(previousTranslationProbabilities[key] - currentTranslationProbabilities[key]) > threshold :
                return False
            
        return True

In [47]:
# This method reuturns the best alignment of word in a pair of sentence
def calculateAlignment(translationProbability, pairs):
    
    alignedWordList = []
    alignedWordIndextList = []
    for pair in pairs:
        alignedWordListPerPair = dict()
        alignedWordIndextListPerPair = []
        for index1, wordF in enumerate(pair[0]):
            bestAlignment = 0;
            bestAligedWord = ""
            bestAligedWordIndex = 0
            
            for index2, wordE in enumerate(pair[1]):
                key = wordF + "|" + wordE
                if translationProbability[key] > bestAlignment :
                    bestAlignment = translationProbability[key]
                    bestAligedWord = wordE
                    bestAligedWordIndex = index2
                    
            alignedWordListPerPair[(wordF, bestAligedWord)] = bestAlignment
            alignedWordIndextListPerPair. append((index1, bestAligedWordIndex))
            
        alignedWordList.append(alignedWordListPerPair)
        alignedWordIndextList.append(alignedWordIndextListPerPair)
        
    return alignedWordList, alignedWordIndextList

## Problem 1

In [41]:
germanText = load_file('de-en/de-en.de')
englishText = load_file('de-en/de-en.en')
pairs = generatePair(englishText, germanText)
translationProb = calculateTranslationProbability(pairs, 0.001)

### Analysis and Data Sturcture used
All of the translation probabilities are stored in a dictionary with key as string of f|e . Since using a sparse matrix will take more space, i used here dictionary. For spare matrix, it will take (numberUniqueofEnglishword * numberOfUniqueGermanWord) cells

## Problem 2

In [45]:
alignemdWord, alignment = calculateAlignment(translationProb, pairs[:3])
for index, a in enumerate(alignemdWord):
    print("Alignment Probability:: ",a)
    print("Alignment:: ", alignment[index])

Alignment Probability::  {('reprise', 'session'): 0.5, ('de', 'session'): 0.45643054327147176, ('la', 'session'): 0.3548964891529032, ('session', 'session'): 0.5}
Alignment::  [(0, 3), (1, 3), (2, 3), (3, 3)]
Alignment Probability::  {('je', '.'): 0.99017164383373, ('déclare', '.'): 0.07751931914226108, ('reprise', '.'): 0.5, ('la', '.'): 0.3099292239925069, ('session', '.'): 0.5, ('du', '.'): 0.25796567488196975, ('parlement', '.'): 1.0, ('européen', '.'): 0.2662907774759793, ('qui', '.'): 0.16871712371585584, ('avait', '.'): 0.07751931914226108, ('été', '.'): 0.13346568399152908, ('interrompue', '.'): 0.07751931914226108, ('le', '.'): 0.1951596361717784, ('vendredi', '.'): 0.5, ('17', '.'): 0.5597170916694157, ('décembre', '.'): 0.61965197999123, ('dernier', '.'): 0.5, ('et', '.'): 0.9596941181462424, ('vous', '.'): 1.0, ('renouvelle', '.'): 0.07751931914226108, ('tous', '.'): 0.07751931914226108, ('mes', '.'): 0.07323201155729638, ('vux', '.'): 0.33370643195200134, ('en', '.'): 0.40

In the above, The word alignment is printed in following format for each sentence pair: (german, english) : probability.

## Problem 3

In [48]:
frenchText = load_file('fr-en/fr-en.fr')
englishText = load_file('fr-en/fr-en.en')
pairs = generatePair(englishText, frenchText)

translationProb = calculateTranslationProbability(pairs, 0.001)


In [49]:
alignemdWord, alignment = calculateAlignment(translationProb, pairs[:3])

for index, a in enumerate(alignemdWord):
    print("Alignment Probability:: ",a)
    print("Alignment:: ", alignment[index])

Alignment Probability::  {('reprise', 'session'): 0.5, ('de', 'of'): 0.45643054327147176, ('la', 'resumption'): 0.3548964891529032, ('session', 'session'): 0.5}
Alignment::  [(0, 3), (1, 1), (2, 0), (3, 3)]
Alignment Probability::  {('je', 'i'): 0.99017164383373, ('déclare', 'declare'): 0.07751931914226108, ('reprise', 'session'): 0.5, ('la', 'the'): 0.3099292239925069, ('session', 'session'): 0.5, ('du', '17'): 0.25796567488196975, ('parlement', 'parliament'): 1.0, ('européen', 'european'): 0.2662907774759793, ('qui', 'again'): 0.16871712371585584, ('avait', 'declare'): 0.07751931914226108, ('été', '17'): 0.13346568399152908, ('interrompue', 'declare'): 0.07751931914226108, ('le', 'the'): 0.1951596361717784, ('vendredi', 'friday'): 0.5, ('17', '17'): 0.5597170916694157, ('décembre', 'december'): 0.61965197999123, ('dernier', 'friday'): 0.5, ('et', 'and'): 0.9596941181462424, ('vous', 'you'): 1.0, ('renouvelle', 'declare'): 0.07751931914226108, ('tous', 'declare'): 0.07751931914226108,

In the above, The word alignment is printed in following format for each sentence pair: (french, english) : probability.