In the implementation part of the module, you will in some cases be told exactly what to do, and in other cases you will have to come up with your own solution to a tricky problem. In the latter cases, it is crucial that you describe the choices you made and your reasoning behind them. 

# A) Warm up

Let's assume that we pick a word completely randomly from the European parliament proceedings. According to your estimate, what is the probability that it is speaker? What is the probability that it is zebra?     

In [None]:
from collections import Counter

# ten most common words for every language
not_english = ['europarl-v7.de-en.lc.de', 'europarl-v7.fr-en.lc.fr', 'europarl-v7.sv-en.lc.sv']

for language in not_english:
    text = ""
    with open(language) as stream:
        text = stream.read()
    words = text.split(" ") # string to list
    c= Counter(words) 
    print("For "+ language + " it's: ")
    print(c.most_common(10))

For europarl-v7.de-en.lc.de it's: 
[(',', 18549), ('die', 9649), ('der', 9139), ('und', 6920), ('in', 3934), ('zu', 3136), ('den', 2955), ('daß', 2725), ('von', 2448), ('für', 2432)]
For europarl-v7.fr-en.lc.fr it's: 
[('&apos;', 16729), (',', 15400), ('de', 14444), ('la', 9239), ('et', 6539), ('l', 6254), ('le', 5733), ('à', 5353), ('les', 5260), ('des', 5195)]
For europarl-v7.sv-en.lc.sv it's: 
[('att', 9138), (',', 8875), ('och', 6950), ('i', 5599), ('som', 4958), ('för', 4699), ('det', 4524), ('av', 3979), ('är', 3802), ('en', 3632)]


In [None]:
#merging all english docs
tot_text = []
all_english = ['europarl-v7.de-en.lc.en', 'europarl-v7.fr-en.lc.en', 'europarl-v7.sv-en.lc.en']

for language in all_english:
    text=""
    with open(language) as stream:
        text = stream.read()
    words = text.split(" ")
    tot_text.extend(words)

c= Counter(tot_text)
print("For English it's: ")
print(c.most_common(10))

For English it's: 
[('the', 55362), (',', 42038), ('of', 28281), ('to', 26752), ('and', 21257), ('in', 17040), ('is', 13216), ('a', 12801), ('that', 12729), ('for', 8705)]


In [None]:
the_parliament = []
the_parliament.extend(not_english)
the_parliament.extend(all_english)

tot_text =[]
for language in the_parliament:
    text=""
    with open(language) as stream:
        text = stream.read()
    words = text.split(" ")
    tot_text.extend(words)
    
c= Counter(tot_text)
print("For the whole parliament there is a: ")
print(str(c['speaker']/c.total()*100) + "% " + "prob for 'speaker'and a ")
print(str(c['zebra']/c.total()*100) + "% " + "prob for 'zebra'" )

For the whole parliament there is a: 
0.002003123658893535% prob for 'speaker'and a 
0.0% prob for 'zebra'


## B) Language Modeling  - bigram model  (loaded with the parliament)

Found a nice inspirational source: https://dev.to/amananandrai/language-model-implementation-bigram-model-22ij

In [None]:
#Generate sentences from all data
the_parliament = []
the_parliament.extend(not_english)
the_parliament.extend(all_english)

complete_sentences =[]
for language in the_parliament:
    text=""
    with open(language) as stream:
        text = stream.read()
    words = text.split(" .\n")
    complete_sentences.extend(words)   

In [None]:
#read in the data to compare
def readData(listOfSents):
    inData = listOfSents
    listOfWords=[]
    for i in range(len(inData)):
        for word in inData[i].split():
            if word != ",":         ## does this work as intended? 
                listOfWords.append(word)
                
    return listOfWords

In [None]:
#creat bigram and unigram together with a list that hold them
def createBigram(data):
   listOfBigrams = []
   bigramCounts = {}
   unigramCounts = {}
   for i in range(len(data)-1):
      if i < len(data) - 1 and data[i+1].islower():

         listOfBigrams.append((data[i], data[i + 1]))

         if (data[i], data[i+1]) in bigramCounts:
            bigramCounts[(data[i], data[i + 1])] += 1  
         else:
            bigramCounts[(data[i], data[i + 1])] = 1

      if data[i] in unigramCounts:
         unigramCounts[data[i]] += 1
      else:
         unigramCounts[data[i]] = 1
   return listOfBigrams, unigramCounts, bigramCounts

In [None]:
#compute the probability of words following each other
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        word2 = bigram[1]
        listOfProb[bigram] = (bigramCounts.get(bigram))/(unigramCounts.get(word1))
    return listOfProb

In [None]:
#scripting some nice prints
if __name__ == '__main__':
    data = readData(complete_sentences)
    #print("\n HEY LOOK: ")
    #print(data[:20])
    listOfBigrams, unigramCounts, bigramCounts = createBigram(data)

    #print("\n Some possible Bigrams are ")
    #print(listOfBigrams[0:10])

    #print("\n A Bigram along with its frequency ")
    #print (str(listOfBigrams[0]) + ": ") 
    #print(bigramCounts.get(listOfBigrams[0]))

    #print("\n And the Unigram for \"" + str(listOfBigrams[0][0]) + "\" along with its frequency of: ")
    #print(unigramCounts.get(listOfBigrams[0][0]))

    bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

    #print("\n Bigrams along with their probability ")
    #print(bigramProb)
    inputList="ich erkläre die am freitag"
    splt=inputList.split()
    outputProb = 1
    bilist=[]
    bigrm=[]

    for i in range(len(splt) - 1):
        if i < len(splt) - 1:
            bilist.append((splt[i], splt[i + 1]))

    #print("\n The bigrams in given sentence are ")
    #print(bilist)
   
    for i in range(len(bilist)):
        if bilist[i] in bigramProb:
            outputProb *= bigramProb[bilist[i]]
        else:
            outputProb *= 0
    print('\n' + 'Probablility of sentence \"' + inputList + '\" = ' + str(outputProb))


Probablility of sentence "ich erkläre die am freitag" = 8.839638675847781e-09


What happens if you try to compute the probability of a sentence that contains a word that did not appear in the training texts? ANS: currently the probability is zero for that word and hence the sentence also has prob = 0. 

"Also if an unknown word comes in the sentence then the probability becomes 0. This problem of zero probability can be solved with a method known as Smoothing. In Smoothing, we assign some probability to unknown words also. Two very famous smoothing methods are: Laplace Smoothing & Good Turing" - Source of boilerplate code 

And what happens if your sentence is very long (e.g. 100 words or more)? ANS: many fairly small probabilities become minuscule, in the end practically zero. (fact check)

## C) Translation modeling

Self-check: if our goal is to translate from some language into English, why does our conditional probability seem to be written backwards? Why don't we estimate P(e|f) instead? ANS: Using Bayes rule (the proportional version) we can decompose the probability at hand and gain division of the tasks of keeping track of contents and fluency, while it also is easier to train the model. From the division of labor, we hence get one language model and one translation model.  (WE MIGHT WANT TO WRITE IT OUT AS A OBJ FUNC TO OPT)

Write code that implements the estimation algorithm for IBM model 1. Then print, for either Swedish, German, or French, the 10 words that the English word european is most likely to be translated into, according to your estimate. It can be interesting to look at this list of 10 words and see how it changes during the EM iterations.

In [8]:
from os import linesep
import string
from collections import Counter, defaultdict
import numpy as np

dataset = {'de-en-de': 'europarl-v7.de-en.lc.de',
          'de-en-en': 'europarl-v7.de-en.lc.en',
          'fr-en-fr': 'europarl-v7.fr-en.lc.fr',
          'fr-en-en': 'europarl-v7.fr-en.lc.en',
          'sv-en-sv': 'europarl-v7.sv-en.lc.sv',
          'sv-en-en': 'europarl-v7.sv-en.lc.en',}

class text_parser:
    def __init__(self, data = dataset):
        self.data = data
        self.sentences = []
        self.words = {}
        self.keys = []

    def parse(self, keys):
        self.keys = keys
        for key in keys:
            with open(self.data[key], 'r') as file:
                sentence = file.readlines()
                words = []
                for s in sentence:
                    words_in_sentence = [word for word in s.split() if word not in string.punctuation]
                    self.sentences.append(words_in_sentence)
                    words.extend(words_in_sentence)
                self.words[key] = words
        
    def count(self, n = 10):
        for key in self.keys:
            most_common = Counter(self.words[key]).most_common(n)
            print('\nFor dataset: '+str(key) +', the most common words are:')
            print(*most_common, sep = "\n")
    
    def get_words(self, all_words = True, unique = False):
        if all_words: 
            all_words = [word for words in self.words.values() for word in words]
            
            if unique: 
                return np.unique(all_words)
            else: 
                return all_words
        else: return self.words
    
    def get_sent(self, n = -1):
        if n == -1: return self.sentences
        else: return self.sentences[0:n]
  
    def prob_words(self,word_list):
        all_words = self.get_words(all_words = True)
        C = Counter(all_words)
        for word in word_list:
            if C[word] == 0: print(word+" is not used in: "+ str(self.keys))
            else: print("Probability of: "+ word +" in "+str(self.keys)+ " is: "+str(C[word]/C.total()))

   

In [9]:
parser = text_parser()
parser.parse(['de-en-de', 'de-en-en'])
parser.count()


For dataset: de-en-de, the most common words are:
('die', 10521)
('der', 9374)
('und', 7028)
('in', 4175)
('zu', 3168)
('den', 2976)
('wir', 2863)
('daß', 2738)
('ich', 2670)
('das', 2669)

For dataset: de-en-en, the most common words are:
('the', 19847)
('of', 9597)
('to', 9059)
('and', 7303)
('in', 6237)
('is', 4478)
('that', 4441)
('a', 4435)
('we', 3372)
('this', 3362)


In [None]:
parser2 = text_parser()
parser2.parse(['fr-en-en', 'de-en-en','sv-en-en'])
parser2.prob_words(['speaker','zebra'])

Probability of: speaker in ['fr-en-en', 'de-en-en', 'sv-en-en'] is: 4.23327120259538e-05
zebra is not used in: ['fr-en-en', 'de-en-en', 'sv-en-en']


In [None]:
all_words = parser2.get_words(all_words =True)
print(all_words[:10])

['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned']


In [10]:
parser3 = text_parser()
parser3.parse(['sv-en-sv'])
parser3.get_sent(n=2)

[['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '&quot;',
  'den',
  'stora',
  'år',
  '2000-buggen',
  '&quot;',
  'aldrig',
  'rum',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga']]

In [13]:
class translationModel: 
    def __init__(self, orig_lang, trans_lang):

        #Initiate "orignal" & "translated" data
        orig_parser = text_parser()
        trans_parser = text_parser()

        #Parse "orignal" & "translated" data
        orig_parser.parse(orig_lang)
        trans_parser.parse(trans_lang)

        self.orig_sent = orig_parser.get_sent()
        self.trans_sent = trans_parser.get_sent()
        #self.trans_probs = None

        self.orig_words = orig_parser.get_words(all_words=True, unique = True)
        self.trans_words = trans_parser.get_words(all_words=True, unique = True)
        
    def calculate_translation_probs(self, n_iter=5, small_value = 0.01):
        #initiate transition probabilities to some small value
        self.trans_probs = defaultdict(lambda: defaultdict(lambda: small_value))
      
        #Define # of EM iterations 
        for i in range(1,n_iter): 

            #Set all counts c(o,t) and c(t) to 0
            ot_count = defaultdict(lambda: defaultdict(lambda: small_value))
            o_count = defaultdict(lambda: small_value)

            #For each sentence pair
            for o_words, t_words in zip(self.orig_sent,self.trans_sent):

                #o_words = o_sent.split()
                #t_words = t_sent.split()

                #Include NULL word to original sentence
                o_words += ["NULL"]

                #For each original word
                for t_word in t_words:

                    #Get transition probability sum
                    tp_sum = 0

                    for o_word in o_words:
                        tp_sum+=self.trans_probs[o_word][t_word]

                    #For each translated word (and null word)
                    for o_word in o_words:

                        #Compute alignment probability
                        align_prob = self.trans_probs[o_word][t_word]/tp_sum

                        #Update Pseudocount of c(o,t)
                        ot_count[o_word][t_word] += align_prob

                        #Update Pseudocount of c(t)
                        o_count[o_word] += align_prob
            print('test')
            # Reestimate transition probabilities
            for o_word, ot_dict in ot_count.items():
                for t_word, _ in ot_dict.items():
                    self.trans_probs[o_word][t_word] = ot_count[o_word][t_word] / o_count[o_word]

    def get_similar_words(self,word, n = 10):
        word_probs = list(self.trans_probs[word].items())
        word_probs.sort(key=lambda x: x[1], reverse=True)
        return word_probs[:n]



In [14]:
model = translationModel(['sv-en-en'],['sv-en-sv'])
model.calculate_translation_probs()
print(*model.get_similar_words(word="european"), sep ="\n")

test
test
test
test
('europeiska', 0.6177156544630333)
('europeisk', 0.04368390926320252)
('i', 0.033485668270207866)
('och', 0.024945272891128172)
('att', 0.02397303257217284)
('unionen', 0.02331994990234653)
('europaparlamentet', 0.02172010375981349)
('den', 0.02129427853973632)
('en', 0.02012773430033462)
('för', 0.01662425479044724)


## D) Decoding

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=27527141-1c2d-41eb-a7d6-3699f108d4e9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>