Imports

In [None]:
import pandas as pd
import collections
from collections import Counter
import re
import numpy as np
from IPython.display import display
import random
import itertools

Read data as DataFrames

In [None]:
# Load every side of the three parallel corpora as a single-column DataFrame
# (column 0, one sentence per row) by using the newline as the only separator.
# NOTE(review): some pandas releases reject sep='\n' with a ValueError -
# confirm against the installed pandas version.
# NOTE(review): the ".de" file is presumably the German side of the de-en
# corpus, so the name `danish_de_en` looks misleading - confirm before renaming.
danish_de_en = pd.read_csv("europarl-v7.de-en.lc.de", sep='\n', header=None)
english_de_en = pd.read_csv("europarl-v7.de-en.lc.en", sep='\n', header=None)
english_fr_en = pd.read_csv("europarl-v7.fr-en.lc.en", sep='\n', header=None)
french_fr_en = pd.read_csv("europarl-v7.fr-en.lc.fr", sep='\n', header=None)
english_sv_en = pd.read_csv("europarl-v7.sv-en.lc.en", sep='\n', header=None)
swedish_sv_en = pd.read_csv("europarl-v7.sv-en.lc.sv", sep='\n', header=None)

# a) Warm up

Convert df to list of words

In [None]:
# Takes a one-column dataframe (one sentence per row) and flattens it into one long list of words
def get_list_of_words(sentences_df):
    """Flatten a DataFrame of sentences into a single list of words.

    Args:
        sentences_df: DataFrame whose column ``0`` holds one sentence per row.
            The frame is not modified.

    Returns:
        A list containing the whitespace-separated tokens of every sentence,
        in row order.
    """
    # Split each sentence on whitespace. Rows with a missing value have no
    # tokens, so they are dropped instead of breaking the concatenation.
    tokenised = sentences_df[0].str.split().dropna()

    # chain.from_iterable concatenates in linear time, whereas
    # sum(list_of_lists, []) re-copies the accumulated list for every row.
    return list(itertools.chain.from_iterable(tokenised))

In [None]:
# Flatten both sides of the sv-en corpus into word lists. These lists still
# contain every whitespace-separated token, punctuation included.
swedish_words = get_list_of_words(swedish_sv_en)
english_words = get_list_of_words(english_sv_en)

Remove all entries that are not words

In [None]:
def only_words(list):
    """Strip non-alphanumeric characters from tokens and drop empty results.

    Args:
        list: Iterable of string tokens. The name shadows the builtin but is
            kept so that existing keyword calls continue to work.

    Returns:
        A new list with punctuation/symbols removed from every token; tokens
        that consisted only of such characters are omitted.
    """
    # [\W_] is Unicode-aware for str patterns, so letters such as å, ä and ö
    # survive. The previous ASCII-only class [^A-Za-z0-9] deleted them and
    # turned e.g. "för" into "fr" and "är" into "r".
    non_word_chars = re.compile(r'[\W_]+')
    stripped = (non_word_chars.sub('', token) for token in list)
    return [token for token in stripped if token]

# Cleaned word lists (symbols stripped, empty tokens dropped) used for the frequency statistics below
swe_only_words = only_words(swedish_words)
eng_only_words = only_words(english_words)

Get top 10 most common words using Counter

In [None]:
# Frequency of every cleaned word in each language
word_count_swe = collections.Counter(swe_only_words)
word_count_eng = collections.Counter(eng_only_words)


# most_common(10) yields (word, count) pairs ordered by descending count
swe_most_common = pd.DataFrame(word_count_swe.most_common(10), columns=['word', 'count'])
eng_most_common = pd.DataFrame(word_count_eng.most_common(10), columns=['word', 'count'])

# Show both tables one after the other
print("TOP 10 SWEDISH")
display(swe_most_common)
print("\n")
print("TOP 10 ENGLISH")
display(eng_most_common)

TOP 10 SWEDISH


Unnamed: 0,word,count
0,att,9181
1,och,7038
2,i,5951
3,det,5687
4,fr,5274
5,som,5028
6,r,4124
7,av,4013
8,en,3724
9,vi,3211




TOP 10 ENGLISH


Unnamed: 0,word,count
0,the,19322
1,of,9312
2,to,8801
3,and,6946
4,in,6090
5,is,4400
6,that,4357
7,a,4269
8,we,3223
9,this,3222


Probabilities of speaker and zebra

In [None]:
# Relative frequency of 'speaker': its count divided by the total number of cleaned English words
nr_eng_words = sum(word_count_eng.values())
freq_speaker = word_count_eng["speaker"]

# Relative frequency of 'zebra' (a Counter returns 0 for a word it has never seen)
freq_zebra = word_count_eng["zebra"]

# Both values are printed as percentages
print("Probability of speaker is: ", (freq_speaker/nr_eng_words)*100,"%", "\nProbability of zebra is: ", (freq_zebra/nr_eng_words)*100 , "%")

Probability of speaker is:  0.00391530414082566 % 
Probability of zebra is:  0.0 %


# b) Language modeling

In [None]:
# Example sentence that is scored by the language model at the end of this cell
sentence = " What are you "

# Computes the conditional probability of an English word given the word before it
_NGRAM_CACHE = {}


def _english_ngram_counts():
    """Return (unigram, bigram) Counters for `english_words`, built once and cached."""
    corpus = english_words
    # Rebuild only when the corpus list was replaced or changed length.
    if _NGRAM_CACHE.get("corpus") is not corpus or _NGRAM_CACHE.get("size") != len(corpus):
        _NGRAM_CACHE["corpus"] = corpus
        _NGRAM_CACHE["size"] = len(corpus)
        _NGRAM_CACHE["unigrams"] = Counter(corpus)
        # Adjacent pairs (previous word, current word)
        _NGRAM_CACHE["bigrams"] = Counter(zip(corpus, corpus[1:]))
    return _NGRAM_CACHE["unigrams"], _NGRAM_CACHE["bigrams"]


def cond_prob_words(word1, word2):
    """Estimate P(word1 | word2) from bigram counts in `english_words`.

    Args:
        word1: The word whose probability is estimated.
        word2: The word immediately preceding `word1`.

    Returns:
        0.0001 if either word never occurs in the corpus; otherwise
        count("word2 word1") / count(word2). An unseen bigram of two known
        words still yields 0.0 (no smoothing is applied to that case).
    """
    unigrams, bigrams = _english_ngram_counts()

    # Fixed small probability for words the corpus has never seen
    if unigrams[word1] == 0 or unigrams[word2] == 0:
        return 0.0001

    # The conditioning word (word2) is the denominator. Dividing by
    # count(word1) instead would estimate P(previous = word2 | current = word1).
    return bigrams[(word2, word1)] / unigrams[word2]

# Scores a sentence under the English bigram language model
def lang_model(sentence):
    """Return the product of bigram probabilities for `sentence`.

    The sentence is lower-cased and split on whitespace. Each word is scored
    with `cond_prob_words(word, previous_word)`; the first word uses "." as
    its previous word. A sentence without words yields 1.
    """
    tokens = sentence.lower().split()

    # Predecessor of every token: "." for the first one, then the token before it
    predecessors = ["."] + tokens[:-1]

    probs = 1
    for previous, current in zip(predecessors, tokens):
        probs = probs * cond_prob_words(current, previous)

    return probs

# Score the example sentence with the bigram language model and report the result
print("The probability of '", sentence, "' is: ", lang_model(sentence))


The probability of '  What are you  ' is:  1.3149981677692199e-05


# c) Translation modeling

In [None]:
def trans_model(source, target, it):
    """Train a word translation table with EM over aligned sentence pairs.

    Args:
        source: DataFrame whose column ``0`` holds the source sentences.
        target: DataFrame whose column ``0`` holds the target sentences,
            row-aligned with `source`. Extra rows on either side are ignored.
        it: Number of EM iterations to run.

    Returns:
        A tuple ``(t, source_word_indices, target_word_indices)`` where the
        two dicts map words to row / column numbers of ``t``, and
        ``t[s, e]`` is the expected count of source word ``s`` aligned with
        target word ``e`` divided by the total expected count of ``e``.
        The target vocabulary contains an extra "NULL" entry.

    Side effects:
        After every iteration `top_10_translations` is called for the word
        "european", which prints a ranking when that word is a source word.
    """
    # Tokenise once; the sentences do not change between EM iterations.
    source_tokens = [sentence.split() for sentence in source[0].tolist()]
    target_tokens = [sentence.split() for sentence in target[0].tolist()]

    # Vocabulary of each side
    source_words = set(word for tokens in source_tokens for word in tokens)
    target_words = set(word for tokens in target_tokens for word in tokens)

    # Word -> matrix row / column
    source_word_indices = {word: i for i, word in enumerate(source_words)}
    target_word_indices = {word: i for i, word in enumerate(target_words)}

    # Reserve a column for the "NULL" target word. setdefault reuses the
    # existing column if the corpus itself contains "NULL", so two words can
    # never end up sharing an index by accident.
    null_index = target_word_indices.setdefault("NULL", len(target_word_indices))

    n_source = len(source_word_indices)
    n_target = len(target_word_indices)

    # Convert every sentence pair to index arrays up front. "NULL" is appended
    # to each target sentence. Pairs without source words contribute nothing.
    sentence_pairs = []
    for src_tokens, tgt_tokens in zip(source_tokens, target_tokens):
        if not src_tokens:
            continue
        src_ids = np.array([source_word_indices[w] for w in src_tokens], dtype=np.intp)
        tgt_ids = np.array([target_word_indices[w] for w in tgt_tokens] + [null_index], dtype=np.intp)
        sentence_pairs.append((src_ids, tgt_ids))

    # Uniform initial translation table
    t = np.ones((n_source, n_target)) / n_target

    for iteration in range(it):
        # Expected counts collected during this iteration
        count_st = np.zeros((n_source, n_target))
        count_t = np.zeros(n_target)

        for src_ids, tgt_ids in sentence_pairs:
            # One row per source token, one column per target token
            probs = t[np.ix_(src_ids, tgt_ids)]
            # Normalise each source token's weights over the target sentence
            deltas = probs / probs.sum(axis=1, keepdims=True)

            # np.add.at accumulates correctly when a word occurs more than
            # once in the sentence (plain fancy-index += would drop repeats).
            np.add.at(count_st, (src_ids[:, np.newaxis], tgt_ids[np.newaxis, :]), deltas)
            np.add.at(count_t, tgt_ids, deltas.sum(axis=0))

        # Re-estimate the table. Columns that collected no counts stay 0
        # instead of becoming NaN through 0 / 0.
        has_counts = count_t[np.newaxis, :] > 0
        t = np.divide(count_st, count_t[np.newaxis, :], out=np.zeros_like(count_st), where=has_counts)

        # Print the top 10 translated words for "european"
        top_10_translations("european", iteration, source_word_indices, target_word_indices, t)

    return t, source_word_indices, target_word_indices


In [None]:
def top_10_translations(word, iteration, source_word_indices, target_word_indices, t):
    """Print the ten target words with the highest score in the row of `word`.

    Nothing is printed when `word` is not a key of `source_word_indices`.
    `iteration` is zero-based and is printed one-based.
    """
    # Unknown source words are silently ignored
    if word not in source_word_indices:
        return

    scores = t[source_word_indices[word]]

    # Descending by score; words with equal scores keep their dictionary order
    ranked = sorted(target_word_indices.items(), key=lambda entry: scores[entry[1]], reverse=True)

    print(f"Iteration {iteration + 1}:")
    print("Top 10 words that '", word, "' is most likely to be translated to:")
    for candidate, column in ranked[:10]:
        print(f"{candidate}: {scores[column]}")
    print()


In [None]:
# Train for 10 EM iterations with Swedish as the source side and English as the target side
t, swedish_words_index, english_words_index = trans_model(swedish_sv_en, english_sv_en, 10)

Note: The top 10 translations for "european" are only printed when the model is trained with English as the source language. The model above is trained in the opposite direction (Swedish as source) so that it is compatible with the next assignment.

# d) Decoder

In [None]:
def decoder(sentence, t, target_index, source_index):
    """Translate `sentence` by picking the most probable combination of word candidates.

    Every word gets the candidate translations returned by `five_most_likely`.
    All combinations (one candidate per word, original word order) are scored
    with `lang_model`, and the highest-scoring sentence is returned; on a tie
    the first combination generated wins.

    Args:
        sentence: Sentence to translate; it is lower-cased and split on whitespace.
        t: Translation table passed through to `five_most_likely`.
        target_index: Word -> row mapping for the language of `sentence`.
            Despite its name it is forwarded as `five_most_likely`'s
            `source_index`.
        source_index: Word -> column mapping for the output language,
            forwarded as `five_most_likely`'s `target_index`.

    Returns:
        The candidate sentence with the highest language-model probability.

    Side effects:
        Prints the number of candidate sentences that are evaluated.
    """
    words = sentence.lower().split()

    # Candidate translations per word, kept as plain lists so that the number
    # of candidates per word is not tied to a fixed array width.
    candidates_per_word = [
        five_most_likely(word, t, target_index, source_index) for word in words
    ]

    # Number of combinations = product of the candidate counts
    n_combinations = 1
    for candidates in candidates_per_word:
        n_combinations *= len(candidates)
    print(n_combinations)

    # Stream the combinations through max() instead of materialising every
    # sentence and a parallel probability list; memory stays constant even
    # though the number of combinations grows exponentially with length.
    possible_translations = (
        ' '.join(combination) for combination in itertools.product(*candidates_per_word)
    )
    return max(possible_translations, key=lang_model)

# Returns a list of the 5 most likely translations of a source word into the target language
def five_most_likely(sorce_word, t, source_index, target_index):
    """Return five candidate target words for `sorce_word`.

    Known words (keys of `source_index`) get the five target words with the
    highest values in their row of `t`; equal values keep ascending column
    order. Unknown words get five distinct randomly chosen target words.
    """
    # Reverse lookup: column number -> target word
    index_to_word = {index: word for word, index in target_index.items()}

    row_number = source_index.get(sorce_word, -1)

    if row_number != -1:
        scores = t[source_index[sorce_word]].tolist()
        # Column numbers ordered by descending score, truncated to five
        ranked_columns = sorted(range(len(scores)), key=lambda column: scores[column], reverse=True)
        chosen_columns = ranked_columns[:5]
    else:
        # Simple fallback for words that do not exist: five random columns.
        # NOTE(review): range(highest_index) never yields the highest index
        # itself - confirm whether excluding that entry is intended.
        highest_index = max(target_index.values())
        chosen_columns = random.sample(range(highest_index), 5)

    return [index_to_word[column] for column in chosen_columns]


125


In [None]:
# Translate a Swedish sentence; the Swedish index is passed first because the table rows are Swedish words
decoder("Varför är detta bra", t, swedish_words_index, english_words_index)

625


'why is this good'

In [None]:
# Second Swedish example sentence, decoded with the same table and indices
decoder("Vad pratar ni om", t, swedish_words_index, english_words_index)

625


'what misappropriation you if'