In [1]:
import os
import sys
import numpy as np
import collections
import time

In [2]:
from cloud.serialization.cloudpickle import dumps

DEBUG:Cloud:Log file (/Users/sandeepsubramanian/.picloud/cloud.log) opened


In [3]:
# Location of the parallel corpus (one "german ||| english" pair per line).
# The ALIGN_DATA_PATH environment variable overrides the location so the
# notebook can run on machines other than the one it was written on; when the
# variable is unset the original hard-coded path is used.
path_to_data = os.environ.get(
    'ALIGN_DATA_PATH',
    '/Users/sandeepsubramanian/CMU/MachineTranslation2/sp2016.11-731/hw1/data/dev-test-train.de-en',
)

In [4]:
# Read the corpus and split every line on the '|||' field separator.
# The context manager closes the file handle as soon as reading finishes
# instead of leaving that to garbage collection.
with open(path_to_data, 'r') as corpus_file:
    lines = [line.strip().split('|||') for line in corpus_file]

In [5]:
# Lower-cased, whitespace-tokenised sentences: field 0 of every corpus line
# is the German side, field 1 the English side.
german_lines = []
english_lines = []
for line in lines:
    german_lines.append(line[0].strip().lower().split())
    english_lines.append(line[1].strip().lower().split())

In [6]:
# Sanity check: both token lists must hold exactly one entry per corpus line.
assert len(lines) == len(german_lines) and len(lines) == len(english_lines)

In [7]:
# Map each distinct English word to an integer id, numbered in order of
# first appearance.
english_vocab = {}
for line in english_lines:
    for word in line:
        # The current dict size is the next unused id; setdefault leaves
        # words that already have an id untouched.
        english_vocab.setdefault(word, len(english_vocab))
word_count = len(english_vocab)

In [8]:
# Map each distinct German word to an integer id, numbered in order of
# first appearance.
german_vocab = {}
for line in german_lines:
    for word in line:
        # The current dict size is the next unused id; setdefault leaves
        # words that already have an id untouched.
        german_vocab.setdefault(word, len(german_vocab))
word_count = len(german_vocab)

In [9]:
# Replace every tokenised sentence by a bag of words (word -> occurrence
# count). Word order is discarded from here on.
english_lines = list(map(collections.Counter, english_lines))
german_lines = list(map(collections.Counter, german_lines))

In [10]:
# Starting value for every translation probability: one over the number of
# distinct German words.
uniform_probability = 1.0 / len(german_vocab)


def _uniform_default():
    """Return the starting probability for a word pair not yet stored in t."""
    return uniform_probability


# Entries are created lazily on first lookup, which prevents pre-allocation
# of memory for the entire conditional probability table.
t = collections.defaultdict(_uniform_default)

In [12]:
"""
Pseudocode: Source - https://www.cl.cam.ac.uk/teaching/1011/L102/clark-lecture3.pdf Page 25
initialize t(f|e) uniformly
do
   set count(f|e) to 0 for all f,e
   set total(e) to 0 for all e
   for all sentence pairs (f_s,e_s)
       for all unique words f in f_s
           n_f = count of f in f_s
           total_s = 0
           for all unique words e in e_s
               total_s += t(f|e) * n_f
           for all unique words e in e_s
               n_e = count of e in e_s
               count(f|e) += t(f|e) * n_f * n_e / total_s
               total(e) += t(f|e) * n_f * n_e / total_s
   for all e in domain( total(.) )
       for all f in domain( count(.|e) )
           t(f|e) = count(f|e) / total(e)
until convergence
"""
# EM estimation of the translation table t.
# t is keyed by (english_word, german_word) and is normalised per German
# word (see the division by total[german_word] in the re-estimation step).
num_iter = 72  # fixed number of EM iterations; the convergence test below is disabled
prev_prob = uniform_probability  # only referenced by the disabled convergence test
for ind in range(num_iter):

    # Expected counts accumulated over the whole corpus for this iteration.
    count = collections.defaultdict(float)
    total = collections.defaultdict(float)

    for english_sentence, german_sentence in zip(english_lines, german_lines):

        for english_word in english_sentence:
            # Occurrences of english_word in this sentence; constant for
            # both inner loops, so it is looked up once.
            english_word_count = english_sentence[english_word]

            # Normaliser over the distinct German words of the sentence.
            total_sentence = 0
            for german_word in german_sentence:
                total_sentence += t[(english_word, german_word)] * english_word_count

            # Distribute the (count-weighted) probability mass over the
            # German words of the sentence.
            for german_word in german_sentence:
                x = t[(english_word, german_word)] * english_word_count * german_sentence[german_word] / total_sentence
                count[(english_word, german_word)] += x
                total[german_word] += x

    # Re-estimate t from the expected counts collected above.
    for (english_word, german_word), pair_count in count.items():
        t[(english_word, german_word)] = pair_count / total[german_word]

    # EM convergence criteria (disabled).
    # NOTE(review): as written this check would not work: prev_prob is never
    # updated inside the loop and the difference is not wrapped in abs(), so
    # any decrease of the probability would stop training immediately.
    #if t[('the', 'der')] - prev_prob < 1e-4:
    #    break

    # Progress indicator: one probe probability per iteration. The
    # parenthesised single-argument form prints the same on Python 2 and is
    # also valid on Python 3.
    print(t[('the', 'der')])

0.244177898524
0.25282872332


In [21]:
def penalty_term(i, j, index):
    """Return a non-negative distance penalty between two positions.

    Args:
        i: first position (a number).
        j: second position (a number).
        index: selects the penalty shape. ``2`` gives the absolute
            difference ``|i - j|``; any other value gives the squared
            difference ``(i - j) ** 2``.

    Returns:
        The penalty, which is 0 when ``i == j`` and grows with the distance
        between the two positions.
    """
    if index == 2:
        return abs(i - j)
    # Squared difference. A signed `(i - j) * 2` would go negative whenever
    # i < j, and a score of the form 1 / (1 + penalty) would then divide by
    # zero at i - j == -0.5 and change sign below that.
    return (i - j) ** 2

In [None]:
# Rebuild the ordered token lists from the raw corpus lines: alignment needs
# word positions, which a bag-of-words representation does not keep.
german_lines = [line[0].strip().lower().split() for line in lines]
english_lines = [line[1].strip().lower().split() for line in lines]

# One output line per sentence pair, holding space-separated
# "<german index>-<english index>" links, one link per English word.
# The context manager guarantees the file is closed even if scoring fails.
with open('output.txt', 'w') as f:
    for english_sent, german_sent in zip(english_lines, german_lines):
        aligned_words = []
        # With no German words there is nothing to align to, and np.argmax
        # raises on an empty score list; such pairs produce an empty line.
        if german_sent:
            english_len = len(english_sent)
            german_len = len(german_sent)
            for ind1, eng_word in enumerate(english_sent):
                # Relative position of the English word within its sentence.
                english_pos = float(ind1) / english_len
                # Translation probability damped by how far apart the two
                # relative positions are.
                translation_scores = [
                    (1.0 / (1.0 + penalty_term(english_pos, float(ind2) / german_len, 2))) * t[(eng_word, german_word)]
                    for ind2, german_word in enumerate(german_sent)
                ]
                # Index of the best-scoring German word (first one on ties).
                aligned_word = np.argmax(np.array(translation_scores).astype(np.float32))
                aligned_words.append('%d-%d' % (aligned_word, ind1))
        f.write(' '.join(aligned_words) + '\n')

In [20]:
# Rebuild the ordered, lower-cased token lists from the raw corpus lines.
german_lines = [line[0].strip().lower().split() for line in lines]
english_lines = [line[1].strip().lower().split() for line in lines]

# Dump each side of the corpus to its own file, one space-joined sentence per
# line. The context managers close each file even if a write fails.
with open('german_sents.txt', 'w') as g:
    for line in german_lines:
        g.write(' '.join(line) + '\n')
with open('english_sents.txt', 'w') as g:
    for line in english_lines:
        g.write(' '.join(line) + '\n')