In [1]:
from scipy.sparse import dok_matrix
import numpy as np
import pickle
import copy

In [2]:
def read_corpus_addnull(english_corpus, foreign_corpus):
    "Reads a corpus and adds in the NULL word."
    english = [["*"] + e_sent.split() for e_sent in open(english_corpus, encoding='utf-8')]
    foreign = [f_sent.split() for f_sent in open(foreign_corpus, encoding='utf-8')]
    return english, foreign

english, spanish = read_corpus_addnull("corpus.en", "corpus.es")

In [3]:
to_remove=[]
for i in range(len(spanish)):
    if len(spanish[i])==0:
        to_remove.append(i)
        
for index in sorted(to_remove, reverse=True):
    del english[index]
    del spanish[index]

In [4]:
spanish_words = set()
english_words = set()
for sent in spanish:
    for word in sent:
        spanish_words.add(word)
for sent in english:
    for word in sent:
        english_words.add(word)

In [5]:
n_e = {}
parallel_corpus = zip(english, spanish)
wordpairs = set()
for e, s in parallel_corpus:
    for e_j in e:
        for s_i in s:
            wordpair = (e_j, s_i)
            if wordpair not in wordpairs:
                wordpairs.add(wordpair)
                if not e_j in n_e:
                    n_e[e_j] = 0
                n_e[e_j]+=1

In [6]:
spanish_words = list(spanish_words)
english_words = list(english_words)

In [7]:
english_indices={};spanish_indices={}
for i in range(len(english_words)):
    english_indices[english_words[i]]=i
for i in range(len(spanish_words)):
    spanish_indices[spanish_words[i]]=i    

In [8]:
base_t_parameters=[]
for i in english_words:
    base_t_parameters.append(1/ n_e[i] )

In [9]:
t_parameters=dok_matrix((len(english_words), len(spanish_words)), dtype=np.float32)
        
def calculate_delta(i, k):
    s_word=spanish[k][i];num=[]
    l=len(english[k]); den=0
    for x in range(l):
        e_word=english[k][x]
        temp=t_parameters[english_indices[e_word],spanish_indices[s_word]]
        if temp==0:
            temp= base_t_parameters[english_indices[e_word]]
        den+=temp
        num.append(temp)
    return np.array(num)/den

In [10]:
def calculate_t_parameters(counts):
    r=counts.sum(1)
    return counts/r

In [11]:
#IBM 1
for s in range(5):
    counts = dok_matrix((len(english_words), len(spanish_words)), dtype=np.float32)
    for k in range(len(english)): 
        for i in range(len(spanish[k])):
            delta= calculate_delta(i, k);s_word=spanish[k][i];s_index=spanish_indices[s_word]
            for j in range(len(english[k])):
                e_word=english[k][j];e_index=english_indices[e_word]
                counts[e_index, s_index] += delta[j]
    t_parameters=calculate_t_parameters(counts)
    

In [12]:
eng_dev = [e_sent.split() for e_sent in open('dev.en', encoding='utf-8')]
spn_dev = [f_sent.split() for f_sent in open('dev.es', encoding='utf-8')]
al=open("dev.out", "w", encoding='utf-8')
for i in range(len(spn_dev)):
    for spn_ind,spn_word in enumerate(spn_dev[i],1):
        if spn_word in spanish_words:
            c_max=0;s_index=spanish_indices[spn_word];c_index=0
            for eng_ind,eng_word in enumerate(eng_dev[i],1):
                e_index= english_indices[eng_word] 
                if t_parameters[e_index,s_index]==0:
                    t_value=base_t_parameters[e_index]
                else:
                    t_value=t_parameters[e_index,s_index]
                if t_value > c_max:
                    c_max=t_value; c_index= eng_ind
            al.write(str(i+1)+' '+str(c_index)+' '+str(spn_ind)+'\n')
al.close()

In [13]:
!python eval_alignment.py 'dev.key' 'dev.out'

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.413        0.427        0.420


In [14]:
q={};q_names_e={};q_names_s={}
for k in range(len(spanish)):
    l=len(english[k]);m=len(spanish[k])
    l1=len(set(english[k]));m1=len(set(spanish[k]))
    if (l,m) not in q.keys():
        q[(l,m)]=dok_matrix((l1, m1), dtype=np.float32)
        q_names_e[(l,m)]= {}
        q_names_s[(l,m)]= {}
        for i in range(len(set(english[k]))):
            q_names_e[(l,m)][list(set(english[k]))[i]]=i
        for i in range(len(set(spanish[k]))):
            q_names_s[(l,m)][list(set(spanish[k]))[i]]=i
    else:
        new_l_e=list(set(english[k]));new_l_s=list(set(spanish[k]))
        for i in range(len(new_l_e)):
            if new_l_e[i] not in q_names_e[(l,m)].keys():
                q_names_e[(l,m)][new_l_e[i]]=len(q_names_e[(l,m)].keys())
        for i in range(len(new_l_s)):
            if new_l_s[i] not in q_names_s[(l,m)].keys():
                q_names_s[(l,m)][new_l_s[i]]= len(q_names_s[(l,m)].keys())
        q[(l,m)]=dok_matrix((len(q_names_e[(l,m)].keys()), len(q_names_s[(l,m)].keys())), dtype=np.float32)
counts_q_initial=copy.deepcopy(q)

In [15]:
def calculate_q_parameters(counts_for_q, q):
    for x in range(len(list(counts_for_q.keys()))):
        (l, m)=list(counts_for_q.keys())[x]
        q[(l, m)]=counts_for_q[(l, m)]/(counts_for_q[(l, m)].sum(1))
    return q
def calculate_delta_2(i, k):
    s_word=spanish[k][i];num=[]
    l=len(english[k]); den=0;m=len(spanish[k])
    for x in range(l):
        e_word=english[k][x]
        temp=t_parameters[english_indices[e_word],spanish_indices[s_word]]
        temp1=q[(l,m)][q_names_e[(l,m)][e_word],q_names_s[(l,m)][s_word]]
        if temp1==0:
            temp1= 1/(l)          
        if temp==0:
            temp= base_t_parameters[english_indices[e_word]]
        den+=(temp*temp1)
        num.append(temp*temp1)
    return np.array(num)/den

In [None]:
#IBM 2
for s in range(1):
    counts = dok_matrix((len(english_words), len(spanish_words)), dtype=np.float32)
    counts_for_q=copy.deepcopy(counts_q_initial)
    for k in range(len(english)): 
        l=len(english[k]);m=len(spanish[k])
        for i in range(len(spanish[k])):
            delta= calculate_delta_2(i, k);s_word=spanish[k][i];s_index=spanish_indices[s_word]
            for j in range(len(english[k])):
                e_word=english[k][j];e_index=english_indices[e_word]
                counts[e_index, s_index] += delta[j]
                counts_for_q[(l, m)][q_names_e[(l,m)][e_word],q_names_s[(l,m)][s_word]] += delta[j]
    t_parameters=calculate_t_parameters(counts)
    q=calculate_q_parameters(counts_for_q, q)

In [19]:
eng_dev = [e_sent.split() for e_sent in open('dev.en', encoding='utf-8')]
spn_dev = [f_sent.split() for f_sent in open('dev.es', encoding='utf-8')]
al=open("dev_2.out", "w", encoding='utf-8')
for k in range(len(spn_dev)):
    l=len(eng_dev[k]);m=len(spn_dev[k])
    for spn_ind,spn_word in enumerate(spn_dev[k],1):
        if spn_word in spanish_words:
            c_max=0;s_index=spanish_indices[spn_word];c_index=0
            for eng_ind,eng_word in enumerate(eng_dev[k],1):
                e_index= english_indices[eng_word] 
                t_value=t_parameters[e_index,s_index]
                if (l,m) not in q.keys():
                    q_value=1e-10
                else:
                    q_value=q[(l,m)][eng_ind-1, spn_ind-1]
                if t_value==0:
                    t_value=base_t_parameters[e_index]
                if q_value ==0:
                    q_value = 1/(l+1)
                prd= (q_value)*(t_value)
                if prd > c_max:
                    c_max=prd; c_index= eng_ind
            al.write(str(k+1)+' '+str(c_index)+' '+str(spn_ind)+'\n')
al.close()

In [29]:
!python eval_alignment.py 'dev.key' 'dev.out'

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.443        0.457        0.450


In [20]:
#Growing Alignments
eng_dev = [e_sent.split() for e_sent in open('dev.en', encoding='utf-8')]
spn_dev = [f_sent.split() for f_sent in open('dev.es', encoding='utf-8')]
Pairs={}
e_f = [e_sent.split() for e_sent in open('dev4.out', encoding='utf-8')]
f_e = [f_sent.split() for f_sent in open('dev_ef.out', encoding='utf-8')]
for i in e_f:
    t=(int(i[0]), int(i[1]), int(i[2]))
    if t in Pairs.keys():
        Pairs[t]+=1
    else:
        Pairs[t]=1
        
for i in f_e:
    t=(int(i[0]), int(i[2]), int(i[1]))
    if t in Pairs.keys():
        Pairs[t]+=1
    else:
        Pairs[t]=1

In [21]:
al=open("dev_int.out", "w", encoding='utf-8')
for i in list(Pairs.keys()):
    if Pairs[i]==2:
        al.write(str(i[0])+' '+str(i[1])+' '+str(i[2])+'\n')
al.close()

In [22]:
!python eval_alignment.py 'dev.key' 'dev_int.out'

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.794        0.367        0.502


In [27]:
Int={}; left_over={}
for i in list(Pairs.keys()):
    if Pairs[i]==2:
        if i[0] in Int.keys():
            Int[i[0]].append(i[2])
        else:
            Int[i[0]]=[i[2]]
    else:
        if i[0] in left_over.keys():
            left_over[i[0]].append((i[1], i[2]))
        else:
            left_over[i[0]]=[(i[1], i[2])] 

In [23]:
al=open("dev_int.out", "a", encoding='utf-8')
for i in Int.keys():
    for x in range(1, len(eng_dev[i-1])+1):
        if x not in Int[i]:
            for y in left_over[i]:
                if y[0]==x and (x not in Int[i]):
                    Int[i].append(x)
                    al.write(str(i)+' '+str(x)+' '+str(y[1])+'\n') 
                    
al.close()

In [24]:
!python eval_alignment.py 'dev.key' 'dev_int.out'

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.476        0.465        0.470


In [28]:
al=open("dev_int.out", "a", encoding='utf-8')
for i in Int.keys():
    for x in range(1, len(spn_dev[i-1])+1):
        if x not in Int[i]:
            for y in left_over[i]:
                if y[1]==x and (x not in Int[i]):
                    Int[i].append(x)
                    al.write(str(i)+' '+str(y[0])+' '+str(x)+'\n') 
                    
al.close()

In [29]:
!python eval_alignment.py 'dev.key' 'dev_int.out'

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.366        0.519        0.429


In [25]:
al=open("dev_un.out", "w", encoding='utf-8')
for i in list(Pairs.keys()):
    al.write(str(i[0])+' '+str(i[1])+' '+str(i[2])+'\n')
al.close()

In [26]:
!python eval_alignment.py 'dev.key' 'dev_un.out'

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.350        0.541        0.425
