In [1]:
import os
import sys
import codecs

In [2]:
# Input locations: the word-alignment file and the English / French sentence
# files that are read line by line in the next cell.
# NOTE(review): these are absolute, machine-specific paths -- point them at
# your own copies of the data before re-running the notebook.
path_to_alignments = '/usr1/home/ssandeep/corpora/aligned.grow-diag-final-and.from_all'
path_to_en = '/usr1/home/ssandeep/corpora/news-commentary-v8.fr-en.clean.en'
path_to_fr = '/usr1/home/ssandeep/corpora/news-commentary-v8.fr-en.clean.fr'

In [3]:
# Read each input file into a list of whitespace-tokenised lines.
# The files are opened in `with` blocks so the handles are closed as soon as
# the contents have been read, instead of being left to the garbage collector.
with open(path_to_alignments, 'r') as alignment_file:
    # Each line becomes a list of 'i-j' alignment tokens.
    alignments = [line.strip().split() for line in alignment_file]
with codecs.open(path_to_en, 'r', encoding='utf8') as english_file:
    # Sentences are decoded as UTF-8 and split into word tokens.
    english_sents = [line.strip().split() for line in english_file]
with codecs.open(path_to_fr, 'r', encoding='utf8') as french_file:
    french_sents = [line.strip().split() for line in french_file]

In [4]:
# The three inputs are consumed in lockstep (one line of each per sentence
# pair), so they must contain the same number of lines.
assert len(alignments) == len(english_sents) == len(french_sents), (
    'Line count mismatch: %d alignments, %d English sentences, %d French sentences'
    % (len(alignments), len(english_sents), len(french_sents)))

In [5]:
def parse_alignment(alignment):
    """Index a list of 'i-j' alignment tokens in both directions.

    Parameters
    ----------
    alignment : iterable of str
        Tokens of the form 'i-j', where i and j are integer positions.

    Returns
    -------
    (dict, dict)
        The first dict maps each i to the list of j values paired with it;
        the second maps each j to the list of i values paired with it.
        Lists keep the order in which the pairs appear in `alignment`.
    """
    first_to_second = {}
    second_to_first = {}
    for token in alignment:
        positions = [int(piece) for piece in token.split('-')]
        first, second = positions[0], positions[1]
        # setdefault creates the list on first sight of a key, then appends.
        first_to_second.setdefault(first, []).append(second)
        second_to_first.setdefault(second, []).append(first)
    return first_to_second, second_to_first

In [6]:
def get_not_in_target(alignment):
    """Return the first-side positions that no alignment pair covers.

    Parameters
    ----------
    alignment : iterable of str
        Tokens of the form 'i-j'; only the integer before the dash is used.

    Returns
    -------
    set of int
        Every position in [0, max i) that does not occur as the first
        component of any pair. An empty `alignment` yields an empty set
        (previously this raised ValueError from max() on an empty list).
    """
    targets = [int(x.split('-')[0]) for x in alignment]
    if not targets:
        # A sentence pair with no alignment points has no gaps to report.
        return set()
    # max(targets) itself is always aligned, so the exclusive range is enough.
    return set(range(max(targets))) - set(targets)

In [12]:
# Derive, for every sentence pair, a sequence of SHIFT ('S') / TRANSLATE ('T')
# actions from the word alignment. `actions` collects one list per sentence;
# each entry is [word, action, source_block, translation], where translation
# is '' for SHIFT and a list of French tokens for TRANSLATE.
actions = []
# NOTE: only the first 5 sentence pairs are processed (debug limit), although
# the progress message below reports the size of the full corpus.
sentence_triples = list(zip(alignments, english_sents, french_sents))[:5]
for sent_ind, (alignment, english_sent, french_sent) in enumerate(sentence_triples):

    if sent_ind % 1000 == 0:
        print('Finished %d out of %d ' % (sent_ind, len(english_sents)))
    # French positions aligned to the English words shifted so far.
    indices = set()
    # parse_alignment returns (first -> seconds, second -> firsts). The results
    # are deliberately unpacked in swapped order: `forward_alignment` is keyed
    # by the second component of each pair, which is what English word
    # positions are looked up against below.
    backward_alignment, forward_alignment = parse_alignment(alignment)
    # French positions that no alignment pair covers; these never block a
    # TRANSLATE.
    not_in_target = get_not_in_target(alignment)
    target_start = 0   # first French position not yet emitted
    source_start = 0   # first English position of the current block
    actionset = []

    # The word index has its own name so it no longer clobbers the sentence
    # index of the enclosing loop.
    for word_ind, word in enumerate(english_sent):

        # If word in source doesn't have an alignment, SHIFT
        if word_ind not in forward_alignment:
            print(u'{: <2} | {: <20} | {: <2} | {: <30} | {: <30} '.format(word_ind, word, 'NS', ' '.join(english_sent[source_start:word_ind+1]), 'NULL'))
            actionset.append([word, 'S', english_sent[source_start:word_ind+1], ''])
            continue

        # Add all the alignments for this word to indices
        for item in forward_alignment[word_ind]:
            indices.add(item)
        action = 'T'
        max_ind = max(indices)

        # Check if all target indices of the max spanning source block
        # alignment are contained; if not, SHIFT
        for i in range(target_start, max_ind):
            if i in not_in_target:
                continue
            if i not in indices:
                action = 'S'
                break

        # Write the SHIFT action to the actionset
        if action == 'S':
            print(u'{: <2} | {: <20} | {: <2} | {: <30} | {: <30} | {: <10}  '.format(word_ind, word, action, ' '.join(english_sent[source_start:word_ind+1]), 'NULL', str(target_start) + ', ' + str(max_ind) + ' -> ' + ' '.join([str(x) for x in indices])))
            actionset.append([word, action, english_sent[source_start:word_ind+1], ''])

        # Write the TRANSLATE action to the actionset
        else:
            translation = french_sent[target_start:max_ind+1]
            curr_phrase_block = english_sent[source_start:word_ind+1]
            print(u'{: <2} | {: <20} | {: <2} | {: <30} | {: <30} | {: <10} '.format(word_ind, word, action, ' '.join(curr_phrase_block), ' '.join(translation), str(target_start) + ' -> ' + str(max_ind)))
            actionset.append([word, action, curr_phrase_block, translation])
            # Max to handle the case where a word in the source maps backwards
            # in the target
            target_start = max(max_ind + 1, target_start)
            source_start = word_ind + 1
            indices = set()
    print('===============================================================================================')
    actions.append(actionset)

Finished 0 out of 155362 
0  | San                  | T  | San                            | San                            | 0 -> 0     
1  | FRANCISCO            | T  | FRANCISCO                      | FRANCISCO                      | 1 -> 1     
2  | –                    | S  | –                              | NULL                           | 2, 5 -> 2 4 5  
3  | It                   | T  | – It                           | – Il n ’                       | 2 -> 5     
4  | has                  | T  | has                            | a                              | 6 -> 6     
5  | never                | T  | never                          | jamais                         | 7 -> 7     
6  | been                 | T  | been                           | été                            | 8 -> 8     
7  | easy                 | T  | easy                           | facile                         | 9 -> 9     
8  | to                   | T  | to                             | d               