In [1]:
from pathlib import Path
from datasets import load_dataset
import transformers
import os
from tqdm import tqdm

from transtokenizers.transtokenizers import get_dataset_iterator

# Root directory for every artefact this notebook writes: the TT_HOME
# environment variable when it is set, otherwise the relative "export" folder.
home_path = os.environ.get('TT_HOME', Path("export"))


def create_aligned_corpus(
    source_language: str,
    target_language: str,
    corpus_list: list = ['open_subtitles', 'allenai/nllb'],
):
    """Write a non-tokenized parallel corpus in Moses format and return its path.

    Replacement of the original function without tokenizers: every
    (source, target) pair yielded by ``get_dataset_iterator`` for each corpus
    in ``corpus_list`` is written as one ``source ||| target`` line.

    Args:
        source_language: Source-side language, forwarded to ``get_dataset_iterator``.
        target_language: Target-side language, forwarded to ``get_dataset_iterator``.
        corpus_list: Names of the corpora to concatenate, in order. The list is
            only read, never mutated, so the shared default is safe.

    Returns:
        The path (as a string) of the file written under ``{home_path}/alignments``.
    """
    corpus_list_description = "_".join(corpus_list).replace("/", "--")

    out_path = f'{home_path}/alignments/{corpus_list_description}.{source_language}-{target_language}.not-tokenized.moses'

    os.makedirs(f'{home_path}/alignments', exist_ok=True)

    # Open the file once, in write mode: appending would stack duplicate
    # sentence pairs on top of a stale or partial file whenever the function is
    # re-run (e.g. after an interrupted download). UTF-8 is explicit so the
    # result does not depend on the platform's default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        for corpus in corpus_list:
            dataset = get_dataset_iterator(corpus, source_language, target_language)
            for line_source, line_target in tqdm(dataset):
                # One sentence pair per line, sides separated by ' ||| '.
                f.write(line_source.strip() + ' ||| ' + line_target.strip() + '\n')

    return out_path


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transtokenizers.transtokenizers import align

# Build the English -> Dutch parallel corpus with the default corpus list;
# `corpus` holds the path of the resulting Moses-format file.
corpus = create_aligned_corpus(source_language="en", target_language="nl")


Downloading builder script: 100%|██████████| 6.22k/6.22k [00:00<00:00, 20.9MB/s]
Downloading readme: 100%|██████████| 7.45k/7.45k [00:00<00:00, 13.1MB/s]
3167683it [14:26, 3654.66it/s]


KeyboardInterrupt: 

In [13]:
# Location of the fast_align executable, resolved relative to the project home.
fast_align_path = f"{home_path}/notebooks/fast_align/build/fast_align"

# Run the alignment on the parallel corpus; the returned file is parsed
# line by line in the next cell.
mapped_tokens_file = align(
    corpus,
    fast_align_path=fast_align_path,
)

../fast_align/build/fast_align: invalid option -- 'h'
ARG=?
Usage: ../fast_align/build/fast_align -i file.fr-en
 Standard options ([USE] = strongly recommended):
  -i: [REQ] Input parallel corpus
  -v: [USE] Use Dirichlet prior on lexical translation distributions
  -d: [USE] Favor alignment points close to the monotonic diagonoal
  -o: [USE] Optimize how close to the diagonal alignment points should be
  -r: Run alignment in reverse (condition on target and predict source)
  -c: Output conditional probability table
 Advanced options:
  -I: number of iterations in EM training (default = 5)
  -q: p_null parameter (default = 0.08)
  -N: No null word
  -a: alpha parameter for optional Dirichlet prior (default = 0.01)
  -T: starting lambda for diagonal distance parameter (default = 4)
  -s: print alignment scores (alignment ||| score, disabled by default)
ARG=i
ARG=I
ARG=p
INITIAL PASS 
.................................................. [50000]
.............................................

In [17]:
from transtokenizers.transtokenizers import MIN_COUNT_REQUIRED_FOR_CONSIDERATION

# Parse the mapped-tokens file into a list of records. Each non-empty line
# holds four tab-separated fields: old word, new word, log-probability, count.
tokens = []
# Use a context manager so the file handle is closed once parsing finishes
# (or fails) instead of being left open for the lifetime of the kernel.
with open(mapped_tokens_file) as mapped_tokens:
    for line in tqdm(mapped_tokens):
        # remove the newline character
        line = line.rstrip('\n')
        # skip empty lines
        if line == '':
            continue
        # split the line on the tab character
        old_word, new_word, log_prob, count = line.split('\t')
        # reject <eps> mappings
        if old_word == '<eps>':
            continue
        if new_word == '<eps>':
            continue
        # convert the count to an integer (going through float first, so a
        # count written with a decimal part is still accepted)
        count = int(float(count))
        # skip pairs that happened rarely (likely noise)
        if count < MIN_COUNT_REQUIRED_FOR_CONSIDERATION:
            continue
        # log_prob is kept as the raw text read from the file
        tokens.append({'old': old_word, 'new': new_word, 'count': count, 'log_prob': log_prob})

7086939it [00:07, 1011696.96it/s]


In [19]:
import pandas as pd

# One row per retained token pair; columns come from the record keys
# ('old', 'new', 'count', 'log_prob').
df = pd.DataFrame(data=tokens)

In [23]:
# Make sure log_prob is numeric so it can be compared against a threshold.
log_prob_as_float = df['log_prob'].astype('float64')
df['log_prob'] = log_prob_as_float

In [30]:
# find best translations and aim for 32k tokens (Same as WECHSEL)
df.loc[df['log_prob'] > -2]

Unnamed: 0,old,new,count,log_prob
0,A,Een,15670,-0.221481
1,killer,moordenaar,162,-0.633279
2,asteroid,asteroïde,32,-1.021510
5,to,te,64497,-1.281130
8,to,om,34652,-1.902380
...,...,...,...,...
76879,Haddy.,Haddy.,10,-0.667344
76880,"Pazu,","Pazu,",11,-0.832077
76881,"Sheeta,","Sheeta,",13,-0.959021
76882,Guapo.,Guapo.,11,-0.777018


In [31]:
# Export the (old, new) pairs whose log-probability exceeds -2 as a
# header-less, index-less TSV translation table.
# NOTE(review): with pandas' default quoting, fields containing a double quote
# or a tab are wrapped in quotes -- confirm the consumer of this file expects that.
(
    df.loc[df['log_prob'] > -2, ['old', 'new']]
    .to_csv(
        f'{home_path}/alignments/translation-table.en-nl.tsv',
        sep='\t',
        index=False,
        header=False,
    )
)