In [None]:
# !!! MAKE SURE TO LOG IN FIRST WITH `huggingface-cli login` !!!

In [None]:
# System packages: the two -dev libraries are also requested by the fast_align build step further down;
# git, cmake and make are used to clone and compile fast_align, and unzip extracts the corpus archives.
!sudo apt-get update && sudo apt-get install -y libgoogle-perftools-dev libsparsehash-dev unzip git cmake make

In [None]:
# Python packages imported by this notebook. torch and numpy are imported as well but are not installed
# by this line, so they must already be available in the kernel environment.
%pip install transformers datasets tqdm regex

In [None]:
import os
import regex as re
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers

# --- "old" side: language code and Hugging Face id of the tokenizer to map onto ---
OLD_LANGUAGE = 'en'
OLD_TOKENIZER = 'mistralai/Mistral-7B-v0.1'  # alternative: 'meta-llama/Llama-2-7b-hf'

# --- "new" side: language code and Hugging Face id of the tokenizer to map from ---
NEW_LANGUAGE = 'nl'
NEW_TOKENIZER = 'GroNLP/gpt2-small-dutch'  # alternative: 'FremyCompany/roberta-base-nl-oscar23'

# parallel corpora to align; the joined names are embedded in the output file names
CORPUS_LIST = ['OpenSubtitles2018', 'NLLB']
CORPUS_LIST_STR = '_'.join(CORPUS_LIST)

ALIGNMENT_UNIT = 'WORDS'  # either 'TOKENS' or 'WORDS'
# alignment pairs with a lower count than this are skipped when the mapping is built
MIN_COUNT_REQUIRED_FOR_CONSIDERATION = 20

# tokenizer ids with '/' replaced by '--', so that they can be embedded in file names
OLD_TOKENIZER_FRIENDLY_NAME, NEW_TOKENIZER_FRIENDLY_NAME = (
    tokenizer_id.replace('/', '--') for tokenizer_id in (OLD_TOKENIZER, NEW_TOKENIZER)
)

In [None]:
def _detect_token_marker(tokenizer, probe_text, token_index):
    """Return the non-"a" part of one token of `probe_text`.

    `probe_text` is encoded without special tokens, the token at position
    `token_index` is converted back to its string form, and its trailing "a"
    characters are stripped. The probe texts used below contain only the letter
    "a" (optionally preceded by a space), so what remains is the marker the
    tokenizer attaches to a token at that position.
    """
    token_ids = tokenizer.encode(probe_text, add_special_tokens=False)
    return tokenizer.convert_ids_to_tokens(token_ids[token_index]).rstrip('a')

# load tokenizers for the two models
old_tokenizer = transformers.AutoTokenizer.from_pretrained(OLD_TOKENIZER)
new_tokenizer = transformers.AutoTokenizer.from_pretrained(NEW_TOKENIZER)

# save the vocabularies in a set for improved performance
# (get_vocab() is used rather than the .vocab attribute, to match how the vocabulary is read further down)
old_tokenizer_vocab = set(old_tokenizer.get_vocab().keys())
new_tokenizer_vocab = set(new_tokenizer.get_vocab().keys())

# determine the tokenizer settings
# 1ST prefix: marker on the first token of " a", i.e. on a token that follows a space
OLD_TOKENIZER_1ST_PREFIX = _detect_token_marker(old_tokenizer, " a", 0)
NEW_TOKENIZER_1ST_PREFIX = _detect_token_marker(new_tokenizer, " a", 0)
# 2ND prefix: marker on the second token of a long run of "a", i.e. on a token that continues a word
OLD_TOKENIZER_2ND_PREFIX = _detect_token_marker(old_tokenizer, "aaaaaaaaaaaaaaaaaaaaaa", 1)
NEW_TOKENIZER_2ND_PREFIX = _detect_token_marker(new_tokenizer, "aaaaaaaaaaaaaaaaaaaaaa", 1)


In [None]:
# preprocess a parallel corpus, to transform it into a space-delimited corpus of tokens

# working directories used by the rest of the notebook (created on first run only)
for working_directory in ('alignments/', 'corpora/parallel/', 'corpora/monolingual/'):
    os.makedirs(working_directory, exist_ok=True)

if os.path.exists(f'alignments/{CORPUS_LIST_STR}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}-{NEW_TOKENIZER_FRIENDLY_NAME}-{ALIGNMENT_UNIT}.fast_align.tsv'):
    print(f'data already aligned')

else:

    if not(os.path.exists(f'alignments/{CORPUS_LIST_STR}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}-{NEW_TOKENIZER_FRIENDLY_NAME}-{ALIGNMENT_UNIT}.moses')):

        #
        # step 1: download the parallel corpus (if not already downloaded)
        #

        def download_corpus_from_nlpl_eu(corpus, corpus_folder=None, corpus_version='v1'):
            """Make sure both sides of a parallel corpus exist under their canonical file names.

            The canonical files are
            corpora/parallel/{corpus}/{corpus}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_LANGUAGE}
            and the same path ending in .{NEW_LANGUAGE}. When they are missing, the Moses
            archive is fetched from opus.nlpl.eu and unzipped; extracted files that carry the
            `corpus_folder` prefix and/or the swapped language order are renamed to the
            canonical names.

            Args:
                corpus: local corpus name, used for the directory and the file prefix.
                corpus_folder: corpus name used in the download URL (defaults to `corpus`).
                corpus_version: version segment of the download URL.

            Raises:
                Exception: if no usable pair of files could be obtained.
            """
            corpus_folder = corpus_folder if corpus_folder is not None else corpus
            corpus_dir = f'corpora/parallel/{corpus}'
            language_pair = f'{OLD_LANGUAGE}-{NEW_LANGUAGE}'
            swapped_language_pair = f'{NEW_LANGUAGE}-{OLD_LANGUAGE}'

            def corpus_files(prefix, pair):
                # the two plain-text sides of an extracted corpus, old language first
                return [f'{corpus_dir}/{prefix}.{pair}.{language}' for language in (OLD_LANGUAGE, NEW_LANGUAGE)]

            def all_exist(paths):
                return all(os.path.exists(path) for path in paths)

            canonical_files = corpus_files(corpus, language_pair)

            def adopt(prefix, pair):
                # rename an extracted pair of files to the canonical names; False if the pair is incomplete
                candidate_files = corpus_files(prefix, pair)
                if not all_exist(candidate_files): return False
                for source_path, target_path in zip(candidate_files, canonical_files):
                    os.replace(source_path, target_path)
                return True

            def fetch_and_unzip(pair):
                # download the archive for this language order (unless it is already there) and extract it
                zip_name = f'{corpus}.{pair}.txt.zip'
                if not os.path.exists(f'{corpus_dir}/{zip_name}'):
                    # the URL is quoted because it contains "?", which the shell would otherwise treat as a glob character
                    os.system(f'wget -O {corpus_dir}/{zip_name} "https://opus.nlpl.eu/download.php?f={corpus_folder}/{corpus_version}/moses/{pair}.txt.zip"')
                os.system(f'cd {corpus_dir} && unzip {zip_name}')
                return f'{corpus_dir}/{zip_name}'

            # both canonical files must be present (the check used to test a hard-coded NLLB/ path and never succeeded)
            if all_exist(canonical_files):
                print(f'{corpus}: data already downloaded')
                return

            print(f'{corpus}: downloading data...')
            os.makedirs(corpus_dir, exist_ok=True)
            zip_path = fetch_and_unzip(language_pair)

            # check if that worked
            if all_exist(canonical_files): return
            # check if the files are there but with corpus_folder as their prefix
            if adopt(corpus_folder, language_pair): return

            # an empty (or missing) archive means nothing was served for this language order:
            # try with the language codes swapped
            if not os.path.exists(zip_path) or os.path.getsize(zip_path) == 0:
                if os.path.exists(zip_path): os.remove(zip_path)
                fetch_and_unzip(swapped_language_pair)

            # accept files named with the swapped language order, with either prefix
            if adopt(corpus, swapped_language_pair) or adopt(corpus_folder, swapped_language_pair): return

            print(f'{corpus}: could not download data')
            raise Exception(f'Corpus not found; download files {canonical_files[0]} and {canonical_files[1]}')

        # positional arguments of download_corpus_from_nlpl_eu for each corpus that can be downloaded
        nlpl_download_arguments = {
            'NLLB': ('NLLB',),
            'OpenSubtitles2018': ('OpenSubtitles2018','OpenSubtitles','v2018'),
        }
        for corpus in CORPUS_LIST:
            # corpora without an entry are not downloaded here; the check below reports them if their files are missing
            if corpus in nlpl_download_arguments:
                download_corpus_from_nlpl_eu(*nlpl_download_arguments[corpus])

        # make sure that both sides of every corpus are available before going further
        for corpus in CORPUS_LIST:
            corpus_file_prefix = f'corpora/parallel/{corpus}/{corpus}.{OLD_LANGUAGE}-{NEW_LANGUAGE}'
            old_side_path = f'{corpus_file_prefix}.{OLD_LANGUAGE}'
            new_side_path = f'{corpus_file_prefix}.{NEW_LANGUAGE}'
            if os.path.exists(old_side_path) and os.path.exists(new_side_path): continue
            print(f'{corpus}: corpus not found')
            raise Exception(f'Corpus not found; download files {old_side_path} and {new_side_path}')

        #
        # step 2: preprocess the parallel corpus
        #
        
        for corpus in CORPUS_LIST:
            # (language, tokenizer, tokenizer id, file-name-friendly tokenizer id) for each side, old side first
            corpus_sides = [
                (OLD_LANGUAGE, old_tokenizer, OLD_TOKENIZER, OLD_TOKENIZER_FRIENDLY_NAME),
                (NEW_LANGUAGE, new_tokenizer, NEW_TOKENIZER, NEW_TOKENIZER_FRIENDLY_NAME),
            ]
            for side_language, side_tokenizer, side_tokenizer_id, side_friendly_name in corpus_sides:
                raw_path = f'corpora/parallel/{corpus}/{corpus}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{side_language}'
                tokenized_path = f'{raw_path}.{side_friendly_name}.txt'
                # NOTE(review): only the existence of the output is checked, so a file left behind
                # by an interrupted run is treated as complete
                if os.path.exists(tokenized_path):
                    print(f'{corpus}: data already preprocessed for tokenizer {side_tokenizer_id}')
                    continue
                print(f'{corpus}: preprocessing data for tokenizer {side_tokenizer_id}...')
                # one output line per input line: the tokens of the stripped line, separated by spaces
                with open(tokenized_path,'w') as f, open(raw_path) as g:
                    for line in tqdm(g):
                        f.write(' '.join(side_tokenizer.tokenize(line.strip()))+'\n')

    #
    # step 3: combine the two tokenized corpora into a single parallel corpus in the format expected by fast_align (a1 a2 a3 ||| b1 b2 b3)
    #

    fast_align_input_path = f'alignments/{CORPUS_LIST_STR}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}-{NEW_TOKENIZER_FRIENDLY_NAME}-{ALIGNMENT_UNIT}.moses'
    if not os.path.exists(fast_align_input_path):
        print(f'preprocessing data for fast_align...')

        def word_merge_pattern(first_prefix, second_prefix):
            # matches a letter and the space after it when the text after the space does not start with
            # first_prefix and consists of second_prefix followed by a letter
            return r'(?!'+first_prefix+r')(\p{L})[ ](?!'+first_prefix+r')(?='+second_prefix+r'\p{L})'

        old_merge_pattern = word_merge_pattern(OLD_TOKENIZER_1ST_PREFIX, OLD_TOKENIZER_2ND_PREFIX)
        new_merge_pattern = word_merge_pattern(NEW_TOKENIZER_1ST_PREFIX, NEW_TOKENIZER_2ND_PREFIX)

        with open(fast_align_input_path,'w') as f:
            for corpus in CORPUS_LIST:
                tokenized_prefix = f'corpora/parallel/{corpus}/{corpus}.{OLD_LANGUAGE}-{NEW_LANGUAGE}'
                old_tokenized_path = f'{tokenized_prefix}.{OLD_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}.txt'
                new_tokenized_path = f'{tokenized_prefix}.{NEW_LANGUAGE}.{NEW_TOKENIZER_FRIENDLY_NAME}.txt'
                with open(old_tokenized_path) as g, open(new_tokenized_path) as h:
                    # the two files are read in lockstep; zip stops at the end of the shorter one
                    for line1,line2 in tqdm(zip(g,h)):
                        if ALIGNMENT_UNIT == 'WORDS':
                            # merging tokens from word units, for a better alignment
                            # (the separating space becomes a long hyphen, which is split on again later)
                            line1 = re.sub(old_merge_pattern,r'\1—',line1)
                            line2 = re.sub(new_merge_pattern,r'\1—',line2)
                        f.write(line1.strip()+' ||| '+line2.strip()+'\n')
    else:
        print(f'data already preprocessed for fast_align')
    #
    # step 4: download fast_align if not already downloaded
    #
    if not os.path.exists('fast_align/build/fast_align'):
        print(f'downloading fast_align...')
        os.system('apt-get install -y libgoogle-perftools-dev libsparsehash-dev')
        os.system('git clone https://github.com/FremyCompany/fast_align.git')
        os.system('mkdir fast_align/build')
        os.system('cd fast_align/build && cmake .. && make')
        # the exit codes above are not inspected (some of these commands can fail harmlessly,
        # e.g. when the repository was already cloned), so check the outcome itself
        if not os.path.exists('fast_align/build/fast_align'):
            raise Exception('fast_align could not be built; check the output of the commands above (git, cmake, make and the two -dev libraries are required)')

    #
    # step 5: using fast_align to align the two tokenized corpora
    #
    alignment_files_prefix = f'alignments/{CORPUS_LIST_STR}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}-{NEW_TOKENIZER_FRIENDLY_NAME}-{ALIGNMENT_UNIT}'
    alignment_table_path = f'{alignment_files_prefix}.fast_align.tsv'
    if os.path.exists(alignment_table_path):
        print(f'data already aligned')
    else:
        # -i is the input written in step 3, -p the table read by the next cell; standard output is discarded
        fast_align_status = os.system(f"./fast_align/build/fast_align -I 7 -p {alignment_table_path} -i {alignment_files_prefix}.moses > /dev/null")
        if fast_align_status != 0 or not os.path.exists(alignment_table_path):
            # remove a partial table, otherwise the next run would report 'data already aligned'
            if os.path.exists(alignment_table_path): os.remove(alignment_table_path)
            raise Exception(f'fast_align failed (exit status {fast_align_status}); {alignment_table_path} was not produced')



In [None]:
# use the alignments to create a weighted dictionary of possible translations for each token

import math
import numpy as np
from collections import defaultdict

def _empty_translation_counts():
    """Return a fresh {old item: count} table whose missing entries count as 0."""
    return defaultdict(int)

# new token -> {old token -> accumulated count}
tokenized_possible_translations = defaultdict(_empty_translation_counts)
# same layout, keyed by the aligned units as read from the alignment file; for diagnostics purposes only
untokenized_possible_translations = defaultdict(_empty_translation_counts)

def add_token_pair(count, new_token, old_token):
    """Add `count` to the weight of `old_token` as a translation of `new_token`."""
    translations_of_new_token = tokenized_possible_translations[new_token]
    translations_of_new_token[old_token] = translations_of_new_token[old_token] + count

def add_word_pair(count, new_word, old_word, all_to_all_mapping=False):
    # tokenize the words
    # (recall that we use the long hyphen to replace spaces inside words, to merge the tokens again)
    old_word_tokenized = old_word.split('—')
    new_word_tokenized = new_word.split('—')

    # if the token list dont have the same length, compute the smallest common multiple of their lengths
    if all_to_all_mapping:
        count_dilution = len(old_word_tokenized)
        old_word_tokenized = np.tile(old_word_tokenized, len(new_word_tokenized))
        new_word_tokenized = np.repeat(new_word_tokenized, count_dilution)
    elif len(old_word_tokenized) != len(new_word_tokenized):
        gcd = math.gcd(len(old_word_tokenized), len(new_word_tokenized))
        count_dilution = len(old_word_tokenized) // gcd
        old_word_tokenized = np.repeat(old_word_tokenized, len(new_word_tokenized) // gcd)
        new_word_tokenized = np.repeat(new_word_tokenized, count_dilution)
    else:
        gcd = 1
        count_dilution = 1

    # perform this operation for each token pair in the word
    for token_old, token_new in zip(old_word_tokenized, new_word_tokenized):
        # add the translation to the dictionary
        tokenized_possible_translations[token_new][token_old] += max(1, count // count_dilution)

alignment_table_path = f'alignments/{CORPUS_LIST_STR}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}-{NEW_TOKENIZER_FRIENDLY_NAME}-{ALIGNMENT_UNIT}.fast_align.tsv'

# first pass: count the lines, so that the progress bar below knows its total
with open(alignment_table_path) as f:
    total_alignments = sum(1 for _ in f)

# second pass: accumulate the counts
with open(alignment_table_path) as f:
    for raw_line in tqdm(f, total=total_alignments):
        entry = raw_line.rstrip('\n')
        # skip empty lines
        if not entry: continue
        # every entry has four tab-separated fields
        old_word, new_word, log_prob, count = entry.split('\t')
        # reject <eps> mappings
        if '<eps>' in (old_word, new_word): continue
        # the count may be written as a float; only its integer part is used
        count = int(float(count))
        # skip pairs that happened rarely (likely noise)
        if count < MIN_COUNT_REQUIRED_FOR_CONSIDERATION: continue
        both_are_single_tokens = (new_word in new_tokenizer_vocab) and (old_word in old_tokenizer_vocab)
        if ALIGNMENT_UNIT == 'WORDS' and not both_are_single_tokens:
            # at least one side is a merged word: give half of the count to the all-to-all token
            # pairing and the other half to the position-by-position pairing
            half_count = max(1, count // 2)
            add_word_pair(half_count, new_word, old_word, all_to_all_mapping=True)
            add_word_pair(half_count, new_word, old_word, all_to_all_mapping=False)
        else:
            # add the token pair to the token dictionary
            add_token_pair(count, new_word, old_word)
        # add the word translation to the dictionary (for diagnostics purposes only)
        untokenized_possible_translations[new_word][old_word] += count

# add a mapping for all numbers
# candidates: str(i) for i in range(9999), followed by '0' + str(i) for i in range(99)
# NOTE(review): both ranges exclude their upper bound ('9999' and '099' are not covered) - confirm this is intended
number_strings = [str(i) for i in range(9999)] + ['0' + str(i) for i in range(99)]
for str_i in number_strings:
    # the number string is itself a token of the new vocabulary
    if str_i in new_tokenizer_vocab:
        if str_i in old_tokenizer_vocab:
            add_token_pair(1, str_i, str_i)
        else:
            add_token_pair(1, str_i, old_tokenizer.tokenize(str_i)[0])
    # the number is a single token for the new tokenizer: map it to the first token the old tokenizer produces
    # NOTE(review): only element [0] of the old tokenization is used - verify that it is the digits and not a bare marker
    new_pieces = new_tokenizer.tokenize(str_i)
    if len(new_pieces) == 1:
        add_token_pair(1, new_pieces[0], old_tokenizer.tokenize(str_i)[0])
    # same thing for the number preceded by a space
    new_pieces_after_space = new_tokenizer.tokenize(' ' + str_i)
    if len(new_pieces_after_space) == 1:
        add_token_pair(1, new_pieces_after_space[0], old_tokenizer.tokenize(' ' + str_i)[0])

# add a mapping for all punctuation (and words that exist in both models)
for token in new_tokenizer_vocab:
    ## skip if any token char is a letter or digit
    #if any(c.isalnum() for c in token): continue
    # start from the token itself, so that token2 is always defined and never carried over from the previous iteration
    token2 = token
    # replace the start symbol of the new model with the one of the old model
    # (skipped when the new symbol is empty: replacing '' would insert the old symbol between every character)
    if NEW_TOKENIZER_1ST_PREFIX != '':
        token2 = token2.replace(NEW_TOKENIZER_1ST_PREFIX, OLD_TOKENIZER_1ST_PREFIX)
    # replace the continuation symbol of the new model with the one of the old model
    if NEW_TOKENIZER_2ND_PREFIX != '':
        token2 = token2.replace(NEW_TOKENIZER_2ND_PREFIX, OLD_TOKENIZER_2ND_PREFIX)
    # skip if token is not in the old model
    if token2 not in old_tokenizer_vocab: continue
    # add the mapping
    tokenized_possible_translations[token][token2] += 1

def or_old_unk_token(token, fallback_token=None):
    """Return the first of `token`, `fallback_token` that is in the old vocabulary, else the old unk token."""
    for candidate in (token, fallback_token):
        if candidate is not None and candidate in old_tokenizer_vocab:
            return candidate
    return old_tokenizer.unk_token

# add a mapping for special tokens (i.e. pad, unk, bos, eos, sep, cls, mask)
# the weight is large enough to outweigh any count gathered from the alignments
very_large_number = 1_000_000_000
# (attribute of the new tokenizer, preferred attribute of the old tokenizer, fallback attribute of the old tokenizer)
special_token_attributes = [
    ('pad_token', 'pad_token', None),
    ('unk_token', 'unk_token', None),
    ('bos_token', 'bos_token', 'cls_token'),
    ('eos_token', 'eos_token', 'sep_token'),
    ('cls_token', 'cls_token', 'bos_token'),
    ('sep_token', 'sep_token', 'eos_token'),
    ('mask_token', 'mask_token', 'pad_token'),
]
for new_attribute, old_attribute, old_fallback_attribute in special_token_attributes:
    new_special_token = getattr(new_tokenizer, new_attribute)
    # special tokens that the new tokenizer does not define are skipped
    if new_special_token is None: continue
    # the old tokenizer is only queried for the special tokens that are actually mapped
    old_special_token = getattr(old_tokenizer, old_attribute)
    old_fallback_token = getattr(old_tokenizer, old_fallback_attribute) if old_fallback_attribute is not None else None
    add_token_pair(very_large_number, new_special_token, or_old_unk_token(old_special_token, old_fallback_token))

In [None]:
def get_coefficients_for_token(new_token):
    """Return the translations of `new_token` as (old_token, probability) pairs.

    The probability of a translation is its count divided by the total count of all
    translations of `new_token`; the pairs are sorted by decreasing probability.
    A token without any translation yields an empty list.
    """
    # .get() instead of indexing: the table is a defaultdict, so indexing an unknown token would
    # insert an empty entry for it, and the token would then count as "having a translation"
    possible_translations = tokenized_possible_translations.get(new_token, {})
    # get the total count of all translations
    total_count = sum(possible_translations.values())
    if total_count <= 0: return []
    # compute the probability of each translation
    probabilities = {old_token: count / total_count for old_token, count in possible_translations.items()}
    # sort the translations by probability
    return sorted(probabilities.items(), key=lambda x: x[1], reverse=True)

In [None]:
# print some pairs to see if it worked
def print_get_coefficients_for_token(token):
    """Print `token` followed by the list returned by get_coefficients_for_token for it."""
    coefficients = get_coefficients_for_token(token)
    print(f'{token}: {coefficients}')

# sample tokens, listed as lowercase/capitalised or bare/'Ġ'-prefixed pairs
sample_tokens = [
    'Ġromige', 'ĠRomige',
    'Ġkleine', 'ĠKleine',
    'zee', 'Ġzee',
    'ster', 'Ġster',
]
for sample_token in sample_tokens:
    print_get_coefficients_for_token(sample_token)

In [None]:
# check how many tokens have a translation, compared to the total number of tokens
print(f'Number of tokens with a translation: {len(tokenized_possible_translations)}')
# (this line used to print the total number of tokens under the "without a translation" label)
print(f'Number of tokens without a translation: {len(new_tokenizer) - len(tokenized_possible_translations)}')
print(f'Percentage of tokens with a translation: {int(len(tokenized_possible_translations) / len(new_tokenizer) * 1000)/10}%')

# step 2 starts from the step 1 table (shallow copy: existing entries are shared, and only new
# keys are added below) and gives tokens without a translation the translations of related tokens
tokenized_possible_translations_step2 = tokenized_possible_translations.copy()

# the vocabulary is fetched once, instead of once per unmapped token
new_vocab_tokens = list(new_tokenizer.get_vocab())

# the first 100 tokens that have no translation are printed along with the related tokens found for them
tmp_count = 0
for token in new_vocab_tokens:
    if token in tokenized_possible_translations: continue
    tmp_count += 1
    # translated tokens which start with this token
    similar_tokens = [token2 for token2 in new_vocab_tokens if token2.startswith(token) and (token2 in tokenized_possible_translations)]
    # translated tokens which are contained in this token, longest first
    middle_subset_tokens = [token2 for token2 in tokenized_possible_translations if token2 in token]
    middle_subset_tokens.sort(key=len, reverse=True)
    # remove the tokens which are included in another previous token of the list
    middle_subset_tokens = [token2 for i, token2 in enumerate(middle_subset_tokens) if not any(token2 in token3 for token3 in middle_subset_tokens[:i])]
    # sort the remaining tokens by position in the token
    middle_subset_tokens.sort(key=token.index)
    if tmp_count <= 100: print(token, similar_tokens, middle_subset_tokens)
    # nothing to borrow from: the token stays without a translation
    if len(similar_tokens) == 0 and len(middle_subset_tokens) == 0: continue
    # add the token to the updated dictionary
    tokenized_possible_translations_step2[token] = defaultdict(int)
    for token2 in similar_tokens + middle_subset_tokens:
        # add all their translations to the dictionary, normalizing to a total count of 1000 for each token2
        # (2000 for the translations which start with the word-start marker of the old tokenizer)
        count_for_token2 = sum(tokenized_possible_translations[token2].values())
        if count_for_token2 <= 0: continue
        for translation_of_token2, count_of_translation in tokenized_possible_translations[token2].items():
            # an empty marker would match every translation, so it never earns the bonus
            starts_a_word = (OLD_TOKENIZER_1ST_PREFIX != '') and translation_of_token2.startswith(OLD_TOKENIZER_1ST_PREFIX)
            weight = 2000 if starts_a_word else 1000
            tokenized_possible_translations_step2[token][translation_of_token2] += max(1, (weight * count_of_translation) // count_for_token2)

In [None]:
# check how many tokens have a translation, compared to the total number of tokens
print(f'Number of tokens with a translation: {len(tokenized_possible_translations_step2)}')
# (this line used to print the total number of tokens under the "without a translation" label)
print(f'Number of tokens without a translation: {len(new_tokenizer) - len(tokenized_possible_translations_step2)}')
print(f'Percentage of tokens with a translation: {int(len(tokenized_possible_translations_step2) / len(new_tokenizer) * 1000)/10}%')

# print every token that still has no translation (tmp_count holds how many there are)
tmp_count = 0
for token in new_tokenizer.get_vocab():
    if token not in tokenized_possible_translations_step2:
        tmp_count += 1
        print(token)

In [None]:
import json

def get_coefficients_for_token_step2(new_token):
    """Return the step 2 translations of `new_token` as (old_token, probability) pairs.

    The pairs are sorted by decreasing probability. A token that is absent from the step 2
    table, or whose counts do not add up to a positive total, is mapped entirely to the
    unknown token of the old tokenizer.
    """
    unmapped = [(old_tokenizer.unk_token, 1.0)]
    if new_token not in tokenized_possible_translations_step2:
        return unmapped
    translation_counts = tokenized_possible_translations_step2[new_token]
    count_sum = sum(translation_counts.values())
    if count_sum <= 0:
        return unmapped
    weighted_translations = [(old_token, count / count_sum) for old_token, count in translation_counts.items()]
    # stable sort: translations with the same probability keep their insertion order
    weighted_translations.sort(key=lambda pair: pair[1], reverse=True)
    return weighted_translations

# convert the dictionary to a list of sorted lists, and save it to a json file
token_mapping_path = f'alignments/{CORPUS_LIST_STR}.{OLD_LANGUAGE}-{NEW_LANGUAGE}.{OLD_TOKENIZER_FRIENDLY_NAME}-{NEW_TOKENIZER_FRIENDLY_NAME}-{ALIGNMENT_UNIT}-{MIN_COUNT_REQUIRED_FOR_CONSIDERATION}.token_mapping.json'

# one (token, coefficients) entry per token id, in id order
# (the size is taken from get_vocab(), the accessor also used when iterating over the vocabulary)
final_list = []
for token_i in range(len(new_tokenizer.get_vocab())):
    token = new_tokenizer.convert_ids_to_tokens(token_i)
    coefficients = get_coefficients_for_token_step2(token)
    final_list.append((token, coefficients))
# the file is opened in a with-block so that it is flushed and closed once the dump is done
with open(token_mapping_path, 'w', encoding='utf-8') as f:
    json.dump(final_list, f, indent='\t')

# created a more compact version of the json file
final_list_compact = {token: ",  ".join([f'{old_token} {int(100*probability)}%' for old_token, probability in coefficients]) for token, coefficients in final_list}
#json.dump(final_list_compact, open(f'{token_mapping_path}.compact.json', 'w'), indent='\t')
# tokens are written as-is (not escaped), hence the explicit encoding
with open(f'{token_mapping_path}.compact.txt', 'w', encoding='utf-8') as f:
    for key, value in final_list_compact.items():
        f.write(f'{key}\t{value}\n')