In [1]:
from tqdm import tqdm
from sacremoses import MosesTokenizer
import morfessor
import math

## Turkish (tr)

In [2]:
# Location of the Turkish training text.
tr_train_path = '/data2/Anypair/joeynmt/test/chain_translation/tr_ru/train.tr'

# Morfessor I/O helper, plus the training entries it reads from the corpus file.
# The reader's output is materialised into a list because it is loaded into several
# models (and sliced) further down.
io = morfessor.MorfessorIO()
train_data = [*io.read_corpus_file(tr_train_path)]

# Moses tokenizer configured for Turkish.
mt = MosesTokenizer(lang='tr')

In [3]:
# Load the raw Turkish training sentences, one list entry per line of the file.
# Entries keep their trailing newline; the cells that consume `corpus` strip it.
# The encoding is pinned to UTF-8 because the text contains non-ASCII Turkish letters
# and the platform default encoding is not guaranteed to be UTF-8.
with open('/data2/Anypair/joeynmt/test/chain_translation/tr_ru/train.tr', 'r', encoding='utf-8') as f:
    corpus = list(f)

In [4]:
# Number of lines read from the Turkish training file.
len(corpus)

2000000

In [5]:
# Collect every distinct character that occurs in the Moses-tokenized corpus, in order
# of first appearance.
# A dict serves as an insertion-ordered set (Python 3.7+): membership is O(1), whereas
# a `letter not in list` test rescans the whole list for every single character.
first_seen = {}
for line in tqdm(corpus):
    tokenized = mt.tokenize(line, return_str=True).strip()
    for word in tokenized.split(" "):
        # dict.update leaves already-present keys where they are, so the order of
        # first appearance is preserved.
        first_seen.update(dict.fromkeys(word))
all_unique_letters = list(first_seen)
                

100%|██████████| 2000000/2000000 [01:25<00:00, 23394.53it/s]


In [6]:
# Peek at the first 30 characters of the collected alphabet.
all_unique_letters[:30]

['h',
 'a',
 's',
 't',
 'l',
 'ı',
 'k',
 'r',
 'n',
 'm',
 'z',
 'e',
 'ğ',
 'b',
 'd',
 'ş',
 'v',
 'y',
 'ü',
 'i',
 'g',
 'u',
 'p',
 'c',
 'ç',
 'o',
 'f',
 'ö',
 '̇',
 '’']

In [8]:
def log_func(x):
    """Dampen a raw count: return log2(x + 1) rounded to the nearest integer."""
    return int(round(math.log(x + 1, 2)))


# Three Morfessor Baseline models that differ in how the training counts are weighted.
model_types = morfessor.BaselineModel()
model_logtokens = morfessor.BaselineModel()
model_tokens = morfessor.BaselineModel()

# Type-based: the count modifier maps every count to 1.
model_types.load_data(train_data, count_modifier=lambda x: 1)
# Log-token-based: counts are passed through log_func.
model_logtokens.load_data(train_data, count_modifier=log_func)
# Token-based (unmodified counts), restricted to the first 1000 training entries.
model_tokens.load_data(train_data[:1000])

models = [model_types, model_logtokens, model_tokens]

for candidate in models:
    candidate.train_batch()

# The preprocessing cells further down segment words with `model`. That name used to be
# defined only as the leftover loop variable of the training loop, i.e. the last entry
# of `models`. Bind it explicitly so the dependency is visible.
# NOTE(review): this is the model trained on only the first 1000 entries -- confirm that
# it, rather than model_types / model_logtokens, is the intended segmenter.
model = model_tokens

...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
........................................

In [9]:
# Turkish (Latin script) -> Cyrillic transliteration table, one lowercase letter per key.
# The preprocessing cells drop any character that has no entry here, so every letter of
# the Turkish alphabet needs a mapping or it silently disappears from the output.
#
# Conflations kept on purpose from the original table: 'ğ' shares 'г' with 'g', and
# 'ü' shares 'у' with 'u'.
tr_cyrillic = {
    'a': 'а', 'b': 'б', 'v': 'в', 'g': 'г', 'ğ': 'г', 'd': 'д', 'e': 'е', 'j': 'ж',
    # 'z' was missing although it occurs in the corpus; it was being deleted.
    'z': 'з',
    'i': 'и', 'y': 'й', 'k': 'к', 'l': 'л', 'm': 'м', 'n': 'н', 'o': 'о', 'ö': 'ө', 'p': 'п',
    'r': 'р', 's': 'с', 't': 'т', 'u': 'у', 'ü': 'у', 'f': 'ф',
    # Cyrillic 'х' (U+0445), not the visually identical Latin 'x' (U+0078).
    'h': 'х',
    'ç': 'ч', 'ş': 'ш', 'ı': 'ы',
    # 'c' was missing although it occurs in the corpus; it was being deleted.
    # NOTE(review): mapped to 'ж', the letter Kyrgyz orthography uses for this sound
    # (it therefore shares a target with 'j') -- confirm this is the desired choice.
    'c': 'ж',
}

In [33]:
# Convert the Turkish training corpus to Cyrillic, producing one output line per input
# line. Each space-separated word is segmented with `model`; only the first morph of the
# returned segmentation is kept, and it is mapped letter by letter through `tr_cyrillic`.
prep = []
for line in tqdm(corpus):
    sentence_words = []
    for word in line.strip().split(" "):
        try:
            first_morph = model.viterbi_segment(word)[0][0]
        except Exception:
            # Best effort: a word the segmenter cannot handle is left out of the line.
            # `Exception` rather than a bare `except` so that Ctrl-C still interrupts
            # this long-running loop instead of being swallowed.
            continue
        # Characters without a table entry are dropped, so a word made up only of such
        # characters contributes an empty string to the line.
        sentence_words.append(''.join(tr_cyrillic.get(letter, '') for letter in first_morph))
    prep.append(' '.join(sentence_words))
        

100%|██████████| 2000000/2000000 [10:31<00:00, 3167.97it/s]


In [34]:
# Write the transliterated training lines, one per line of the output file.
# UTF-8 is requested explicitly: the output is Cyrillic text, which the platform default
# encoding may not be able to represent.
with open('/data2/Anypair/joeynmt/test/chain_translation/morfessor_cyrillic_tr_ru/train.tr', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in prep)

## Validation preprocessing

In [35]:
# Load the raw Turkish validation sentences, one list entry per line of the file
# (trailing newlines are kept and stripped by the consuming cell).
# UTF-8 is pinned because the text contains non-ASCII Turkish letters and the platform
# default encoding is not guaranteed to be UTF-8.
with open('/data2/Anypair/joeynmt/test/chain_translation/tr_ru/dev.tr', 'r', encoding='utf-8') as f:
    validation_data = list(f)

In [36]:
# Convert the Turkish validation set to Cyrillic, one output line per input line.
# Each space-separated word is segmented with `model`; only the first morph of the
# returned segmentation is kept, and it is mapped letter by letter through `tr_cyrillic`.
prep_val = []
for line in tqdm(validation_data):
    sentence_words = []
    for word in line.strip().split(" "):
        try:
            first_morph = model.viterbi_segment(word)[0][0]
        except Exception:
            # Best effort: a word the segmenter cannot handle is left out of the line.
            # `Exception` rather than a bare `except` so Ctrl-C is not swallowed.
            continue
        # Characters without a table entry are dropped, so a word made up only of such
        # characters contributes an empty string to the line.
        sentence_words.append(''.join(tr_cyrillic.get(letter, '') for letter in first_morph))
    prep_val.append(' '.join(sentence_words))

100%|██████████| 10000/10000 [00:02<00:00, 3346.95it/s]


In [37]:
# Write the transliterated validation lines, one per line of the output file.
# Fix: the loop used to iterate over `prep` (the training output), which filled dev.tr
# with the training set; the validation output lives in `prep_val`.
# UTF-8 is requested explicitly because the output is Cyrillic text.
with open('/data2/Anypair/joeynmt/test/chain_translation/morfessor_cyrillic_tr_ru/dev.tr', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in prep_val)

## Test preprocessing

In [38]:
# Load the raw Turkish test sentences, one list entry per line of the file
# (trailing newlines are kept and stripped by the consuming cell).
# UTF-8 is pinned because the text contains non-ASCII Turkish letters and the platform
# default encoding is not guaranteed to be UTF-8.
with open('/data2/Anypair/joeynmt/test/chain_translation/tr_ru/test.tr', 'r', encoding='utf-8') as f:
    test_data = list(f)

In [39]:
# Convert the Turkish test set to Cyrillic, one output line per input line.
# Each space-separated word is segmented with `model`; only the first morph of the
# returned segmentation is kept, and it is mapped letter by letter through `tr_cyrillic`.
prep_test = []
for line in tqdm(test_data):
    sentence_words = []
    for word in line.strip().split(" "):
        try:
            first_morph = model.viterbi_segment(word)[0][0]
        except Exception:
            # Best effort: a word the segmenter cannot handle is left out of the line.
            # `Exception` rather than a bare `except` so Ctrl-C is not swallowed.
            continue
        # Characters without a table entry are dropped, so a word made up only of such
        # characters contributes an empty string to the line.
        sentence_words.append(''.join(tr_cyrillic.get(letter, '') for letter in first_morph))
    prep_test.append(' '.join(sentence_words))

100%|██████████| 10000/10000 [00:10<00:00, 969.63it/s]


In [40]:
# Write the transliterated test lines, one per line of the output file.
# Fix: the loop used to iterate over `prep` (the training output), which filled test.tr
# with the training set; the test output lives in `prep_test`.
# UTF-8 is requested explicitly because the output is Cyrillic text.
with open('/data2/Anypair/joeynmt/test/chain_translation/morfessor_cyrillic_tr_ru/test.tr', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in prep_test)

## Kyrgyz (ky)

In [6]:
# Location of the Kyrgyz training text.
ky_train_path = '/data/Anypair/joeynmt/test/data_ky_ru/data/train.ky'

# Fresh Morfessor I/O helper and the training entries read from the Kyrgyz corpus,
# materialised into a list so they can be loaded into several models and sliced.
io = morfessor.MorfessorIO()
train_data = [*io.read_corpus_file(ky_train_path)]

In [7]:
def log_func(x):
    """Dampen a raw count: return log2(x + 1) rounded to the nearest integer."""
    return int(round(math.log(x + 1, 2)))


# Three Morfessor Baseline models for Kyrgyz that differ in how counts are weighted.
model_types = morfessor.BaselineModel()
model_logtokens = morfessor.BaselineModel()
model_tokens = morfessor.BaselineModel()

# Type-based: the count modifier maps every count to 1.
model_types.load_data(train_data, count_modifier=lambda x: 1)
# Log-token-based: counts are passed through log_func.
model_logtokens.load_data(train_data, count_modifier=log_func)
# Token-based (unmodified counts), restricted to the first 1000 training entries.
model_tokens.load_data(train_data[:1000])

models = [model_types, model_logtokens, model_tokens]

for candidate in models:
    candidate.train_batch()

# The segmentation cell further down uses `model`. That name used to be defined only as
# the leftover loop variable of the training loop, i.e. the last entry of `models`.
# Bind it explicitly so the dependency is visible.
# NOTE(review): this is the model trained on only the first 1000 entries -- confirm that
# it, rather than model_types / model_logtokens, is the intended segmenter.
model = model_tokens

...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
........................................................
........................................................
..............................................

In [8]:
# Read the three Kyrgyz splits as raw text lines (trailing newlines are kept and are
# stripped by the consuming cell).
# NOTE: `train_data` is rebound here -- it previously held the entries read through
# MorfessorIO for model training and from this point on holds raw lines.
# UTF-8 is pinned because the text is non-ASCII and the platform default encoding is
# not guaranteed to be UTF-8.
with open('/data/Anypair/joeynmt/test/data_ky_ru/data/train.ky', 'r', encoding='utf-8') as f:
    train_data = list(f)

with open('/data/Anypair/joeynmt/test/data_ky_ru/data/dev.ky', 'r', encoding='utf-8') as f:
    dev_data = list(f)

with open('/data/Anypair/joeynmt/test/data_ky_ru/data/test.ky', 'r', encoding='utf-8') as f:
    test_data = list(f)

In [23]:
def _first_morph_lines(lines, segmenter):
    """Reduce every word of every line to the first morph of its segmentation.

    Args:
        lines: iterable of text lines; each is stripped and split on single spaces.
        segmenter: object exposing ``viterbi_segment(word)``; element ``[0][0]`` of its
            return value is taken as the word's first morph.

    Returns:
        A list with one string per input line: the kept morphs joined by single spaces.
        Words for which segmentation raises are left out (best effort).
    """
    reduced = []
    for line in tqdm(lines):
        kept = []
        for word in line.strip().split(" "):
            try:
                kept.append(segmenter.viterbi_segment(word)[0][0])
            except Exception:
                # `Exception` rather than a bare `except` so that Ctrl-C still
                # interrupts the loop instead of being swallowed.
                continue
        reduced.append(' '.join(kept))
    return reduced


# Same processing for all three Kyrgyz splits, using the trained `model`.
prep_train = _first_morph_lines(train_data, model)
prep_dev = _first_morph_lines(dev_data, model)
prep_test = _first_morph_lines(test_data, model)

100%|██████████| 150000/150000 [02:18<00:00, 1084.67it/s]
100%|██████████| 9741/9741 [00:08<00:00, 1089.01it/s]
100%|██████████| 9723/9723 [00:09<00:00, 1080.27it/s]


In [26]:
# Write each processed Kyrgyz split to its output file, one sentence per line.
# UTF-8 is requested explicitly: the text is Cyrillic, which the platform default
# encoding may not be able to represent.
for out_path, out_lines in (
    ('/data/Anypair/joeynmt/test/data_ky_ru/data_plus_morfessor_cyrillic_tr_and_ky/morfessed_train.ky', prep_train),
    ('/data/Anypair/joeynmt/test/data_ky_ru/data_plus_morfessor_cyrillic_tr_and_ky/morfessed_dev.ky', prep_dev),
    ('/data/Anypair/joeynmt/test/data_ky_ru/data_plus_morfessor_cyrillic_tr_and_ky/morfessed_test.ky', prep_test),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in out_lines)