In [21]:
import torchtext
import torchdata
import torch
from gensim.models import Word2Vec
import pandas as pd
import re
import fasttext
import numpy as np
import pickle
import os

In [3]:
tags = ['<SOS>', '<EOS>', '<PAD>', '<UNK>', '<NAN>', '<NUM>', '<TIME>', '<ENUM>', '<DATE>', '<PHONE>', '<EMAIL>', '<DOTS>', '<SHORT>', '<NAME>']


In [4]:
def bad_patterns_to_tags_replaser(text: str):
    text = re.sub(r'\d+\:\d+\:\d+', ' <TIME> ', text)
    text = re.sub(r'\d+\:\d+', ' <TIME> ', text)
    text = re.sub(r'\+{,1}\d{1,3}\({,1}[\-\s]{,1}\d{3}\){,1}[\-\s]{,1}\d{3}[\-\s]{,1}\d{2}[\-\s]{,1}\d{2}', ' <PHONE> ', text)
    text = re.sub('\d+/\d+/\d+', ' <DATE> ', text)
    text = re.sub('\d+-\d+-\d+', ' <DATE> ', text)
    text = re.sub('\d+th', ' <ENUM> ', text)
    text = re.sub('\d+rd', ' <ENUM> ', text)
    text = re.sub('\d+st', ' <ENUM> ', text)
    text = re.sub('[\+\-]?\d+.\d+', ' <NUM> ', text)
    text = re.sub('[\+\-]?\d+,\d+', ' <NUM> ', text)
    text = re.sub('\d+', ' <NUM> ', text)
    # text = re.sub(r'\w+\.', '<SHORT>', text[0:-1]) + '.'
    text = re.sub(',', ' , ', text)
    text = re.sub(';', ' ; ', text)
    text = re.sub(';', ' ; ', text)
    text = re.sub('-', ' - ', text)
    text = re.sub(':', ' : ', text)
    text = re.sub('\?', ' \? ', text)
    text = re.sub('\...', ' <DOTS> ', text)
    text = re.sub('"', ' " ', text)
    text = re.sub("'s", " 's ", text)
    text = re.sub("'d", " 'd ", text)
    text = re.sub("'re", " 're ", text)
    text = re.sub("'m", " 'm ", text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub('\.', ' . ', text)
    return text[0:-1]


In [5]:
bad_patterns_to_tags_replaser(
    "I'd wake up at 7:30 or 100:02:20, go... downstairs, and the 123,45%, 123.45%, +123,45%, -123,45%, +123.45%, -123.45%, front phones [+7 999 444 55 66, +7-999-444-55-66, +79994445566, 79994445566, +7(999)-444-55-66, +7(999)-444-5566] door would be open - 600 beers in the kitchen and 20th, 3rd, 1st living room and nobody in the house on 12/12/2012 or 12-12-2012.".lower()
)


"i 'd wake up at <TIME> or <TIME> , go <DOTS> downstairs , and the <NUM> % , <NUM> % , <NUM> % , <NUM> % , <NUM> % , <NUM> % , front phones [ <PHONE> , <PHONE> , <PHONE> , <PHONE> , <PHONE> , <PHONE> ] door would be open - <NUM> beers in the kitchen and <ENUM> , <ENUM> , <ENUM> living room and nobody in the house on <DATE> or <DATE>  ."

In [6]:
def print_simple_data_pipe(date_pipe: torchdata.datapipes.iter.IterDataPipe, n=10) -> None:
  print(type(date_pipe))
  x = 0
  for sample in date_pipe:
    print(sample)
    if x == n:
      break
    x +=1


# Create Dataset

In [7]:
with open('../Data/1mcorpus/corpus.en_ru.1m.en', encoding='utf-8') as f:
    eng = f.read().split('\n')
with open('../Data/1mcorpus/corpus.en_ru.1m.ru', encoding='utf-8') as f:
    rus = f.read().split('\n')


In [8]:
pd.DataFrame({'eng': eng, 'rus': rus}).to_csv('../Data/1mcorpus/data.csv', index=False)

In [9]:
max_len = len(bad_patterns_to_tags_replaser(max(eng + rus, key=len)).split(' '))


# Load Dataset using torchdata

In [10]:
eng_rus_pairs = torchdata.datapipes.iter.IterableWrapper(['../Data/1mcorpus/data.csv'])

In [11]:
eng_rus_pairs_pipe = torchdata.datapipes.iter.FileOpener(eng_rus_pairs, mode='r', encoding='utf-8', )

In [16]:
eng_rus_pairs_pipe_parsed = eng_rus_pairs_pipe.parse_csv(skip_lines=1, delimiter=',')
print_simple_data_pipe(eng_rus_pairs_pipe_parsed, 3)


<class 'torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe'>
["This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots.", 'Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.']
['A nondisclosure clause in the final settlement (the band is back on Elektra) prevents Ulrich, an irrepressible motormouth, from providing any juicy contractual details.', 'Решение суда (группа вернулась под крыло к Elektra Entertainment) предотвратило дальнейшие нападки со стороны неугомонного Ульриха и не позволило ему обнародовать детали нового контракта.']
["When you're 18 or 19 years old, you have that gang mentality in your band.", 'Когда тебе 

# Make Vocab

In [12]:
def tokenize(text: str) -> list[str]:
  return [t for t in bad_patterns_to_tags_replaser(text.lower()).split()]


In [13]:
def yield_tokens_eng(data_iter: torchdata.datapipes.iter.IterDataPipe):
  for eng, rus in data_iter:
    yield tokenize(eng)


In [14]:
def yield_tokens_rus(data_iter: torchdata.datapipes.iter.IterDataPipe):
  for eng, rus in data_iter:
    yield tokenize(rus)


In [17]:
print_simple_data_pipe(yield_tokens_eng(eng_rus_pairs_pipe_parsed), 3)


<class 'generator'>
['this', 'new', 'development', 'in', 'harry', "'s", 'character', 'may', 'be', 'a', 'disappointment', 'to', 'those', 'readers', 'who', 'enjoyed', 'his', 'old', 'vindictive', 'ways', ',', 'but', 'it', 'also', 'reinforces', 'the', 'position', 'of', 'pro', '-', 'potter', 'people', 'who', 'do', 'not', 'see', 'beneath', 'the', 'surface', 'appearance', 'of', 'the', 'characters', 'and', 'plots', '.']
['a', 'nondisclosure', 'clause', 'in', 'the', 'final', 'settlement', '(the', 'band', 'is', 'back', 'on', 'elektra)', 'prevents', 'ulrich', ',', 'an', 'irrepressible', 'motormouth', ',', 'from', 'providing', 'any', 'juicy', 'contractual', 'details', '.']
['when', 'you', "'re", '<NUM>', 'or', '<NUM>', 'years', 'old', ',', 'you', 'have', 'that', 'gang', 'mentality', 'in', 'your', 'band', '.']
['now', 'you', 'have', 'black', 'sabbath', 'and', 'kiss', 'tribute', 'albums', '.']


In [18]:
print_simple_data_pipe(yield_tokens_rus(eng_rus_pairs_pipe_parsed), 3)


<class 'generator'>
['такое', 'развитие', 'характера', 'гарри', 'может', 'разочаровать', 'читателей', ',', 'полюбивших', 'его', 'былую', 'мстительность', ',', 'но', 'с', 'другой', 'стороны', 'это', 'преображение', 'укрепляет', 'позицию', 'тех', ',', 'кто', 'не', 'видит', 'глубже', 'сюжета', 'и', 'изображения', 'героев', '.']
['решение', 'суда', '(группа', 'вернулась', 'под', 'крыло', 'к', 'elektra', 'entertainment)', 'предотвратило', 'дальнейшие', 'нападки', 'со', 'стороны', 'неугомонного', 'ульриха', 'и', 'не', 'позволило', 'ему', 'обнародовать', 'детали', 'нового', 'контракта', '.']
['когда', 'тебе', '<NUM>', 'или', '<NUM>', 'лет', ',', 'легко', 'перенимать', 'бандитские', 'повадки', 'и', 'переносить', 'их', 'в', 'группу', '.']
['а', 'сейчас', 'куча', 'триьютов', 'тем', 'же', 'самым', 'black', 'sabbath', 'и', 'kiss', '.']


In [19]:
eng_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens_eng(eng_rus_pairs_pipe_parsed),
    min_freq=2,
    specials=tags,
    special_first=True
)
eng_vocab.set_default_index(eng_vocab['<UNK>'])


In [20]:
rus_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens_rus(eng_rus_pairs_pipe_parsed),
    min_freq=2,
    specials=tags,
    special_first=True
)
rus_vocab.set_default_index(rus_vocab['<UNK>'])


In [23]:
if not os.path.isfile('./eng_vocab.pkl'):
    with open('./eng_vocab.pkl', 'wb') as f:
        pickle.dump(eng_vocab, f)
else:
    print('already exist')


already exist


In [24]:
if not os.path.isfile('./rus_vocab.pkl'):
    with open('./rus_vocab.pkl', 'wb') as f:
        pickle.dump(rus_vocab, f)
else:
    print('already exist')


# W2V

In [25]:
w2v_rus = Word2Vec(
    sentences=list(yield_tokens_rus(eng_rus_pairs_pipe_parsed)),
    vector_size=128,
    min_count=1, 
    window=5, 
    workers=4, 
    epochs=10, 
    compute_loss=True
)


In [26]:
w2v_eng = Word2Vec(
    sentences=list(yield_tokens_eng(eng_rus_pairs_pipe_parsed)),
    vector_size=128,
    min_count=1,
    window=5,
    workers=4,
    epochs=10,
    compute_loss=True
)


In [27]:
if not os.path.isfile('./w2v_eng.model'):
    w2v_eng.save('./w2v_eng.model')
else:
    print('already exist')


In [28]:
if not os.path.isfile('./w2v_rus.model'):
    w2v_eng.save('./w2v_rus.model')
else:
    print('already exist')


# Preprocessed

In [301]:
def vocab_transform(vocab: torchtext.vocab.Vocab) -> torchtext.transforms.Sequential:
    text_tranform = torchtext.transforms.Sequential(
        torchtext.transforms.VocabTransform(vocab=vocab),
        torchtext.transforms.AddToken(vocab['<SOS>'], begin=True),
        torchtext.transforms.AddToken(vocab['<EOS>'], begin=False)
    )
    return text_tranform


In [313]:
def apply_vocab_transform(pair):
    return (
        vocab_transform(eng_vocab)(tokenize(pair[0])),
        vocab_transform(rus_vocab)(tokenize(pair[1]))
    )


In [328]:
def apply_vocab_vectorise(pair):
    eng = []
    rus = []
    for i in pair[0]:
        if i < len(tags):
            eng.append(np.ones(w2v_eng.vector_size) * i / len(tags))
        else:
            eng.append(w2v_eng.wv[i].tolist())
    for i in pair[1]:
        if i < len(tags):
            rus.append(np.ones(w2v_rus.vector_size) * i / len(tags))
        else:
            rus.append(w2v_rus.wv[i].tolist())
    return (torch.tensor(eng), torch.tensor(rus))


In [314]:
eng_rus_pairs_pipe_transformed = eng_rus_pairs_pipe_parsed.map(apply_vocab_transform)
print_simple_data_pipe(eng_rus_pairs_pipe_transformed, 3)


<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>
([0, 33, 67, 110, 18, 4588, 38, 1357, 85, 30, 19, 9970, 17, 127, 3579, 68, 3764, 58, 302, 46414, 799, 13, 51, 27, 64, 16778, 12, 564, 15, 2069, 20, 10235, 77, 68, 88, 37, 161, 6151, 12, 1471, 2332, 15, 12, 2435, 16, 7153, 14, 1], [0, 459, 370, 1559, 4596, 52, 75283, 6306, 12, 290035, 37, 70130, 276120, 12, 35, 19, 248, 158, 29, 71785, 13062, 2235, 170, 12, 123, 20, 3941, 8887, 26763, 15, 1266, 8854, 13, 1])
([0, 19, 57752, 5232, 18, 12, 941, 2080, 914, 1592, 21, 231, 25, 3, 5438, 24176, 13, 44, 35133, 124305, 13, 35, 899, 79, 20172, 6594, 1093, 14, 1], [0, 264, 917, 31834, 11454, 104, 17778, 24, 136993, 156241, 75070, 7429, 31135, 83, 158, 3, 121214, 15, 20, 3901, 188, 35976, 2900, 450, 4622, 13, 1])
([0, 66, 32, 469, 5, 34, 5, 111, 302, 13, 32, 41, 23, 7985, 7725, 18, 56, 1592, 14, 1], [0, 57, 1203, 5, 28, 5, 99, 12, 593, 71555, 138968, 100289, 15, 16696, 46, 14, 1428, 13, 1])
([0, 122, 32, 41, 845, 8916, 16, 8366, 

In [333]:
eng_rus_pairs_pipe_vectorized = eng_rus_pairs_pipe_transformed.map(apply_vocab_vectorise)
x = 0
for i in eng_rus_pairs_pipe_vectorized:
    print(i[0].shape, i[1].shape)
    if x == 3:
        break
    x += 1


torch.Size([48, 128]) torch.Size([34, 128])
torch.Size([29, 128]) torch.Size([27, 128])
torch.Size([20, 128]) torch.Size([19, 128])
torch.Size([12, 128]) torch.Size([14, 128])


In [346]:
eng_rus_pairs_pipe_batch = eng_rus_pairs_pipe_transformed.bucketbatch(
    batch_size=256,
    use_in_batch_shuffle=False
)


In [347]:
def separate(pair):
    examples, targets = zip(*pair)
    return examples, targets


In [354]:
def apply_padding(pair):
    print(len(pair[0]))
    return (torchtext.transforms.ToTensor(eng_vocab['<PAD>'])([list(pair[0])]), torchtext.transforms.ToTensor(eng_vocab['<PAD>'])([list(pair[1])]))

In [355]:
eng_rus_pairs_pipe_eqlength = eng_rus_pairs_pipe_transformed.map(apply_padding)
x = 0
for i in eng_rus_pairs_pipe_eqlength:
    print(i[0].shape, i[1].shape)
    print(i)
    if x == 3:
        break
    x += 1


48
torch.Size([1, 48]) torch.Size([1, 34])
(tensor([[    0,    33,    67,   110,    18,  4588,    38,  1357,    85,    30,
            19,  9970,    17,   127,  3579,    68,  3764,    58,   302, 46414,
           799,    13,    51,    27,    64, 16778,    12,   564,    15,  2069,
            20, 10235,    77,    68,    88,    37,   161,  6151,    12,  1471,
          2332,    15,    12,  2435,    16,  7153,    14,     1]]), tensor([[     0,    459,    370,   1559,   4596,     52,  75283,   6306,     12,
         290035,     37,  70130, 276120,     12,     35,     19,    248,    158,
             29,  71785,  13062,   2235,    170,     12,    123,     20,   3941,
           8887,  26763,     15,   1266,   8854,     13,      1]]))
29
torch.Size([1, 29]) torch.Size([1, 27])
(tensor([[     0,     19,  57752,   5232,     18,     12,    941,   2080,    914,
           1592,     21,    231,     25,      3,   5438,  24176,     13,     44,
          35133, 124305,     13,     35,    899,     79

# Bad Try

In [94]:
eng = torchdata.datapipes.iter.IterableWrapper(['../Data/1mcorpus/corpus.en_ru.1m.en'])
rus = torchdata.datapipes.iter.IterableWrapper(['../Data/1mcorpus/corpus.en_ru.1m.ru'])


In [95]:
eng_pipe = torchdata.datapipes.iter.FileOpener(eng, mode='r', encoding='utf-8', )
rus_pipe = torchdata.datapipes.iter.FileOpener(rus, mode='r', encoding='utf-8', )

In [96]:
eng_pipe_parsed = eng_pipe.parse_csv(skip_lines=0, delimiter='\n')
print_simple_data_pipe(eng_pipe_parsed, 3)
rus_pipe_parsed = rus_pipe.parse_csv(skip_lines=0, delimiter='\n')
print_simple_data_pipe(rus_pipe_parsed, 3)



<class 'torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe'>
["This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots."]
['A nondisclosure clause in the final settlement (the band is back on Elektra) prevents Ulrich, an irrepressible motormouth, from providing any juicy contractual details.']
["When you're 18 or 19 years old, you have that gang mentality in your band."]
['Now you have Black Sabbath and Kiss tribute albums.']
<class 'torchdata.datapipes.iter.util.plain_text_reader.CSVParserIterDataPipe'>
['Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.']
['Решение суда (группа вернулась под крыло к Elektra Entertainment

In [100]:
def tokenize(text: str) -> list[str]:
  return [bad_patterns_to_tags_replaser(t.lower()) for t in text.split()]


In [101]:
def yield_tokens(data_iter: torchdata.datapipes.iter.IterDataPipe):
  for example in data_iter:
    yield tokenize(example[0])


In [107]:
eng_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(eng_pipe_parsed),
    min_freq=2,
    specials=tags,
    special_first=True
)
eng_vocab.set_default_index(eng_vocab['<UNK>'])


In [108]:
rus_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(rus_pipe_parsed),
    min_freq=2,
    specials=tags,
    special_first=True
)
rus_vocab.set_default_index(rus_vocab['<UNK>'])


In [110]:
def vocab_transform(vocab: torchtext.vocab.Vocab) -> torchtext.transforms.Sequential:
    text_tranform = torchtext.transforms.Sequential(
        torchtext.transforms.VocabTransform(vocab=vocab),
        torchtext.transforms.AddToken(vocab['<SOS>'], begin=True),
        torchtext.transforms.AddToken(vocab['<EOS>'], begin=False)
    )
    return text_tranform


In [117]:
def apply_vocab_transform_eng(text: list[str]):
    return vocab_transform(eng_vocab)(tokenize(text[0]))


In [119]:
eng_pipe_mod = eng_pipe_parsed.map(apply_vocab_transform_eng)
