In [1]:
from argparse import Namespace
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Notebook configuration: input/output locations, split fractions and RNG seed.
args = Namespace(
    source_data_path="../data/nmt/eng-rus.txt",
    output_data_path="../data/nmt/eng_rus.csv",
    perc_train=0.7,
    perc_val=0.15,
    perc_test=0.15,
    seed=1337
)

# The fractions are floats, so their sum is compared to 1 with a small
# tolerance; an exact `== 1.0` can fail for valid fractions whose binary
# representations do not add up to exactly 1.0.
assert args.perc_test > 0, "perc_test must be positive"
assert abs(args.perc_test + args.perc_val + args.perc_train - 1.0) < 1e-9, \
    "perc_train + perc_val + perc_test must sum to 1"

In [20]:
# Read the raw corpus; every line holds tab-separated fields.
with open(args.source_data_path, encoding='utf-8') as source_file:
    raw_lines = source_file.readlines()

# Strip the newline, lowercase the text and split each line into its fields.
lines = []
for raw_line in raw_lines:
    fields = raw_line.replace("\n", "").lower().split("\t")
    lines.append(fields)

In [21]:
# Tokenize both sides of every sentence pair; the third field of each line
# is ignored.
data = [
    {
        "english_tokens": word_tokenize(english_text, language="english"),
        "russian_tokens": word_tokenize(russian_text, language="russian"),
    }
    for english_text, russian_text, _unused in lines
]

In [22]:
# Two-token sentence openers: each pronoun paired with the full and the
# contracted form of its verb, in that order.
filter_phrases = tuple(
    (pronoun, verb_form)
    for pronoun, verb_forms in (
        ("i", ("am", "'m")),
        ("he", ("is", "'s")),
        ("she", ("is", "'s")),
        ("you", ("are", "'re")),
        ("we", ("are", "'re")),
        ("they", ("are", "'re")),
    )
    for verb_form in verb_forms
)


In [23]:
# Toggle: when True, keep only sentence pairs whose first two English tokens
# match an entry of `filter_phrases`; when False, keep every pair.
# The flag is called `use_filter` (not `filter`) so that the builtin
# `filter` stays usable in later cells.
use_filter = False

# Group the sentence pairs by their first two English tokens.
if use_filter:
    # Pre-register the allowed prefixes; a prefix with no match keeps an
    # empty list.
    data_subset = {phrase: [] for phrase in filter_phrases}
else:
    data_subset = {}

for datum in data:
    # Sentences shorter than two tokens produce a shorter (possibly empty) key.
    key = tuple(datum['english_tokens'][:2])
    if use_filter:
        # Pairs with an unlisted prefix are dropped.
        if key in data_subset:
            data_subset[key].append(datum)
    else:
        # Create the bucket on first sight of a prefix, then append.
        data_subset.setdefault(key, []).append(datum)

In [24]:
# Number of sentence pairs in each prefix group, displayed with the total.
counts = {}
for prefix, members in data_subset.items():
    counts[prefix] = len(members)
counts, sum(counts.values())

tom', 'ran'): 116,
  ('tom', 'won'): 44,
  ('wake', 'up'): 16,
  ('wash', 'up'): 1,
  ('we', 'care'): 2,
  ('we', 'know'): 309,
  ('we', 'lost'): 33,
  ('who', 'ate'): 13,
  ('who', 'ran'): 3,
  ('who', 'won'): 8,
  ('why', 'not'): 42,
  ('you', 'run'): 4,
  ('you', 'won'): 10,
  ('am', 'i'): 203,
  ('ask', 'them'): 13,
  ('back', 'off'): 11,
  ('be', 'a'): 9,
  ('be', 'brave'): 2,
  ('be', 'brief'): 4,
  ('be', 'quiet'): 17,
  ('beats', 'me'): 1,
  ('buzz', 'off'): 1,
  ('bye', ','): 1,
  ('call', 'tom'): 26,
  ('catch', 'me'): 5,
  ('cheer', 'up'): 4,
  ('cool', 'off'): 1,
  ('cuff', 'him'): 2,
  ('do', "n't"): 5882,
  ('drive', 'on'): 2,
  ('find', 'tom'): 15,
  ('fix', 'this'): 4,
  ('get', 'away'): 13,
  ('get', 'busy'): 4,
  ('get', 'down'): 22,
  ('get', 'lost'): 18,
  ('get', 'real'): 2,
  ('go', 'ahead'): 26,
  ('good', 'job'): 2,
  ('grab', 'tom'): 2,
  ('grab', 'him'): 3,
  ('have', 'fun'): 8,
  ('he', 'spoke'): 16,
  ('he', 'tried'): 59,
  ('he', "'s"): 950,
  ('help', 'tom

In [25]:
# Seed the global NumPy RNG so the shuffles below are reproducible.
np.random.seed(args.seed)

# Assign every sentence pair to a train/val/test split, group by group, so
# that each prefix group is divided according to the configured fractions.
dataset_stage3 = []
# Groups are visited in sorted key order, which fixes the order in which the
# RNG is consumed.
for phrase, datum_list in sorted(data_subset.items()):
    # Shuffle a copy instead of the list stored in `data_subset`. Shuffling
    # in place would leave `data_subset` reordered, so re-running this cell
    # would start from a different order and produce a different split even
    # though the seed is reset above.
    shuffled = list(datum_list)
    np.random.shuffle(shuffled)

    # int() truncates, so whatever is left after the train and val slices
    # (including every item of a group too small to fill them) goes to 'test'.
    n_train = int(len(shuffled) * args.perc_train)
    n_val = int(len(shuffled) * args.perc_val)

    for datum in shuffled[:n_train]:
        datum['split'] = 'train'

    for datum in shuffled[n_train:n_train+n_val]:
        datum['split'] = 'val'

    for datum in shuffled[n_train+n_val:]:
        datum['split'] = 'test'

    dataset_stage3.extend(shuffled)

print(dataset_stage3[0])

{'english_tokens': ['100', 'years', 'is', 'called', 'a', 'century', '.'], 'russian_tokens': ['100', 'лет', 'называются', 'веком', '.'], 'split': 'train'}


In [26]:
# Turn the token lists back into space-joined strings. Each record is edited
# in place: a token key is popped and its joined text is stored under a new key.
for record in dataset_stage3:
    english_tokens = record.pop('english_tokens')
    record['source_language'] = " ".join(english_tokens)
    russian_tokens = record.pop('russian_tokens')
    record['target_language'] = " ".join(russian_tokens)

In [27]:
# Build the final table from the list of per-sentence-pair records.
nmt_df = pd.DataFrame(dataset_stage3)

In [28]:
# Preview the first rows as a quick sanity check of columns and content.
nmt_df.head()

Unnamed: 0,split,source_language,target_language
0,train,100 years is called a century .,100 лет называются веком .
1,test,100 years is called a century .,сто лет называют веком .
2,train,2013 is a year i 'll never forget .,"две тысячи тринадцатый - год , который я никог..."
3,test,2013 is a year that i 'll never forget .,"две тысячи тринадцатый - год , который я никог..."
4,test,2539 is a prime number .,2539 — простое число .


In [29]:
# Write the table as CSV to the configured output path.
# NOTE(review): with pandas' default arguments the row index is written too,
# as an unnamed first column; confirm that downstream readers expect it.
nmt_df.to_csv(args.output_data_path)