In [1]:
import requests
from argparse import Namespace
from pathlib import Path
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd

In [2]:
# Notebook-wide settings, bundled in one Namespace so cells read them as attributes.
args = Namespace(
    # Where the raw tab-separated sentence-pair file is fetched from.
    url="https://drive.google.com/uc?export=download&id=1o2ac0EliUod63sYUdpow_Dh-OqS3hF5Z",
    # Keep the trailing slash: the download and load cells build the file path
    # by plain string concatenation of these two values.
    data_base_path="../../data/mt/",
    data_filename="eng_fra.txt",
    # Fraction of each phrase bucket assigned to every split.
    split_weights={"train": 0.7, "test": 0.15, "val": 0.15},
    # Seed handed to NumPy before shuffling.
    seed=1337,
)

In [3]:
def download_and_save(args, timeout=30):
    """Stream the file at ``args.url`` to disk.

    The destination is ``args.data_base_path + args.data_filename``; the
    directory is created (with parents) if it does not exist and an existing
    file is overwritten.

    Args:
        args: object exposing ``url``, ``data_base_path`` and ``data_filename``.
        timeout: seconds handed to ``requests.get``. Without it a server that
            stops responding would block the notebook indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    with requests.get(args.url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        # Only create the directory once the response is known to be good.
        Path(args.data_base_path).mkdir(parents=True, exist_ok=True)
        fname = args.data_base_path + args.data_filename
        with open(fname, "wb") as f:
            print(f"Saving the content in the file {fname}")
            # Write in chunks so the whole payload is never held in memory.
            for chunk in r.iter_content(chunk_size=100000):
                f.write(chunk)

In [4]:
# Fetch the corpus (needs network access); the target file is opened in "wb"
# mode, so re-running this cell overwrites any previous download.
download_and_save(args)

Saving the content in the file ../../data/mt/eng_fra.txt


In [5]:
with open(args.data_base_path+args.data_filename) as f:
    lines = f.readlines()

lines = [line.replace("\n","").lower().split("\t") for line in lines]

In [6]:
# Sanity check: the first parsed row, English text first and French second.
lines[0]

['go.', 'va !']

In [7]:
# Tokenize both sides of every pair, telling NLTK which language each side is in.
data = [
    {
        "english_tokens": word_tokenize(english, language="english"),
        "french_tokens": word_tokenize(french, language="french"),
    }
    for english, french in lines
]

In [8]:
# Sanity check: token lists produced for the first sentence pair.
data[0]

{'english_tokens': ['go', '.'], 'french_tokens': ['va', '!']}

In [9]:
# Two-token sentence openers to keep: every subject pronoun paired with the
# full and the contracted form of "to be" (contractions appear as their own
# token, e.g. "'m").
filter_phrases = tuple(
    (pronoun, verb_form)
    for pronoun, verb_forms in (
        ("i", ("am", "'m")),
        ("he", ("is", "'s")),
        ("she", ("is", "'s")),
        ("you", ("are", "'re")),
        ("we", ("are", "'re")),
        ("they", ("are", "'re")),
    )
    for verb_form in verb_forms
)

In [10]:
# Bucket the sentence pairs by their first two English tokens; pairs whose
# opener is not one of the filter phrases are dropped.
data_subset = {phrase: [] for phrase in filter_phrases}
for datum in data:
    bucket = data_subset.get(tuple(datum['english_tokens'][:2]))
    # Compare against None: an empty bucket is falsy but still a valid match.
    if bucket is not None:
        bucket.append(datum)


In [11]:
# Number of sentence pairs kept per opening phrase, shown with the overall total.
counts = {phrase: len(examples) for phrase, examples in data_subset.items()}
counts, sum(counts.values())

({('i', 'am'): 805,
  ('i', "'m"): 4760,
  ('he', 'is'): 1069,
  ('he', "'s"): 787,
  ('she', 'is'): 504,
  ('she', "'s"): 316,
  ('you', 'are'): 449,
  ('you', "'re"): 2474,
  ('we', 'are'): 181,
  ('we', "'re"): 1053,
  ('they', 'are'): 194,
  ('they', "'re"): 470},
 13062)

In [12]:
# Re-seed at the top of the cell so the shuffles below are reproducible.
np.random.seed(args.seed)

dataset_stage3 = []
# Split each phrase bucket on its own so every split keeps the same mix of
# opening phrases; buckets are visited in sorted key order.
for phrase, datum_list in sorted(data_subset.items()):
    # Shuffle a copy. Shuffling the bucket in place would leave data_subset
    # reordered, so a second run of this cell would start from a different
    # order and produce a different split despite the fixed seed.
    shuffled = list(datum_list)
    np.random.shuffle(shuffled)

    # Sizes are truncated to ints; "test" receives whatever is left over, so
    # args.split_weights["test"] is implied rather than read.
    n_train = int(len(shuffled) * args.split_weights["train"])
    n_val = int(len(shuffled) * args.split_weights["val"])
    n_test = len(shuffled) - n_train - n_val
    split_labels = ["train"] * n_train + ["val"] * n_val + ["test"] * n_test

    # Tag copies of the records so the dicts shared with `data` and
    # `data_subset` stay untouched by this and later cells.
    dataset_stage3.extend(
        {**datum, "split": label} for datum, label in zip(shuffled, split_labels)
    )

In [13]:
# Replace the token lists with space-joined strings under language-neutral keys.
for datum in dataset_stage3:
    # Records converted by an earlier run no longer carry the token keys;
    # skipping them makes a re-run a no-op instead of a KeyError.
    if 'english_tokens' not in datum:
        continue
    datum['source_language'] = " ".join(datum.pop('english_tokens'))
    datum['target_language'] = " ".join(datum.pop('french_tokens'))

In [14]:
# Tabulate the records: one row per dict in dataset_stage3.
nmt_df = pd.DataFrame(dataset_stage3)

In [15]:
# Preview the first rows to confirm the split / source / target columns.
nmt_df.head()

Unnamed: 0,split,source_language,target_language
0,train,he 's the cutest boy in town .,c'est le garçon le plus mignon en ville .
1,train,he 's a nonsmoker .,il est non-fumeur .
2,train,he 's smarter than me .,il est plus intelligent que moi .
3,train,he 's a lovely young man .,c'est un adorable jeune homme .
4,train,he 's three years older than me .,il a trois ans de plus que moi .


In [19]:
# Write one CSV per split, named after the split ("train.csv", "test.csv", ...).
# Iterate the keys directly: the weights are not needed here, and unpacking
# them into `_` would rebind the name IPython uses for the last cell output.
for fname in args.split_weights:
    # Single .loc selection (row mask + column list) instead of chained indexing.
    df = nmt_df.loc[nmt_df["split"] == fname, ["source_language", "target_language"]]
    # Join with Path so the file lands inside data_base_path whether or not the
    # configured directory string ends with a separator.
    df.to_csv(Path(args.data_base_path) / f"{fname}.csv", index=False)