## Download all the data we need for Wat zei je benchmarks

Training
- news-commentary-v13.de-en
- wmt13-commoncrawl.de-en
- wmt13-europarl.de-en

Test
- wmt14-newstest2014-de-en

Test OOD
- news-commentary-v14.nl-en

In [2]:
# Parallel German-English corpora used for training.
# Each entry holds:
#   link      - URL of the archive to download
#   subfolder - presumably the folder inside the archive that contains the
#               files (not read by the download code; verify before relying on it)
#   de_file   - file name of the German side
#   en_file   - file name of the English side
training_sets = [
    {
        "link": "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz",
        "subfolder": "",
        # The German side must end in ".de"; pointing both keys at the ".en"
        # file would drop the German half of the corpus.
        "de_file": "news-commentary-v13.de-en.de",
        "en_file": "news-commentary-v13.de-en.en"
    },
    {
        "link": "https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
        "subfolder": "",
        "de_file": "commoncrawl.de-en.de",
        "en_file": "commoncrawl.de-en.en"
    },
    {
        "link": "https://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
        "subfolder": "training/",
        "de_file": "europarl-v7.de-en.de",
        "en_file": "europarl-v7.de-en.en"
    }
]

# In-domain test set (same keys as the training entries).
test_set = {
    "link":  "https://www.statmt.org/wmt14/test.tgz",
    "subfolder": "test/",
    "de_file": "newstest2014-deen-src.de.sgm",
    "en_file": "newstest2014-deen-src.en.sgm"
}

# Out-of-distribution test set: a single gzip-compressed TSV file.
#   file_name      - name of the file once decompressed
#   file_extension - suffix stripped from file_name when naming the split outputs
test_ood = {
    "link": "https://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-nl.tsv.gz",
    "subfolder": "",
    "file_name": "news-commentary-v14.en-nl.tsv",
    "file_extension": ".tsv"
}

import requests
import tarfile
import gzip
import shutil
import os
# Root folder that receives every downloaded and extracted file.
data_directory = "./local/data"
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(data_directory, exist_ok=True)


In [3]:
def download_and_extract(url, de_file=None, en_file=None):
    """Download an archive and unpack it into ``data_directory``.

    The archive is streamed to a file in the current working directory
    (named after the last path component of ``url``) and deleted again
    afterwards, also when the download or the extraction fails.

    Args:
        url: Address of a ``.tgz``/``.tar.gz`` tarball or of a single
            gzip-compressed ``.gz`` file.
        de_file: Name suffix of a tar member to extract.
        en_file: Name suffix of a second tar member to extract. If either
            name is ``None`` the whole tarball is extracted. Both names are
            ignored for plain ``.gz`` files.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server stops responding for 60 seconds.
    """
    local_filename = url.split('/')[-1]
    try:
        # The timeout bounds each connect/read wait so a stalled server
        # cannot hang the notebook forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        if local_filename.endswith((".tgz", ".tar.gz")):
            # Use the safe "data" extraction filter where this Python
            # version provides it: the archives come from the network.
            extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
            with tarfile.open(local_filename, "r:gz") as tar:
                if de_file is None or en_file is None:
                    tar.extractall(path=data_directory, **extract_kwargs)
                else:
                    wanted_suffixes = (de_file, en_file)
                    for member in tar.getmembers():
                        if member.name.endswith(wanted_suffixes):
                            tar.extract(member, path=data_directory, **extract_kwargs)
        elif local_filename.endswith(".gz"):
            # Plain gzip file: decompress to the same name minus ".gz".
            with gzip.open(local_filename, 'rb') as f_in:
                with open(os.path.join(data_directory, local_filename[:-3]), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
    finally:
        # Never leave a (possibly partial) archive behind; the file may not
        # exist at all when the request itself failed.
        if os.path.exists(local_filename):
            os.remove(local_filename)

# Fetch the bilingual tarballs (training corpora first, then the test set),
# extracting only the German and English files named in each entry.
for corpus in [*training_sets, test_set]:
    download_and_extract(corpus["link"], corpus["de_file"], corpus["en_file"])

# The out-of-distribution set is a single gzip file, so no member names apply.
download_and_extract(test_ood["link"])

In [5]:
# Sort the extracted files into training/, test/ and test_ood/ folders
# below data_directory and split the OOD TSV into one file per language.

training_dir = os.path.join(data_directory, "training")
test_dir = os.path.join(data_directory, "test")
test_ood_dir = os.path.join(data_directory, "test_ood")

for split_dir in (training_dir, test_dir, test_ood_dir):
    os.makedirs(split_dir, exist_ok=True)


def _locate_extracted(file_name, subfolder=""):
    """Return the path of an extracted file below data_directory, or None."""
    for candidate in (
        os.path.join(data_directory, file_name),
        os.path.join(data_directory, subfolder, file_name),
    ):
        if os.path.isfile(candidate):
            return candidate
    # Tar extraction keeps whatever folder layout the archive uses, so the
    # file may sit in a nested folder that the config does not name.
    for root, _dirs, files in os.walk(data_directory):
        if file_name in files:
            return os.path.join(root, file_name)
    return None


def _move_into(source_path, target_dir):
    """Move a file into target_dir, replacing a copy left by an earlier run."""
    if source_path is None:
        return
    target_path = os.path.join(target_dir, os.path.basename(source_path))
    if os.path.abspath(source_path) == os.path.abspath(target_path):
        return  # already extracted straight into its final folder
    if os.path.exists(target_path):
        # Re-running the cell must not fail on the previous run's output.
        os.remove(target_path)
    shutil.move(source_path, target_path)


# Move training data
for dataset in training_sets:
    for language_key in ("de_file", "en_file"):
        _move_into(
            _locate_extracted(dataset[language_key], dataset.get("subfolder", "")),
            training_dir,
        )

# Move test data
for language_key in ("de_file", "en_file"):
    _move_into(
        _locate_extracted(test_set[language_key], test_set.get("subfolder", "")),
        test_dir,
    )

# Move test_ood data and convert from .tsv to .en and .nl
test_ood_file_path = os.path.join(data_directory, test_ood["file_name"])
if os.path.exists(test_ood_file_path):
    en_lines = []
    nl_lines = []
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(test_ood_file_path, 'r', encoding="utf-8") as f:
        for line in f:
            tab_split = line.rstrip("\n").split("\t")
            # The last column is Dutch. Re-adding the newline keeps both
            # output files aligned even if the final line has none.
            nl_lines.append(tab_split[-1] + "\n")
            # 32 lines have a tab in the english sentence that needs to be merged
            en_lines.append(" ".join(tab_split[:-1]) + "\n")

    ood_name = test_ood["file_name"]
    ood_extension = test_ood["file_extension"]
    if ood_extension and ood_name.endswith(ood_extension):
        ood_stem = ood_name[:-len(ood_extension)]
    else:
        ood_stem = ood_name

    with open(os.path.join(test_ood_dir, ood_stem + ".en"), "w", encoding="utf-8") as en_out:
        en_out.writelines(en_lines)
    with open(os.path.join(test_ood_dir, ood_stem + ".nl"), "w", encoding="utf-8") as nl_out:
        nl_out.writelines(nl_lines)
    os.remove(test_ood_file_path)

# Remove empty directories. Walking bottom-up and re-checking the directory
# on disk also removes parents that only became empty during this cleanup.
for root, _dirs, _files in os.walk(data_directory, topdown=False):
    if root != data_directory and not os.listdir(root):
        os.rmdir(root)