In [1]:
! pip install transformers sacremoses SentencePiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 41.1 MB/s 
[?25hCollecting SentencePiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 51.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 49.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 55.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux201

## ...

#### Dataset resources: 
* OpenSubtitles
* TED2020
* CommonCrawl Aligned
* WikiMatrix


In [2]:
import os 
import numpy as np

from datasets import Dataset, DatasetDict
import pyarrow as pa
import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
# Colab-only setup: expose Google Drive inside the VM's file system.
from google.colab import drive

drive.mount('/content/drive')

# Work from the project folder so that os.getcwd()-based paths in later
# cells resolve inside it.
os.chdir('/content/drive/MyDrive/MT_final_project')

# Use the GPU when the runtime provides one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

Mounted at /content/drive


In [4]:
from typing import List


def reformatData(da: List[str], en: List[str], dev=False, use_in_dom_split=False, device=None, seed=None):
  """Turn two parallel sentence lists into a split ``datasets.DatasetDict``.

  Args:
    da: Danish sentences; ``da[i]`` is paired with ``en[i]``.
    en: English sentences, same length as ``da``.
    dev: When truthy, also carve a ``'dev'`` split out of the training data.
    use_in_dom_split: Selects the in-domain split ratios instead of the
      out-of-domain ones (see below).
    device: Unused. Kept only so existing calls that pass it keep working.
    seed: Optional seed forwarded to ``train_test_split`` so the split can be
      reproduced. ``None`` (the default) leaves the split unseeded.

  Returns:
    A ``DatasetDict`` with columns ``'da'`` and ``'en'`` and the keys
    ``'train'`` and ``'test'``, plus ``'dev'`` when ``dev`` is truthy.

    Split ratios (train/test/dev) with ``dev`` enabled:
      * out of domain: 60/20/20
      * in domain:     90/5/5
    Without ``dev`` the dev share stays in train (80/20 or 95/5).

  Raises:
    ValueError: If ``da`` and ``en`` do not have the same number of entries.
  """
  # Fail early with a readable message instead of an Arrow error further down.
  if len(da) != len(en):
    raise ValueError(
        f'da and en must be the same length, got {len(da)} and {len(en)}')

  # Build the table from the *arguments* (not from notebook globals).
  pa_da = pa.array(da)
  pa_en = pa.array(en)
  pa_tab = pa.Table.from_arrays([pa_da, pa_en], names=['da', 'en'])
  data = Dataset(pa_tab)

  if use_in_dom_split:
    # Dev is taken from the remaining 95%: 0.95 * x = 0.05  ->  x = 0.05263157894
    test_size, val_size = 0.05, 0.05263157894
  else:
    # Dev is taken from the remaining 80%: 0.80 * 0.25 = 0.20
    test_size, val_size = 0.2, 0.25

  train_test = data.train_test_split(test_size=test_size, seed=seed)
  if not dev:
    return DatasetDict({
        'train': train_test['train'],
        'test': train_test['test']})

  # Second split: peel the dev set off the training portion.
  train_dev = train_test['train'].train_test_split(test_size=val_size, seed=seed)
  return DatasetDict({
      'train': train_dev['train'],
      'test': train_test['test'],
      'dev': train_dev['test']})
  


# Open Subtitles

In [None]:
# OpenSubtitles da-en files live in ./data relative to the working directory.
# Pass the components separately so os.path.join actually builds the path.
da_path = os.path.join(os.getcwd(), 'data', 'OpenSubtitles.da-en.da')
en_path = os.path.join(os.getcwd(), 'data', 'OpenSubtitles.da-en.en')

# load data: each line of a file is treated as one sentence
with open(da_path, encoding='utf-8') as data:
  da_list = data.readlines()

with open(en_path, encoding='utf-8') as data:
  en_list = data.readlines()

# Sentence length is approximated as the number of space-separated tokens.
da_avg = np.mean([len(e.split(' ')) for e in da_list])
en_avg = np.mean([len(e.split(' ')) for e in en_list])
print(f'Average danish sentence length:{da_avg} ')
print(f'Average english sentence length:{en_avg} ')

Average danish sentence length:5.881326138277416 
Average danish sentence length:6.6519567525637555 


# WikiMatrix

In [None]:
# WikiMatrix da-en pair, read from the Colab VM's /content directory.
da_path, en_path = '/content/WikiMatrix.da-en.da', '/content/WikiMatrix.da-en.en'

# Danish side: one list entry per line of the file.
with open(da_path, encoding='utf-8') as data:
  da_list = list(data)

# English side, read the same way.
with open(en_path, encoding='utf-8') as data:
  en_list = list(data)

# TED2020

In [5]:
da_path = '/content/TED2020.da-en.da'
en_path = '/content/TED2020.da-en.en'

# load danish sentences
with open(da_path, encoding='utf-8') as data:
  da_list = data.readlines()
# load english data
with open(en_path, encoding='utf-8') as data:
  en_list = data.readlines()

# Line i of one file is paired with line i of the other, so the counts must match.
assert len(da_list) == len(en_list), (
    f'TED2020 is misaligned: {len(da_list)} Danish vs {len(en_list)} English lines')

# In-domain split (90/5/5) with a dev set.
datadict = reformatData(da_list, en_list, dev=True, use_in_dom_split=True)

# Pass the components separately so os.path.join actually builds the path.
save_path = os.path.join(os.getcwd(), 'data', 'TED2020')
datadict.save_to_disk(save_path)

Flattening the indices:   0%|          | 0/65 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/4 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/4 [00:00<?, ?ba/s]

# CommonCrawl aligned

In [None]:
da_path = '/content/CCAligned.da-en.da'
en_path = '/content/CCAligned.da-en.en'

# Load BOTH sides. With the Danish load disabled, da_list would silently keep
# whatever corpus an earlier cell left in it and no longer match en_list.
with open(da_path, encoding='utf-8') as data:
  da_list = data.readlines()

with open(en_path, encoding='utf-8') as data:
  en_list = data.readlines()

In [None]:
# Guard against pairing sentences from two different (or truncated) files.
assert len(da_list) == len(en_list), (
    f'Parallel corpus is misaligned: {len(da_list)} Danish vs {len(en_list)} English lines')

In [None]:
# Out-of-domain split (60/20/20) with a dev set, built from the currently
# loaded da_list / en_list.
datadict = reformatData(da_list, en_list, dev=True)

# NOTE(review): this cell sits under the "CommonCrawl aligned" heading but the
# target folder is 'OpenSubs' -- confirm which corpus is loaded before saving.
# Pass the components separately so os.path.join actually builds the path.
save_path = os.path.join(os.getcwd(), 'data', 'OpenSubs')



In [None]:
# Write the DatasetDict built in the previous cell to save_path.
datadict.save_to_disk(save_path)

Flattening the indices:   0%|          | 0/8685 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/2895 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/2895 [00:00<?, ?ba/s]