In [None]:
! pip install transformers sacremoses datasets SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 48.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 33.2 MB/s 
[?25hCollecting SentencePiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 41.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 12.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.

In [None]:
import numpy as np
import os
from typing import List
import torch
import datasets
import pyarrow as pa
from datasets import concatenate_datasets, Dataset, DatasetDict

In [None]:
# Colab-only: mount Google Drive so the project folder stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Every later path is built from os.getcwd(), so switch into the project folder first.
os.chdir('/content/drive/MyDrive/MT_final_project')

Mounted at /content/drive


# Define functions for creating scrambled datasets

In [None]:
"""
  - ID: In-domain dataset
  - OOD: Out-of-domain datasets
  - datasetNames: List of dataset names, written to the 'origin' column so each row's source stays identifiable.
                Must be in the format: [ID_name, OOD[0]_name, ..., OOD[N]_name]
  - ratios : List of floats representing the ratio of data from each OOD dataset to be included
"""

def buildDataset(ID: datasets.arrow_dataset.Dataset, OOD: List[datasets.arrow_dataset.Dataset], datasetNames: List[str], ratios: List[float], seed=None):
  """
  Concatenate an in-domain dataset with (subsets of) out-of-domain datasets.

  Every row of the result carries an 'origin' column holding the name of the
  dataset it came from.

  Args:
    ID: In-domain dataset; always included in full.
    OOD: Out-of-domain datasets. The list passed in is not modified.
    datasetNames: Names written to 'origin', in the order
                  [ID_name, OOD[0]_name, ..., OOD[N]_name].
    ratios: One value per OOD dataset, each in (0, 1]: the fraction of that
            dataset to include. A value of 1 includes the whole dataset.
    seed: Optional seed forwarded to train_test_split so the sampled subsets
          can be reproduced. None (the default) leaves the sampling unseeded.

  Returns:
    One Dataset: the ID rows followed by the selected rows of each OOD
    dataset, in the order the OOD datasets were given.

  Raises:
    ValueError: if the number of ratios or names does not match the number of
                OOD datasets, or if a ratio lies outside (0, 1].
  """
  if len(ratios) != len(OOD):
    raise ValueError(f'expected one ratio per OOD dataset, got {len(ratios)} ratios for {len(OOD)} datasets')
  if len(datasetNames) < len(OOD) + 1:
    raise ValueError(f'expected {len(OOD) + 1} dataset names (ID + every OOD), got {len(datasetNames)}')

  parts = [ID.add_column('origin', [datasetNames[0]] * len(ID))]

  for name, ood, ratio in zip(datasetNames[1:], OOD, ratios):
    if not 0 < ratio <= 1:
      raise ValueError(f'ratio for {name!r} must be in (0, 1], got {ratio}')

    # Label a new dataset object instead of writing back into the caller's list.
    labelled = ood.add_column('origin', [name] * len(ood))

    if ratio == 1:
      # Handle "use everything" per dataset: train_test_split would read an
      # integer 1 as "one row" and rejects the float 1.0.
      parts.append(labelled)
    else:
      split = labelled.train_test_split(train_size=float(ratio), seed=seed)
      parts.append(split['train'])

  return concatenate_datasets(parts)


### Wrapper function for buildDataset


In [None]:
def set_custom_dataset(in_domain_name: str, out_domain_list: list, ratios: list):
  '''
  Load the corpora stored under <cwd>/data and build an in-domain /
  out-of-domain training mixture with buildDataset.

  Args:
    in_domain_name: 'Ted' loads data/TED2020; any other value loads
                    data/OpenSubs. The value is also used as the 'origin'
                    label of the in-domain rows.
    out_domain_list: 'origin' labels for the out-of-domain rows. The
                     out-of-domain corpora are fixed, so this must hold
                     exactly two labels in the order [WikiMatrix, CCAligned].
    ratios: Fraction of each out-of-domain train split to include, in the
            same order as out_domain_list.

  Returns:
    The concatenated Dataset produced by buildDataset from the three train
    splits.

  Raises:
    AssertionError: if out_domain_list and ratios differ in length.
    ValueError: if out_domain_list does not contain exactly two labels.
  '''
  assert len(out_domain_list) == len(ratios), 'out_domain_list and ratios must have the same length'

  # Exactly two out-of-domain corpora are loaded below. Any other number of
  # labels would either fail inside buildDataset or leave a label unused.
  if len(out_domain_list) != 2:
    raise ValueError(
      f'expected 2 out-of-domain labels (WikiMatrix, CCAligned), got {len(out_domain_list)}'
    )

  if in_domain_name == 'Ted':
    InDom = datasets.load_from_disk(os.path.join(os.getcwd(), 'data/TED2020'))
    print(InDom)
  else: # in_domain == 'Open_Sub'
    InDom = datasets.load_from_disk(os.path.join(os.getcwd(), 'data/OpenSubs'))

  CC_Aligned = datasets.load_from_disk(os.path.join(os.getcwd(), 'data/CCAligned'))
  WikiMatrix = datasets.load_from_disk(os.path.join(os.getcwd(), 'data/WikiMatrix'))

  labels = [in_domain_name] + out_domain_list

  # Order matters: labels[1] is attached to WikiMatrix and labels[2] to CCAligned.
  out_domain_train = [WikiMatrix['train'], CC_Aligned['train']]

  return buildDataset(InDom['train'], out_domain_train, labels, ratios=ratios)

# Data preprocessing functions
### Pre-trained tokenizer

In [None]:
# Load the tokenizer published with the Helsinki-NLP/opus-mt-da-en checkpoint.
# `preprocess` below reads this module-level `tokenizer`, so this cell must run first.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-da-en")

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [None]:
def removeNewLine(examples):
  """Return a new list in which every newline character has been deleted from each string of `examples`."""
  return [text.replace('\n', '') for text in examples]

In [None]:
def preprocess(batch, max_length=180):
  """
  Tokenize a batch of Danish/English sentence pairs for seq2seq training.

  Args:
    batch: Mapping with the keys 'da' (source sentences) and 'en' (target
           sentences), as passed by Dataset.map(..., batched=True).
    max_length: Length every sequence is padded or truncated to.

  Returns:
    The source encoding (input_ids, attention_mask) with an added 'labels'
    entry holding the token ids of the target sentences.
  """
  # Identical settings on both sides so every row ends up with the same fixed length.
  shared_args = dict(
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
    return_attention_mask=True,
  )

  src = tokenizer(batch['da'], **shared_args)

  # Targets must be encoded in target mode: passing them as `text_target` makes
  # the tokenizer switch to its target-language settings instead of treating the
  # English text as another source sentence.
  trg = tokenizer(text_target=batch['en'], **shared_args)

  # NOTE(review): padded positions in the labels keep the pad token id rather
  # than -100 -- confirm the training code masks them before computing the loss.
  src['labels'] = trg['input_ids']
  return src

In [None]:
# The ratios are roughly len(TED train) / len(OOD train): about 0.165 for WikiMatrix and
# about 0.0067 for CCAligned (see the ratio checks at the end of the notebook).
# NOTE: despite its name, `datadict` is a single Dataset (the concatenation returned by
# buildDataset), not a DatasetDict.
datadict = set_custom_dataset('Ted', ["WikiTrain", "CCTrain"], [.165,.0067])

DatasetDict({
    train: Dataset({
        features: ['da', 'en'],
        num_rows: 64901
    })
    test: Dataset({
        features: ['da', 'en'],
        num_rows: 3606
    })
    dev: Dataset({
        features: ['da', 'en'],
        num_rows: 3606
    })
})


In [None]:
print(datadict)

Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 151238
})


In [None]:
# remove new line
Datadict = datadict.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en']), 'origin': example['origin']}, batched=True)

  0%|          | 0/152 [00:00<?, ?ba/s]

In [None]:
# Largest number of space-separated words in any source / target sentence.
# Take the maximum over the per-sentence word counts: the sentence with the most
# characters is not necessarily the one with the most words.
max_src = max(len(sentence.split(' ')) for sentence in Datadict['da'])
max_trg = max(len(sentence.split(' ')) for sentence in Datadict['en'])

In [None]:
print(Datadict['da'][0:2])

['Temperaturen starter ved 25 Celsius, og det går ned , det går ned. ', 'Og det var, at de mennesker der har en stærk følelse af kærlighed og af at høre til tror på at de er værdige til kærlighed og tilknytning. ']


In [None]:
# necessary such that input_ids and so on are tensors instead of list
Datadict = Datadict.with_format('torch')

In [None]:
TokenizedData = Datadict.map(preprocess, batched=True)

  0%|          | 0/152 [00:00<?, ?ba/s]

In [None]:
# sanity checks
# every row is padded/truncated to the max_length used in preprocess
print(len(TokenizedData['input_ids'][0]))
# with_format('torch') was applied above, so an arbitrary row is inspected for its type
print(type(TokenizedData['input_ids'][1500]))
print(TokenizedData)

180


In [None]:
# Persist the tokenized training mixture under <cwd>/data/TrainMixSmall.
save_path = os.path.join(os.getcwd(), 'data/TrainMixSmall')
TokenizedData.save_to_disk(save_path)

# Create Catastrophic forgetting dataset

In [None]:
# Reload the two out-of-domain corpora from disk.
wiki_matrix_path = os.path.join(os.getcwd(), 'data/WikiMatrix')
cc_aligned = os.path.join(os.getcwd(), 'data/CCAligned')
CC_Aligned = datasets.load_from_disk(cc_aligned)
WikiMatrix = datasets.load_from_disk(wiki_matrix_path)
# NOTE(review): the variable names do not match the splits they hold --
# CC_Aligned_test is the 'dev' split and WikiMatrix_dev is the 'test' split.
# Confirm which splits are intended for the catastrophic-forgetting set.
CC_Aligned_test = CC_Aligned['dev']
WikiMatrix_dev = WikiMatrix['test']


In [None]:
CC_Aligned_test

Dataset({
    features: ['da', 'en'],
    num_rows: 2147722
})

In [None]:
WikiMatrix_dev

Dataset({
    features: ['da', 'en'],
    num_rows: 87211
})

In [None]:
# Draw the same number of rows from each out-of-domain corpus.
# An integer train_size is an absolute row count and therefore exact. A float
# ratio such as 3006 / len(dataset) is rounded down by train_test_split and can
# lose a row to floating-point error (WikiMatrix came out at 3005 rows).
CF_SAMPLES_PER_DOMAIN = 3006
CF_SPLIT_SEED = 42  # fixed so the evaluation subset is the same on every run

# Kept so the sampled fraction of each corpus is still available by name.
cc_ratio = CF_SAMPLES_PER_DOMAIN / len(CC_Aligned_test)
wm_ratio = CF_SAMPLES_PER_DOMAIN / len(WikiMatrix_dev)

cca = CC_Aligned_test.train_test_split(train_size=CF_SAMPLES_PER_DOMAIN, seed=CF_SPLIT_SEED)['train']
wm = WikiMatrix_dev.train_test_split(train_size=CF_SAMPLES_PER_DOMAIN, seed=CF_SPLIT_SEED)['train']

In [None]:
wm

Dataset({
    features: ['da', 'en'],
    num_rows: 3005
})

In [None]:
CF_data = concatenate_datasets([cca, wm])

In [None]:
CF_data_pp = CF_data.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en'])}, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
CF_data_pp = CF_data_pp.with_format('torch')
CF_data_tokenized = CF_data_pp.map(preprocess, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
CF_data_tokenized

Dataset({
    features: ['da', 'en', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6011
})

In [None]:
CF_save_path = os.path.join(os.getcwd(), 'data/CataForgTestDataTokenized')

In [None]:
CF_data_tokenized.save_to_disk(CF_save_path)

# Create tokenized dev set for TED

In [None]:
ted_path = os.path.join(os.getcwd(),'data/TED2020')
TED = datasets.load_from_disk(ted_path)
TEDDev = TED['dev']

In [None]:
TEDDev

Dataset({
    features: ['da', 'en'],
    num_rows: 3606
})

In [None]:
TEDDev = TEDDev.map(lambda example: {'da' : removeNewLine(example['da']), 'en' : removeNewLine(example['en'])}, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Tokenize the TED dev split with the same `preprocess` used for the training mixture.
# NOTE(review): unlike the training and catastrophic-forgetting sets, this dataset is not
# passed through with_format('torch') before map/save -- confirm downstream code does not
# rely on that.
TEDDev = TEDDev.map(preprocess, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
TEDDev_save_path = os.path.join(os.getcwd(),'data/TEDDevTokenized')
TEDDev.save_to_disk(TEDDev_save_path)

# Create tokenized test set for TED

In [None]:
ted_path = os.path.join(os.getcwd(),'data/TED2020')
TED = datasets.load_from_disk(ted_path)
TEDTest = TED['test']

In [None]:
TEDTest

Dataset({
    features: ['da', 'en'],
    num_rows: 3606
})

In [None]:
TEDTest = TEDTest.map(lambda example: {'da' : removeNewLine(example['da']), 'en' : removeNewLine(example['en'])}, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Tokenize the TED test split with the same `preprocess` used for the training mixture.
# NOTE(review): unlike the training and catastrophic-forgetting sets, this dataset is not
# passed through with_format('torch') before map/save -- confirm downstream code does not
# rely on that.
TEDTest = TEDTest.map(preprocess, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Save the tokenized TED test split to its own directory. Writing it to the dev
# path would overwrite the tokenized dev set and leave the test path empty.
TEDTest_save_path = os.path.join(os.getcwd(),'data/TEDTestTokenized')
TEDTest.save_to_disk(TEDTest_save_path)

# Testing buildDataset function

In [None]:
# Reload the corpora from disk and pick the splits used to exercise buildDataset below.
ted_path = os.path.join(os.getcwd(), 'data/TED2020')
wiki_matrix_path = os.path.join(os.getcwd(), 'data/WikiMatrix')
cc_aligned_path = os.path.join(os.getcwd(), 'data/CCAligned')

TED = datasets.load_from_disk(ted_path)
WikiMatrix = datasets.load_from_disk(wiki_matrix_path)
CCAligned = datasets.load_from_disk(cc_aligned_path)
# TED dev acts as the in-domain set; the two WikiMatrix splits act as out-of-domain sets.
TEDdev = TED['dev']
WMdev = WikiMatrix['dev']
WMtest = WikiMatrix['test']

In [None]:
print(len(CCAligned['train'])*.0067)

43169.2122


In [None]:
print(len(TED['train']))

43267


In [None]:
print(len(TED['train'])/len(WikiMatrix['train']))

0.16537476589076175


In [None]:
print(len(TED['train'])/len(CCAligned['train']))

0.006715176979764296


In [None]:
buildDataset(TEDdev, [WMdev, WMtest], ["TED", "WMdev", "WMtest"], ratios=[1,1])

Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 188845
})

In [None]:
data_ratios = buildDataset(TEDdev, [WMdev, WMtest], ["TED", "WMdev", "WMtest"], ratios=[0.5,0.9])
print(data_ratios)


Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 136517
})


In [None]:
# Compare the size of the mixed dataset with the sizes expected from the ratios
# passed to buildDataset above (0.5 for WMdev, 0.9 for WMtest).
# The expected values are unrounded floats, so their sum can be slightly larger
# than the actual row count (recorded run: 136517 rows vs. 136518.4 expected).
print("Ratio data:")
print(f'total length: {len(data_ratios)}')
print("--------------"*3)
print(f"individual lengths adjusted with ratio: \n TED: {len(TEDdev)} \n WMdev: {len(WMdev)*0.5} \n WMtest: {len(WMtest)*0.9} ")

print("total:", len(TEDdev) + len(WMdev)*0.5 + len(WMtest)*0.9)

Ratio data:
total length: 136517
------------------------------------------
individual lengths adjusted with ratio: 
 TED: 14423 
 WMdev: 43605.5 
 WMtest: 78489.90000000001 
total: 136518.40000000002
