In [None]:
! pip install transformers sacremoses datasets SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 24.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 63.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 82.6 MB/s 
[?25hCollecting SentencePiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 15.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 42.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux20

In [None]:
import numpy as np
import os
from typing import List
import torch
import datasets
import pyarrow as pa
from datasets import concatenate_datasets, Dataset, DatasetDict

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')


# Later cells build their data paths from os.getcwd(), so the working
# directory has to be the project root before anything is loaded or saved.
os.chdir('/content/drive/MyDrive/MT_final_project')

Mounted at /content/drive


# Define functions for creating scrambled datasets

In [None]:
def buildDataset(ID: datasets.arrow_dataset.Dataset, OOD: List[datasets.arrow_dataset.Dataset], datasetNames: List[str], ratios: List[float], seed=None):
  """
  Concatenate an in-domain dataset with (sub-samples of) out-of-domain datasets.

  Every row of the result carries an extra 'origin' column holding the name of
  the dataset it came from.

  - ID: In-domain dataset (always included in full)
  - OOD: Out-of-domain datasets
  - datasetNames: List of dataset names to ensure identifiability
                  Should be on the format: [ID_name, OOD[0]_name, ... OOD[N]_name]
  - ratios: List of floats representing the ratio of data from each OOD dataset
            to be included. A ratio equal to 1 keeps the whole dataset; any
            other value is passed to `train_test_split` as `train_size`.
  - seed: Optional seed forwarded to `train_test_split` so the sub-sampling is
          reproducible. The default (None) keeps the previous unseeded behaviour.

  Returns the concatenated dataset.
  Raises ValueError if the number of OOD datasets, ratios and names disagree.
  """
  if len(OOD) != len(ratios):
    raise ValueError(
        "Expected one ratio per OOD dataset, got %d datasets and %d ratios" % (len(OOD), len(ratios)))
  if len(datasetNames) < len(OOD) + 1:
    raise ValueError(
        "Expected at least %d dataset names (ID + each OOD), got %d" % (len(OOD) + 1, len(datasetNames)))

  # Tag the in-domain data with its name.
  parts = [ID.add_column('origin', [datasetNames[0]] * len(ID))]

  # Tag every OOD dataset and keep the requested share of it. The tagged
  # datasets are new objects; the caller's OOD list is left untouched so the
  # same list can be passed again without hitting a duplicate 'origin' column.
  for name, ood, ratio in zip(datasetNames[1:], OOD, ratios):
    tagged = ood.add_column('origin', [name] * len(ood))
    if ratio == 1:
      parts.append(tagged)
    else:
      split = tagged.train_test_split(train_size=ratio, seed=seed)
      parts.append(split['train'])

  return concatenate_datasets(parts)


### Wrapper function for buildDataset


In [None]:
from copy import deepcopy

def set_custom_dataset(in_domain_name: str, out_domain_list: list, ratios: list, indom_over_sample = 0):
  '''
  Load the datasets from disk and build an in-domain / out-of-domain mixture.

  - in_domain_name: 'Ted' or 'Ted_prepended'; any other value loads OpenSubs.
  - out_domain_list: names for the out-of-domain parts, which are always
                     WikiMatrix train, WikiMatrix dev and CCAligned train
                     (in that order).
  - ratios: one ratio per entry of out_domain_list.
  - indom_over_sample: number of extra full copies of the in-domain training
                       split to append (up-sampling).

  Prints the list of non-primary datasets together with the final ratios and
  returns the result of buildDataset.
  '''
  assert len(out_domain_list) == len(ratios)

  project_root = os.getcwd()
  data_dirs = {
      'ted': os.path.join(project_root, 'data/TED2020'),
      'ted_prepended': os.path.join(project_root, 'data/TED_prepended'),
      'open_subs': os.path.join(project_root, 'data/OpenSubs'),
      'wiki_matrix': os.path.join(project_root, 'data/WikiMatrix'),
      'cc_aligned': os.path.join(project_root, 'data/CCAligned'),
  }

  # Pick the in-domain corpus; anything that is not a TED variant is OpenSubs.
  uses_prepended_ted = in_domain_name == 'Ted_prepended'
  if in_domain_name == 'Ted':
    in_domain_dir = data_dirs['ted']
  elif uses_prepended_ted:
    in_domain_dir = data_dirs['ted_prepended']
  else:
    in_domain_dir = data_dirs['open_subs']

  in_domain = datasets.load_from_disk(in_domain_dir)
  cc_aligned = datasets.load_from_disk(data_dirs['cc_aligned'])
  wiki_matrix = datasets.load_from_disk(data_dirs['wiki_matrix'])

  # TED_prepended is stored already tokenized; drop those columns so that only
  # the raw text columns remain.
  in_domain_train = in_domain['train']
  if uses_prepended_ted:
    in_domain_train = in_domain_train.remove_columns(['input_ids', 'attention_mask', 'labels'])

  # Up-sampling: each extra copy is treated like one more dataset with ratio 1.
  extra_copies, extra_ratios, extra_names = [], [], []
  if indom_over_sample > 0:
    for copy_idx in range(indom_over_sample):
      extra_copies.append(deepcopy(in_domain_train))
      extra_ratios.append(1)
      extra_names.append("in_doms_over_sample_%s" % (copy_idx + 1))

  other_data_sets = [wiki_matrix['train'], wiki_matrix['dev'], cc_aligned['train']] + extra_copies
  labels = [in_domain_name] + out_domain_list + extra_names
  ratios_final = ratios + extra_ratios

  print(other_data_sets, ratios_final)
  return buildDataset(in_domain_train, other_data_sets, labels, ratios=ratios_final)

# Data preprocessing functions
### pre-trained tokenizer used

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the pre-trained Helsinki-NLP/opus-mt-da-en checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-da-en")
# Register '[CONV]' (the domain tag that add_domain_token prepends) as an
# additional special token.
special_tokens_dict = {'additional_special_tokens': ['[CONV]']}
# NOTE(review): a model trained on this data presumably needs its embedding
# matrix resized for the new token — confirm in the training code.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [None]:
def removeNewLine(examples):
  """Return a list with every newline character removed from each string in `examples`."""
  return [text.replace('\n', '') for text in examples]

In [None]:
def add_domain_token(examples, token='[CONV]'):
    """
    Prepend a domain tag to every sentence.

    - examples: iterable of strings.
    - token: the tag to prepend; defaults to '[CONV]'. It is followed by a
             single space.

    Returns a new list; `examples` is not modified.
    """
    prefix = token + ' '
    return [prefix + e for e in examples]

In [None]:
def preprocess(batch, max_length=180):
  """
  Tokenize a batch of Danish/English sentence pairs for seq2seq training.

  - batch: mapping with the keys 'da' (source sentences) and 'en' (target
           sentences).
  - max_length: both sides are truncated and padded to exactly this many
                tokens (default 180).

  Returns the tokenizer output with 'input_ids' and 'attention_mask' for the
  source side and 'labels' holding the token ids of the target side.
  """
  # Passing the targets through `text_target` makes the tokenizer encode them
  # in target mode instead of treating English text as source-language input.
  # `padding='max_length'` already covers what the deprecated
  # `pad_to_max_length` flag did, so that flag is gone.
  encoded = tokenizer(batch['da'],
                      text_target=batch['en'],
                      padding='max_length',
                      truncation=True,
                      max_length=max_length,
                      return_tensors='pt',
                      return_attention_mask=True)
  return encoded

In [None]:
# Create mixed dataset(s)
#datadict = set_custom_dataset('Ted_prepended', ["WikiTrain", "WikiDev", "CCTrain"], [1, 1, .25], indom_over_sample=4)
# Curriculum step with half of the out-of-domain ratios of the (commented-out) full mix above.
datadict_2 = set_custom_dataset('Ted_prepended', ["WikiTrain", "WikiDev", "CCTrain"], [0.5, 0.5, 0.125], indom_over_sample=4) # curriculum: half
# Curriculum step with a quarter of the Wiki ratios of the full mix.
# NOTE(review): 0.06125 is not half of 0.125 (that would be 0.0625) — confirm whether this is a typo.
datadict_3 = set_custom_dataset('Ted_prepended', ["WikiTrain", "WikiDev", "CCTrain"], [0.25, 0.25, 0.06125], indom_over_sample=4) # curriculum: quarter

[Dataset({
    features: ['da', 'en'],
    num_rows: 261630
}), Dataset({
    features: ['da', 'en'],
    num_rows: 87211
}), Dataset({
    features: ['da', 'en'],
    num_rows: 6443166
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
})] [0.5, 0.5, 0.125, 1, 1, 1, 1]
[Dataset({
    features: ['da', 'en'],
    num_rows: 261630
}), Dataset({
    features: ['da', 'en'],
    num_rows: 87211
}), Dataset({
    features: ['da', 'en'],
    num_rows: 6443166
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
}), Dataset({
    features: ['da', 'en'],
    num_rows: 64901
})] [0.25, 0.25, 0.06125, 1, 1, 1, 1]


In [None]:
 #datadict
ted_prepended_path = os.path.join(os.getcwd(), 'data/TED_prepended')
tedprepend = datasets.load_from_disk(ted_prepended_path)
x = tedprepend['train'].remove_columns(['input_ids', 'attention_mask', 'labels'])
x['en'][0]

'[CONV] So, the temperature starts at 25 centigrade, and down it goes, down it goes. '

In [None]:
# NOTE(review): `datadict` is only defined when the commented-out
# set_custom_dataset(...) call in the dataset-creation cell is re-enabled;
# on a fresh kernel this cell raises NameError.
datadict

Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 2284137
})

In [None]:
datadict_2

Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 1304320
})

In [None]:
datadict_3

Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 806357
})

In [None]:
def count_percentages(dataset_item: datasets.arrow_dataset.Dataset):
  """
  Print the share of rows that come from TED, WikiMatrix and CCAligned.

  Rows are classified by a case-insensitive substring match on the 'origin'
  column: 'wiki' -> Wiki; 'ted' or 'in_doms_over' (the up-sampled in-domain
  copies) -> TED; 'cc' -> CC. The three fractions are printed in the order
  TED, Wiki, CC. An empty dataset prints three zeros.

  Raises IndexError if an origin label matches none of the known sources.
  """
  total = len(dataset_item)
  wiki_c, ted_c, cc_c = 0, 0, 0
  for item in dataset_item['origin']:
    item = item.lower()
    if 'wiki' in item:
      wiki_c += 1
    elif 'ted' in item or 'in_doms_over' in item:
      ted_c += 1
    elif 'cc' in item:
      cc_c += 1
    else:
      raise IndexError("unrecognised origin label: %r" % item)

  # Check the partition on the integer counts: comparing the sum of the float
  # fractions with 1 can fail purely because of rounding.
  assert ted_c + wiki_c + cc_c == total

  if total == 0:
    # Nothing to divide by; report empty shares instead of crashing.
    ted_pct, wiki_pct, cc_pct = 0.0, 0.0, 0.0
  else:
    ted_pct, wiki_pct, cc_pct = ted_c/total, wiki_c/total, cc_c/total
  print(ted_pct, wiki_pct, cc_pct)

count_percentages(datadict_2)
count_percentages(datadict_3)

0.24879247423945045 0.13372485279685967 0.6174826729636899
0.40243341348807044 0.10815184837485134 0.48941473813707825


In [None]:
# Tokenize datadict with CONV token

# remove new line
#Datadict = datadict.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en'])}, batched=True)
# add domain token
#Datadict = Datadict.map(lambda example: {'da' : add_domain_token(example['da']), 'en': add_domain_token(example['en'])}, batched=True)
# necessary such that input_ids and so on are tensors instead of list
#Datadict = Datadict.with_format('torch')
#TokenizedData = Datadict.map(preprocess, batched=True)

In [None]:
# Output directories for the tokenized training mixes.
save_path = os.path.join(os.getcwd(), 'data/TrainMixLDataLargeCONV')
save_path_2 = os.path.join(os.getcwd(), 'data/TrainMixLDataLargeCONVCirric2')
# The third mix gets its own directory. It used to share the '...Cirric2' path
# with the second mix, so both datasets were written to the same location.
save_path_3 = os.path.join(os.getcwd(), 'data/TrainMixLDataLargeCONVCirric3')


#TokenizedData.save_to_disk(save_path)

In [None]:
# Tokenize the TED test split and store it on disk.
TEDTest = datasets.load_from_disk(os.path.join(os.getcwd() + '/data/TED2020'))['test']
save_path_test = os.path.join(os.getcwd() + '/data/TEDtestTokenized')

# Strip newlines from both language columns.
# NOTE(review): no '[CONV]' domain token is added here, unlike in the cell
# that builds TED_prepended — confirm that this is intended.
TEDTest = TEDTest.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en'])}, batched=True)
# Return torch tensors instead of Python lists when the dataset is indexed.
TEDTest = TEDTest.with_format('torch')

TokenizedData_test = TEDTest.map(preprocess, batched=True)

TokenizedData_test.save_to_disk(save_path_test)




  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Strip newlines from the full mix (disabled together with `datadict`).
#Datadict = datadict.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en']), 'origin': example['origin']}, batched=True)
# Needed so that input_ids etc. come back as tensors instead of lists.
#Datadict = Datadict.with_format('torch')

# Second mix: strip newlines from both language columns; 'origin' is passed through unchanged.
Datadict_2 = datadict_2.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en']), 'origin': example['origin']}, batched=True)
# Needed so that input_ids etc. come back as tensors instead of lists.
Datadict_2 = Datadict_2.with_format('torch')


# Third mix: same treatment as the second.
Datadict_3 = datadict_3.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en']), 'origin': example['origin']}, batched=True)
# Needed so that input_ids etc. come back as tensors instead of lists.
Datadict_3 = Datadict_3.with_format('torch')


  0%|          | 0/1305 [00:00<?, ?ba/s]

  0%|          | 0/807 [00:00<?, ?ba/s]

In [None]:
#TokenizedData = Datadict.map(preprocess, batched=True)

TokenizedData_2 = Datadict_2.map(preprocess, batched=True)
TokenizedData_3 = Datadict_3.map(preprocess, batched=True)

  0%|          | 0/1305 [00:00<?, ?ba/s]

  0%|          | 0/807 [00:00<?, ?ba/s]

In [None]:
# NOTE(review): `TokenizedData` is only assigned in the "Add domain tokens to
# TED Dev" section further down (its earlier definitions are commented out),
# so this cell raises NameError when the notebook is run top to bottom.
print(len(TokenizedData['input_ids'][0]))

NameError: ignored

In [None]:
print(type(TokenizedData['input_ids'][1500]))

<class 'torch.Tensor'>


In [None]:
print(TokenizedData_2)

Dataset({
    features: ['da', 'en', 'origin', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1304320
})


In [None]:
# Earlier experiment (disabled): build a mix from plain TED and save it to `save_path`.
#datadict = set_custom_dataset('Ted', ["WikiTrain", "CCTrain"], [1,1])
# NOTE(review): this reassigns `save_path` to 'TrainMixDataLargeCONV', which
# differs from the 'TrainMixLDataLargeCONV' set in the earlier save-path cell
# — confirm which directory name is intended.
save_path = os.path.join(os.getcwd() + '/data/TrainMixDataLargeCONV')


#TokenizedData.save_to_disk(save_path)

# save_path_2 / save_path_3 come from the earlier save-path cell.
# NOTE(review): they must point to different directories, otherwise the second
# save below writes over the first.
TokenizedData_2.save_to_disk(save_path_2)
TokenizedData_3.save_to_disk(save_path_3)

# Add domain tokens to TED Dev

In [None]:
ted_path = os.path.join(os.getcwd(),'data/TED2020')
TED = datasets.load_from_disk(ted_path)

DatasetDict({
    train: Dataset({
        features: ['da', 'en'],
        num_rows: 64901
    })
    test: Dataset({
        features: ['da', 'en'],
        num_rows: 3606
    })
    dev: Dataset({
        features: ['da', 'en'],
        num_rows: 3606
    })
})

In [None]:
# TED is a DatasetDict (train / test / dev), so each step below is applied to every split.
# Strip newlines from both language columns.
Datadict = TED.map(lambda example: {'da' : removeNewLine(example['da']), 'en': removeNewLine(example['en'])}, batched=True)
# Prepend the '[CONV]' domain token to both the Danish and the English side.
Datadict = Datadict.map(lambda example: {'da' : add_domain_token(example['da']), 'en': add_domain_token(example['en'])}, batched=True)
# Needed so that input_ids etc. come back as tensors instead of lists.
Datadict = Datadict.with_format('torch')
# Adds input_ids, attention_mask and labels to every split.
TokenizedData = Datadict.map(preprocess, batched=True)



  0%|          | 0/65 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
TokenizedData.save_to_disk('/content/drive/MyDrive/MT_final_project/data/TED_prepended')

# Testing buildDataset function

In [None]:
ted_path = os.path.join(os.getcwd(), 'data/TED2020')
wiki_matrix_path = os.path.join(os.getcwd(), 'data/WikiMatrix')
cc_aligned_path = os.path.join(os.getcwd(), 'data/CCAligned')

TED = datasets.load_from_disk(ted_path)
WikiMatrix = datasets.load_from_disk(wiki_matrix_path)
CCAligned = datasets.load_from_disk(cc_aligned_path)
TEDdev = TED['dev']
WMdev = WikiMatrix['dev']
WMtest = WikiMatrix['test']

In [None]:
print(len(CCAligned['train'])*.0067)

43169.2122


In [None]:
print(len(TED['train']))

43267


In [None]:
print(len(TED['train'])/len(WikiMatrix['train']))

0.16537476589076175


In [None]:
print(len(TED['train'])/len(CCAligned['train']))

0.006715176979764296


In [None]:
buildDataset(TEDdev, [WMdev, WMtest], ["TED", "WMdev", "WMtest"], ratios=[1,1])

Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 188845
})

In [None]:
data_ratios = buildDataset(TEDdev, [WMdev, WMtest], ["TED", "WMdev", "WMtest"], ratios=[0.5,0.9])
print(data_ratios)


Dataset({
    features: ['da', 'en', 'origin'],
    num_rows: 136517
})


In [None]:
print("Ratio data:")
print(f'total length: {len(data_ratios)}')
print("--------------"*3)
print(f"individual lengths adjusted with ratio: \n TED: {len(TEDdev)} \n WMdev: {len(WMdev)*0.5} \n WMtest: {len(WMtest)*0.9} ")

print("total:", len(TEDdev) + len(WMdev)*0.5 + len(WMtest)*0.9)

Ratio data:
total length: 136517
------------------------------------------
individual lengths adjusted with ratio: 
 TED: 14423 
 WMdev: 43605.5 
 WMtest: 78489.90000000001 
total: 136518.40000000002
