In [29]:
!pip install datasets



In [30]:
from datasets import load_dataset

In [31]:
data = load_dataset("ahazeemi/iwslt14-en-fr")
data

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'en', 'fr'],
        num_rows: 179435
    })
    validation: Dataset({
        features: ['id', 'translation', 'en', 'fr'],
        num_rows: 903
    })
    test: Dataset({
        features: ['id', 'translation', 'en', 'fr'],
        num_rows: 3666
    })
})

In [32]:
data['train'], data['train'][0]

(Dataset({
     features: ['id', 'translation', 'en', 'fr'],
     num_rows: 179435
 }),
 {'id': 'docid-1_segid-1',
  'translation': {'en': 'It can be a very complicated thing, the ocean.',
   'fr': "Ca peut être très compliqué, l'océan."},
  'en': 'It can be a very complicated thing, the ocean.',
  'fr': "Ca peut être très compliqué, l'océan."})

### Tokenizers

In [33]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [34]:
sample = "Where is himalayas in the world map?"
sample2 = "Where is himalayaas in the world map?"
encoding = tokenizer.encode(sample)
encoding2 = tokenizer.encode(sample2)
encoding, encoding2

([101, 2073, 2003, 26779, 1999, 1996, 2088, 4949, 1029, 102],
 [101, 2073, 2003, 2032, 7911, 3148, 3022, 1999, 1996, 2088, 4949, 1029, 102])

In [35]:
tokenizer.convert_ids_to_tokens(encoding), tokenizer.convert_ids_to_tokens(encoding2)

(['[CLS]',
  'where',
  'is',
  'himalayas',
  'in',
  'the',
  'world',
  'map',
  '?',
  '[SEP]'],
 ['[CLS]',
  'where',
  'is',
  'him',
  '##ala',
  '##ya',
  '##as',
  'in',
  'the',
  'world',
  'map',
  '?',
  '[SEP]'])

In [36]:
q1 = "Where is himalayas situated?"
a1 = "Himalayas are situated in Nepal, and is the highest mountain peak on the earth."

encoding = tokenizer.encode_plus(q1, a1)

# input_ids: token considering join of q1 and a1
# token_type_ids: 0: 1st sentence, 1: second sentence
# attention_mask: attention with masking(padding)
print(encoding)

{'input_ids': [101, 2073, 2003, 26779, 4350, 1029, 102, 26779, 2024, 4350, 1999, 8222, 1010, 1998, 2003, 1996, 3284, 3137, 4672, 2006, 1996, 3011, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [37]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'])

['[CLS]',
 'where',
 'is',
 'himalayas',
 'situated',
 '?',
 '[SEP]',
 'himalayas',
 'are',
 'situated',
 'in',
 'nepal',
 ',',
 'and',
 'is',
 'the',
 'highest',
 'mountain',
 'peak',
 'on',
 'the',
 'earth',
 '.',
 '[SEP]']

In [38]:
q2 = "How is himalayas?"
a2 = "Mysterious and white."

encoding = tokenizer.encode_plus([q1, q2], [a1, a2])
encoding

{'input_ids': [101, 100, 100, 102, 100, 100, 102], 'token_type_ids': [0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [39]:
tokenizer([q1, q2], [a1, a2])

{'input_ids': [[101, 2073, 2003, 26779, 4350, 1029, 102, 26779, 2024, 4350, 1999, 8222, 1010, 1998, 2003, 1996, 3284, 3137, 4672, 2006, 1996, 3011, 1012, 102], [101, 2129, 2003, 26779, 1029, 102, 8075, 1998, 2317, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [40]:
# Adding padding (Adds 0 to the right)
# Model will only focus where attention mask is 1

tokenizer([q1, q2], [a1, a2], padding=True)

{'input_ids': [[101, 2073, 2003, 26779, 4350, 1029, 102, 26779, 2024, 4350, 1999, 8222, 1010, 1998, 2003, 1996, 3284, 3137, 4672, 2006, 1996, 3011, 1012, 102], [101, 2129, 2003, 26779, 1029, 102, 8075, 1998, 2317, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [41]:
encoding = tokenizer.batch_encode_plus([q1, q2], [a1, a2], padding=True)
encoding

{'input_ids': [[101, 2073, 2003, 26779, 4350, 1029, 102], [101, 2129, 2003, 26779, 1029, 102, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]]}

In [42]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]), 
tokenizer.convert_ids_to_tokens(encoding['input_ids'][1])

['[CLS]', 'how', 'is', 'himalayas', '?', '[SEP]', '[PAD]']

In [43]:
# Distilbert tokenizer

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer.batch_encode_plus([q1, q2], [a1, a2], padding=True)

{'input_ids': [[101, 2073, 2003, 26779, 4350, 1029, 102], [101, 2129, 2003, 26779, 1029, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]]}

In [47]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]', 'where', 'is', 'himalayas', 'situated', '?', '[SEP]']

## Text tokenization

In [48]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [49]:
def simple_tokenizer(text):
    return text.lower().split()

# Tokenize the dataset
def tokenize_data(batch):
    # return_tensors="pt": returns pytorch tensors
    # padding=True: adds padding to the attention mask
    # truncation=True: truncate to a maximum length specified by the max_length argument or the maximum length accepted by the model if no max_length is provided

    # Tokenized source
    tokenized_src = tokenizer(batch['en'], padding=True, truncation=True, return_tensors="pt")

    # Tokenized target
    tokenized_tgt = [simple_tokenizer(sentence) for sentence in batch['fr']]

    return {
        'input_ids': tokenized_src['input_ids'],
        'attention_mask': tokenized_src['attention_mask'],
        'labels': tokenized_tgt
    }

tokenized_data = data.map(tokenize_data, batched=True, remove_columns=['en', 'fr'])

Map:   0%|          | 0/179435 [00:00<?, ? examples/s]

Map:   0%|          | 0/903 [00:00<?, ? examples/s]

Map:   0%|          | 0/3666 [00:00<?, ? examples/s]

In [51]:
tokenized_data,data

(DatasetDict({
     train: Dataset({
         features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
         num_rows: 179435
     })
     validation: Dataset({
         features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
         num_rows: 903
     })
     test: Dataset({
         features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
         num_rows: 3666
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['id', 'translation', 'en', 'fr'],
         num_rows: 179435
     })
     validation: Dataset({
         features: ['id', 'translation', 'en', 'fr'],
         num_rows: 903
     })
     test: Dataset({
         features: ['id', 'translation', 'en', 'fr'],
         num_rows: 3666
     })
 }))

In [52]:
tokenized_data['train'][0], data['train'][0]

({'id': 'docid-1_segid-1',
  'translation': {'en': 'It can be a very complicated thing, the ocean.',
   'fr': "Ca peut être très compliqué, l'océan."},
  'input_ids': [101,
   2009,
   2064,
   2022,
   1037,
   2200,
   8552,
   2518,
   1010,
   1996,
   4153,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   0,
   0,

In [53]:
tokenized_data['train'][0].keys()

dict_keys(['id', 'translation', 'input_ids', 'attention_mask', 'labels'])

**Now the data is tokenized and we need to make the model.**

# Defining models

### Defining encoder
### Defining decoder
### Defining Seq2Seq

In [55]:
import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

encoder = BertTokenizer.from_pretrained("bert-base-uncased").to(device)

class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1):
        super(Decoder, self).__init__() # initialize nn.Module part of Decoder
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, cell):
        embedded = self.embedding()

device(type='cpu')