In [1]:
import os
import re

In [2]:
DIR_path = "D:\MSc Data Science\Advanced Modules\[INF-DSAM1B] Advanced Machine Learning B\Deep learning for NLP\Project\Machine translation with attention"
english_data_path = "Data\\es-en"
spanish_data_path = "Data\\es-en"

In [3]:
# 'utf-8' removes b'' character string literal
# splitlines() remove newline character
with open(os.path.join(DIR_path, english_data_path, "europarl-v7.es-en.en"), "rb") as f:
    content_english = f.read().decode("utf-8").splitlines()

#num_eng_words = 0
#for i in content_english:
#    num_eng_words += len(i.split(" "))
#print("Number of english words: ", num_eng_words)

In [4]:
with open(os.path.join(DIR_path, spanish_data_path, "europarl-v7.es-en.es"), "rb") as f:
    content_spanish = f.read().decode("utf-8").splitlines()

#num_spn_words = 0
#for i in content_spanish:
#    num_spn_words += len(i.split(" "))
#print("Number of spanish words: ", num_spn_words)

In [5]:
def sent_preprocess(sentence):
    sentence=sentence.lower()             
    sentence = re.sub(r"[-,.!?()]+", r"", sentence)
    return sentence

_patterns = [r'\'',
             r'\"',
             r'\.',
             r'<br \/>',
             r',',
             r'\(',
             r'\)',
             r'\!',
             r'\?',
             r'\;',
             r'\:',
             r'\s+']

_replacements = [' \'  ',
                 '',
                 ' . ',
                 ' ',
                 ' , ',
                 ' ( ',
                 ' ) ',
                 ' ! ',
                 ' ? ',
                 ' ',
                 ' ',
                 ' ']

_patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))

def sentence_preprocess(sentence):
    """https://pytorch.org/text/_modules/torchtext/data/utils.html"""
    sentence = sentence.lower()
    for pattern_re, replaced_str in _patterns_dict:
        sentence = pattern_re.sub(replaced_str, sentence)
    
    return sentence

In [6]:
# preprocess the english sentence
sentence_english = []
for sent in content_english:
    sentence_english.append(sentence_preprocess(sent))
print("total english sentences: ", len(sentence_english))

total english sentences:  1965734


In [7]:
# preprocess the spanish sentence
sentence_spanish = []
for sent in content_spanish:
    sentence_spanish.append(sentence_preprocess(sent))
print("total spanish sentences: ", len(sentence_spanish))

total spanish sentences:  1965734


### Tokenize the data

In [8]:
import torch
import nltk
from tqdm import tqdm


In [9]:
# Loop over wach of the sentence and tokenize eacch sentenec separately.
# will take some time to tokenize each sentence.
english_tokenized_text = [ nltk.word_tokenize(sentence_english[i], language="english") for i in tqdm(range(len(sentence_english[:500]))) ]

# create word index
# assign each word a number.
word_to_index = {}
words=[]
for sentence in english_tokenized_text:
    for word in sentence:
        words.append(word)
UNIQUE_WORDS = set(words)

for index, word in enumerate(UNIQUE_WORDS):
    word_to_index[word] = index

# add tokens: <SOS> and <EOS>
word_to_index["<SOS>"] = list(word_to_index.values())[-1] + 1
word_to_index["<EOS>"] = list(word_to_index.values())[-1] + 1

# using word index, create tensor
# convert each of the sentence into numbers.
english_tokenized_tensor = []

for sentence in english_tokenized_text:
    #english_tokenized_tensor.append( [word_to_index[word] for word in sentence]  )
    tensor_list=[]
    tensor_list.append(word_to_index["<SOS>"])
    tensor_list = tensor_list + [word_to_index[word] for word in sentence]
    tensor_list.append(word_to_index["<EOS>"])
    english_tokenized_tensor.append(tensor_list)
    
data_english=[]
for i in range(len(english_tokenized_text)):
    data_english.append({"TOKENIZED WORD": english_tokenized_text[i] , 
             "TENSOR": torch.tensor(english_tokenized_tensor[i], dtype=torch.long), 
             "LANGUAGE": "ENGLISH"})

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 3130.69it/s]


In [13]:
# this is how we access the data
###data_english[0]["TENSOR"]
data_english[0:3]

[{'TOKENIZED WORD': ['resumption', 'of', 'the', 'session'],
  'TENSOR': tensor([2348, 1510,  959,  971, 2224, 2349]),
  'LANGUAGE': 'ENGLISH'},
 {'TOKENIZED WORD': ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999',
   ',',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period',
   '.'],
  'TENSOR': tensor([2348,  424, 2152, 1245,  971, 2224,  959,  971, 1084,  754, 2125,  963,
           943,  196,  402,   39, 1934, 1044,  424, 1227,  891,  854,  796, 1966,
          1886,  847,  918, 1733,   93, 2187, 1148,  971, 1317, 2075,  847,  829,
           918, 1606, 2214, 2082,  616, 2349]),
  'LANGUAGE': 'ENGLISH'},
 {'TOKENIZED WORD': ['although',
   ',',
   'as',
   'you

In [11]:
# Loop over wach of the sentence and tokenize eacch sentenec separately.
# will take some time to tokenize each sentence.
spanish_tokenized_text = [ nltk.word_tokenize(sentence_spanish[i], language="spanish") for i in tqdm(range(len(sentence_spanish[:500]))) ]

# create word index
# assign each word a number.
word_to_index = {}
words=[]
for sentence in spanish_tokenized_text:
    for word in sentence:
        words.append(word)
UNIQUE_WORDS = set(words)

for index, word in enumerate(UNIQUE_WORDS):
    word_to_index[word] = index

# add tokens: <SOS> and <EOS>
word_to_index["<SOS>"] = list(word_to_index.values())[-1] + 1
word_to_index["<EOS>"] = list(word_to_index.values())[-1] + 1

# using word index, create tensor
# convert each of the sentence into numbers.
spanish_tokenized_tensor = []

for sentence in spanish_tokenized_text:
    
    tensor_list=[]
    tensor_list.append(word_to_index["<SOS>"])
    tensor_list = tensor_list + [word_to_index[word] for word in sentence]
    tensor_list.append(word_to_index["<EOS>"])
    spanish_tokenized_tensor.append(tensor_list)
    
data_spanish=[]
for i in range(len(spanish_tokenized_text)):
    data_spanish.append({"TOKENIZED WORD": spanish_tokenized_text[i] , 
             "TENSOR": torch.tensor(spanish_tokenized_tensor[i], dtype=torch.long), 
             "LANGUAGE": "ENGLISH"})

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 2653.20it/s]


In [14]:
data_spanish[0:3]

[{'TOKENIZED WORD': ['reanudación', 'del', 'período', 'de', 'sesiones'],
  'TENSOR': tensor([2854, 2018, 1057,  915,  909,  307, 2855]),
  'LANGUAGE': 'ENGLISH'},
 {'TOKENIZED WORD': ['declaro',
   'reanudado',
   'el',
   'período',
   'de',
   'sesiones',
   'del',
   'parlamento',
   'europeo',
   ',',
   'interrumpido',
   'el',
   'viernes',
   '17',
   'de',
   'diciembre',
   'pasado',
   ',',
   'y',
   'reitero',
   'a',
   'sus',
   'señorías',
   'mi',
   'deseo',
   'de',
   'que',
   'hayan',
   'tenido',
   'unas',
   'buenas',
   'vacaciones',
   '.'],
  'TENSOR': tensor([2854, 1722, 2624, 1694,  915,  909,  307, 1057, 2756, 1499, 2369,   27,
          1694,  552,  223,  909, 1663,  234, 2369, 1426,  141, 1133,  895, 2170,
          2647, 2260,  909, 2785,  209,  577, 1044, 1819, 2819,  775, 2855]),
  'LANGUAGE': 'ENGLISH'},
 {'TOKENIZED WORD': ['como',
   'todos',
   'han',
   'podido',
   'comprobar',
   ',',
   'el',
   'gran',
   'efecto',
   'del',
   'año',
   '200