Data preprocessing

In [1]:
#Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import pickle

2023-08-01 16:50:11.319322: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the data: unpickle the raw sentence pairs and label the two columns.

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the previous bare open() left it open until garbage collection).
with open("../../../../../data/raw/translator-data/en-es/english-spanish.pkl", 'rb') as raw_file:
    pkl_data = pickle.load(raw_file)
data = pd.DataFrame(pkl_data, columns=['english', 'spanish'])

In [3]:
# Preview the first rows of the english/spanish DataFrame.
data.head()

Unnamed: 0,english,spanish
0,go,ve
1,go,vete
2,go,vaya
3,go,vayase
4,hi,hola


In [4]:
# Spot-check a single row (index 122000): Spanish text first, then English.
for column in ('spanish', 'english'):
    print(data[column][122000])

solo porque a el le gusta pintar no significa que sea bueno para ello
just because he likes painting doesnt mean hes good at painting


In [5]:
# Tokenizers: build one subword encoder per language from that language's
# sentence column, each with target_vocab_size=2**13 (English first, then Spanish).
tokenizer_en, tokenizer_es = (
    tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        data[language], target_vocab_size=2**13)
    for language in ('english', 'spanish')
)

In [6]:
# Vocabulary sizes: each tokenizer's own vocab_size plus two extra ids
# (8401 for English and 8084 for Spanish in the recorded run).
VOCAB_SIZE_EN, VOCAB_SIZE_ES = (
    tokenizer.vocab_size + 2 for tokenizer in (tokenizer_en, tokenizer_es)
)

In [7]:
# Report both vocabulary sizes, English first.
for caption, size in (
    ('English vocabulary size', VOCAB_SIZE_EN),
    ('Spanish vocabulary size', VOCAB_SIZE_ES),
):
    print(caption, size)

English vocabulary size 8401
Spanish vocabulary size 8084


In [8]:
# Encode every sentence and wrap it with two marker ids taken from the top of
# the enlarged vocabulary: VOCAB_SIZE - 2 is prepended, VOCAB_SIZE - 1 appended.
start_en, end_en = VOCAB_SIZE_EN - 2, VOCAB_SIZE_EN - 1
start_es, end_es = VOCAB_SIZE_ES - 2, VOCAB_SIZE_ES - 1

english_data = [
    [start_en, *tokenizer_en.encode(sentence), end_en]
    for sentence in data['english']
]
spanish_data = [
    [start_es, *tokenizer_es.encode(sentence), end_es]
    for sentence in data['spanish']
]

In [9]:
# Inspect the encoded English sentences. At this point english_data is a plain
# Python list, so this displays every sentence in full.
# NOTE(review): consider a slice such as english_data[:5] to keep the output small.
english_data

[[8399, 228, 8400],
 [8399, 228, 8400],
 [8399, 228, 8400],
 [8399, 228, 8400],
 [8399, 2718, 8400],
 [8399, 1386, 8400],
 [8399, 1386, 8400],
 [8399, 1386, 8400],
 [8399, 1386, 8400],
 [8399, 1386, 8400],
 [8399, 6524, 8400],
 [8399, 4212, 8262, 8400],
 [8399, 616, 8400],
 [8399, 616, 8400],
 [8399, 616, 8400],
 [8399, 236, 8400],
 [8399, 236, 8400],
 [8399, 236, 8400],
 [8399, 5719, 8400],
 [8399, 5719, 8400],
 [8399, 938, 8400],
 [8399, 938, 8400],
 [8399, 938, 8400],
 [8399, 1097, 8400],
 [8399, 1097, 8400],
 [8399, 42, 273, 8400],
 [8399, 42, 273, 8400],
 [8399, 6948, 8400],
 [8399, 1101, 8400],
 [8399, 1101, 8400],
 [8399, 1101, 8400],
 [8399, 2, 7754, 8400],
 [8399, 2, 7754, 8400],
 [8399, 2, 7754, 8400],
 [8399, 2, 7754, 8400],
 [8399, 2, 2866, 8400],
 [8399, 2, 2866, 8400],
 [8399, 2, 1342, 8400],
 [8399, 2, 2927, 8400],
 [8399, 2781, 1208, 8400],
 [8399, 2460, 8400],
 [8399, 3480, 8400],
 [8399, 3480, 8400],
 [8399, 3480, 8400],
 [8399, 3480, 8400],
 [8399, 3480, 8400],
 [839

In [10]:
# Inspect the encoded Spanish sentences. At this point spanish_data is a plain
# Python list, so this displays every sentence in full.
# NOTE(review): consider a slice such as spanish_data[:5] to keep the output small.
spanish_data

[[8082, 881, 8083],
 [8082, 881, 147, 8083],
 [8082, 1839, 8083],
 [8082, 2839, 7927, 8083],
 [8082, 4617, 8083],
 [8082, 1864, 8083],
 [8082, 3591, 673, 8083],
 [8082, 3591, 7923, 8083],
 [8082, 1864, 7926, 8083],
 [8082, 1864, 7926, 8083],
 [8082, 6027, 8083],
 [8082, 3672, 7927, 8083],
 [8082, 966, 8083],
 [8082, 3431, 8083],
 [8082, 6229, 7926, 8083],
 [8082, 263, 8083],
 [8082, 6397, 732, 584, 2648, 1477, 2267, 8083],
 [8082, 2648, 1477, 2267, 8083],
 [8082, 4529, 7923, 8083],
 [8082, 4529, 7927, 8083],
 [8082, 5378, 8083],
 [8082, 1724, 8083],
 [8082, 2089, 8083],
 [8082, 2898, 8083],
 [8082, 2107, 7936, 8083],
 [8082, 1963, 8083],
 [8082, 6259, 8083],
 [8082, 4617, 8083],
 [8082, 5840, 2132, 8083],
 [8082, 217, 114, 2132, 8083],
 [8082, 3221, 7858, 2132, 8083],
 [8082, 12, 7473, 7927, 8083],
 [8082, 12, 2900, 8083],
 [8082, 12, 7473, 2750, 8083],
 [8082, 12, 2900, 7923, 8083],
 [8082, 1962, 8083],
 [8082, 1962, 7923, 8083],
 [8082, 14, 6575, 8083],
 [8082, 61, 5152, 8083],
 [808

In [12]:
# Maximum number of ids allowed per encoded sentence (marker ids included).
MAX_LENGTH = 15

# Both lists must stay index-aligned: entry i of each is the same sentence pair.
assert len(english_data) == len(spanish_data), "sentence lists are misaligned"

# Keep a pair only when BOTH sides fit within MAX_LENGTH. A single pass over
# the zipped pairs gives the same result (and order) as deleting over-long
# English rows and then over-long Spanish rows, without the repeated
# mid-list `del` calls, each of which shifts the remainder of the list.
kept_pairs = [
    (en_sent, es_sent)
    for en_sent, es_sent in zip(english_data, spanish_data)
    if len(en_sent) <= MAX_LENGTH and len(es_sent) <= MAX_LENGTH
]
# Two comprehensions (rather than zip(*kept_pairs)) so an empty result still
# yields two empty lists.
english_data = [en_sent for en_sent, _ in kept_pairs]
spanish_data = [es_sent for _, es_sent in kept_pairs]

In [13]:
# Padding: right-pad ('post') every sequence with 0 up to MAX_LENGTH columns.
# The tuple of inputs is evaluated before either name is rebound.
english_data, spanish_data = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences, value=0, padding='post', maxlen=MAX_LENGTH)
    for sequences in (english_data, spanish_data)
)

In [14]:
# Inspect the padded English array (zero-padded at the end of each row).
english_data

array([[8399,  228, 8400, ...,    0,    0,    0],
       [8399,  228, 8400, ...,    0,    0,    0],
       [8399,  228, 8400, ...,    0,    0,    0],
       ...,
       [8399,   13, 1333, ...,  200, 8400,    0],
       [8399,  588,    4, ...,  306,  607, 8400],
       [8399,    4,  102, ..., 4552, 1455, 8400]], dtype=int32)

In [15]:
# Inspect the padded Spanish array (zero-padded at the end of each row).
spanish_data

array([[8082,  881, 8083, ...,    0,    0,    0],
       [8082,  881,  147, ...,    0,    0,    0],
       [8082, 1839, 8083, ...,    0,    0,    0],
       ...,
       [8082,   18, 1437, ..., 8083,    0,    0],
       [8082,    3,  240, ...,  218, 8083,    0],
       [8082,  168,  452, ..., 3954, 8083,    0]], dtype=int32)

In [19]:
# Encoder input is the Spanish array as-is. The decoder arrays are two views of
# the English array offset by one column: dec_inputs drops the last column,
# dec_outputs drops the first.
enc_inputs = spanish_data
dec_inputs, dec_outputs = english_data[:, :-1], english_data[:, 1:]

In [24]:
SAVE_PATH = "../../../../../data/processed/translator-data/es-en/dataset.npz"

# Write the three arrays into a single .npz archive, keyed by their variable names.
arrays_to_save = {
    'enc_inputs': enc_inputs,
    'dec_inputs': dec_inputs,
    'dec_outputs': dec_outputs,
}
np.savez(SAVE_PATH, **arrays_to_save)

In [69]:
# Save tokenizers: pickle each one to its own export file, English first.
tokenizer_exports = (
    ('../../../../../exports/translator/tokenizers/english/tokenizer.pkl', tokenizer_en),
    ('../../../../../exports/translator/tokenizers/spanish/tokenizer.pkl', tokenizer_es),
)
for export_path, tokenizer in tokenizer_exports:
    with open(export_path, 'wb') as out_file:
        pickle.dump(tokenizer, out_file)
