In [15]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
import random
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [22]:
def process_line_fit_tokenizer(line):
    """Clean one tab-separated corpus line for tokenizer fitting.

    The first tab-separated field is treated as the English text and the
    second as the Spanish text. Both are lower-cased, and every run of
    characters that is not an ASCII letter or one of ñ, á, é, í, ó, ú, ü is
    collapsed into a single space.

    Args:
        line: Raw line from the corpus file, or None (for example an unfilled
            slot of a None-initialised reservoir).

    Returns:
        An ``(english, spanish)`` tuple of cleaned strings, or None when the
        line is None, contains the ``(EC)`` marker, or has fewer than two
        tab-separated fields.
    """
    # Skip missing lines instead of raising TypeError on the `in` test below.
    if line is None or '(EC)' in line:
        return None

    segments = line.split('\t')
    if len(segments) < 2:
        return None

    def clean(text):
        # Lower-case first so the accented capitals also survive the filter.
        text = text.lower().strip()
        return re.sub(r'[^A-Za-zñÑáéíóúÁÉÍÓÚüÜ]+', ' ', text).strip()

    return clean(segments[0]), clean(segments[1])
    

def process_line_getitem(line):
    """Clean one tab-separated corpus line for batch construction.

    The first tab-separated field is treated as the English text and the
    second as the Spanish text. Both are lower-cased, and every run of
    characters that is not an ASCII letter or one of ñ, á, é, í, ó, ú, ü is
    collapsed into a single space.

    Args:
        line: Raw line from the corpus file, or None (for example an unfilled
            slot of a None-initialised reservoir).

    Returns:
        An ``(english, spanish)`` tuple of cleaned strings. ``(None, None)``
        is returned when the line is None, contains the ``(EC)`` marker, or
        has fewer than two tab-separated fields, so callers can always unpack
        the result into two names.
    """
    # Skip missing lines instead of raising TypeError on the `in` test below.
    if line is None or '(EC)' in line:
        return None, None

    segments = line.split('\t')
    if len(segments) < 2:
        return None, None

    def clean(text):
        # Lower-case first so the accented capitals also survive the filter.
        text = text.lower().strip()
        return re.sub(r'[^A-Za-zñÑáéíóúÁÉÍÓÚüÜ]+', ' ', text).strip()

    return clean(segments[0]), clean(segments[1])


def reservoir_sampling(file, start, end, num_lines_to_use):
    """Draw a random sample of lines from a window of an iterable.

    Only items whose zero-based position lies in ``[start, end)`` are
    candidates. The first ``num_lines_to_use`` candidates fill the reservoir;
    each later candidate replaces a random slot with the usual reservoir
    sampling probability.

    Args:
        file: Iterable yielding lines (e.g. an open text file).
        start: Position of the first candidate line (inclusive).
        end: Position one past the last candidate line (exclusive).
        num_lines_to_use: Size of the reservoir.

    Returns:
        A list of length ``num_lines_to_use``. Slots that could not be filled
        because the window holds fewer candidates remain None.
    """
    reservoir = [None] * num_lines_to_use

    for current_line_number, line in enumerate(file):
        # Nothing at or beyond `end` can be selected, so stop reading here
        # instead of scanning the remainder of the stream.
        if current_line_number >= end:
            break
        if current_line_number < start:
            continue

        i = current_line_number - start
        if i < num_lines_to_use:
            reservoir[i] = line
        else:
            j = np.random.randint(0, i + 1)
            if j < num_lines_to_use:
                reservoir[j] = line

    return reservoir


#### This data generator loads its data batch by batch, which is very inefficient. I would like to load a percentage of the file instead and cap it at a maximum number of lines.

In [17]:
# The data is selected with the reservoir sampling algorithm. It is used in the _fit_tokenizers method (to pick the
# lines the tokenizers are fitted on) and in _reservoir_sampling_indices (to pick the line indices served by __getitem__).
'''
Reservoir sampling is a randomized algorithm for choosing a simple random sample of k items from a list (or stream) containing n items, where n is either a very large or an unknown number. The reservoir sampling algorithm was introduced by Alan G. Waterman in 1978 and is particularly useful when n is large or the input list is in the form of a data stream that cannot be stored entirely in memory.

The algorithm works as follows:

    1. Create an empty reservoir (an array or list) of size k to store the selected items.
    2. Fill the reservoir with the first k items of the input list.
    3. For each item in the list after the k-th item (i.e., from item k+1 to item n), do the following:
        a. Generate a random integer j between 0 (inclusive) and the current item's index (inclusive).
        b. If j < k, replace the j-th item in the reservoir with the current item.
    4. After processing all the items in the list, the reservoir will contain a simple random sample of k items.

The reservoir sampling algorithm ensures that each item in the input list has an equal probability of being included in the final sample. The algorithm has a time complexity of O(n) and is memory-efficient, as it only needs to store k items at any given time.

I

''' 

class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that builds English->Spanish batches by re-reading the file.

    A random subset of line indices is drawn at construction time; every call
    to ``__getitem__`` scans the corpus file for the lines belonging to the
    requested batch, cleans them and converts them to padded id sequences.

    Each cleaned Spanish sentence is wrapped in ``<START>`` / ``<END>`` before
    it reaches the tokenizer, because ``translate()`` looks both markers up in
    ``spanish_tokenizer.word_index``. They are added after cleaning, since the
    cleaning regex would otherwise strip the angle brackets.

    Args:
        file_path: Path of the tab-separated corpus (English<TAB>Spanish).
        batch_size: Number of sampled lines per batch.
        max_sequence_length: Stored on the instance; no method of this class
            reads it.
        max_words: Forwarded to both Keras tokenizers as ``num_words``.
        shuffle: When true, a new random sample is drawn after every epoch.
        tokenizer_data_percentage: Fraction of the file used to fit the
            tokenizers.
        training_data_percentage: Fraction of the file served as training
            data.
    """

    # Sentence boundary markers expected by translate().
    START_TOKEN = '<START>'
    END_TOKEN = '<END>'

    def __init__(self, file_path, batch_size=32, max_sequence_length=100, max_words=None, shuffle=True, tokenizer_data_percentage = .05, training_data_percentage = .05):
        super().__init__()
        self.file_path = file_path
        self.batch_size = batch_size
        self.max_sequence_length = max_sequence_length
        self.max_words = max_words
        self.shuffle = shuffle
        self.tokenizer_data_percentage = tokenizer_data_percentage
        self.training_data_percentage = training_data_percentage

        # Count the lines once; every sample size below is derived from it.
        with open(self.file_path, "r", encoding="utf-8") as f:
            self.total_lines = sum(1 for _ in f)

        self.selected_line_indices = self._reservoir_sampling_indices()

        # filters="" and lower=False: the text is already cleaned, and the
        # upper-case <START>/<END> markers must be kept verbatim.
        self.english_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words, filters="", lower=False)
        self.spanish_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words, filters="", lower=False)

        self._fit_tokenizers()

    def _wrap_target(self, spanish):
        """Surround a cleaned Spanish sentence with the start/end markers."""
        return f"{self.START_TOKEN} {spanish} {self.END_TOKEN}"

    def _reservoir_sampling_indices(self):
        """Return a random sample of line indices covering the training fraction."""
        num_lines_to_use = int(self.total_lines * self.training_data_percentage)
        reservoir_indices = list(range(num_lines_to_use))

        for i in range(num_lines_to_use, self.total_lines):
            j = np.random.randint(0, i + 1)
            if j < num_lines_to_use:
                reservoir_indices[j] = i

        return reservoir_indices

    def _fit_tokenizers(self):
        """Fit both tokenizers on a reservoir sample of the corpus file."""
        num_lines_to_use = int(self.total_lines * self.tokenizer_data_percentage)
        selected_lines = [None] * num_lines_to_use

        with open(self.file_path, "r", encoding="utf-8") as f:
            for line_idx, line in tqdm(enumerate(f), total = self.total_lines, desc = 'Reservoir sampling'):
                if line_idx < num_lines_to_use:
                    selected_lines[line_idx] = line
                else:
                    r = random.randint(0, line_idx)
                    if r < num_lines_to_use:
                        selected_lines[r] = line

        # The cleaning is pure-Python regex work that holds the GIL, so a
        # thread pool adds overhead without parallelism; a plain loop is used.
        english_segments, spanish_segments = [], []
        for line in tqdm(selected_lines, desc="Processing lines"):
            if line is None:
                # Slot left unfilled because the sample size exceeds the file.
                continue
            pair = process_line_fit_tokenizer(line)
            if pair is not None:
                english_segments.append(pair[0])
                spanish_segments.append(self._wrap_target(pair[1]))

        self.english_tokenizer.fit_on_texts(english_segments)
        self.spanish_tokenizer.fit_on_texts(spanish_segments)

    def __len__(self):
        """Number of full batches in the current sample."""
        # Derived from the sample already held in memory; the file is not
        # re-read on every call.
        return len(self.selected_line_indices) // self.batch_size

    def __getitem__(self, index):
        """Build batch ``index``.

        Returns:
            ``([english, spanish[:, :-1]], spanish[:, 1:])`` where both arrays
            are post-padded id sequences; the Spanish array is split into the
            decoder input and the one-step-shifted target.
        """
        start = index * self.batch_size
        end = (index + 1) * self.batch_size

        # A set gives O(1) membership tests while scanning the file; the
        # sampled indices are unique, so nothing is lost.
        wanted = set(self.selected_line_indices[start:end])

        selected_lines = []
        if wanted:
            with open(self.file_path, "r", encoding="utf-8") as f:
                for line_index, line in enumerate(f):
                    if line_index in wanted:
                        selected_lines.append(line)
                        wanted.discard(line_index)
                        if not wanted:
                            break

        english_batch, spanish_batch = [], []
        for line in selected_lines:
            eng, spa = process_line_getitem(line)
            if eng is not None and spa is not None:
                english_batch.append(eng)
                spanish_batch.append(self._wrap_target(spa))

        english_sequences = self.english_tokenizer.texts_to_sequences(english_batch)
        spanish_sequences = self.spanish_tokenizer.texts_to_sequences(spanish_batch)

        padded_english_sequences = tf.keras.preprocessing.sequence.pad_sequences(english_sequences, padding="post")
        padded_spanish_sequences = tf.keras.preprocessing.sequence.pad_sequences(spanish_sequences, padding="post")

        # The categorical target data must be used with a 'categorical_crossentropy' loss function.
        #target_data = tf.keras.utils.to_categorical(padded_spanish_sequences, num_classes=self.max_words + 1)

        return [padded_english_sequences, padded_spanish_sequences[:, :-1]], padded_spanish_sequences[:, 1:]

    def on_epoch_end(self):
        """Draw a fresh random sample of lines for the next epoch.

        The corpus file itself is never modified: a new index sample is drawn
        and its order shuffled, instead of rewriting the file on disk.
        """
        if self.shuffle:
            self.selected_line_indices = self._reservoir_sampling_indices()
            random.shuffle(self.selected_line_indices)

In [18]:
# The data is selected with the reservoir sampling algorithm. It is used in __init__ (reservoir_sampling() picks the
# training lines that __getitem__ later slices into batches) and in the _fit_tokenizers method.
'''
Reservoir sampling is a randomized algorithm for choosing a simple random sample of k items from a list (or stream) containing n items, where n is either a very large or an unknown number. The reservoir sampling algorithm was introduced by Alan G. Waterman in 1978 and is particularly useful when n is large or the input list is in the form of a data stream that cannot be stored entirely in memory.

The algorithm works as follows:

    1. Create an empty reservoir (an array or list) of size k to store the selected items.
    2. Fill the reservoir with the first k items of the input list.
    3. For each item in the list after the k-th item (i.e., from item k+1 to item n), do the following:
        a. Generate a random integer j between 0 (inclusive) and the current item's index (inclusive).
        b. If j < k, replace the j-th item in the reservoir with the current item.
    4. After processing all the items in the list, the reservoir will contain a simple random sample of k items.

The reservoir sampling algorithm ensures that each item in the input list has an equal probability of being included in the final sample. The algorithm has a time complexity of O(n) and is memory-efficient, as it only needs to store k items at any given time.

I

''' 

class MemoryDataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that serves English->Spanish batches from an in-memory sample.

    A random sample of raw corpus lines is loaded once at construction time;
    ``__getitem__`` slices that list, cleans the lines and converts them to
    padded id sequences.

    Each cleaned Spanish sentence is wrapped in ``<START>`` / ``<END>`` before
    it reaches the tokenizer, because ``translate()`` looks both markers up in
    ``spanish_tokenizer.word_index``. They are added after cleaning, since the
    cleaning regex would otherwise strip the angle brackets.

    Args:
        file_path: Path of the tab-separated corpus (English<TAB>Spanish).
        batch_size: Number of sampled lines per batch.
        max_words: Forwarded to both Keras tokenizers as ``num_words``.
        shuffle: When true, the in-memory lines are shuffled after every
            epoch.
        tokenizer_data_percentage: Fraction of the file used to fit the
            tokenizers.
        training_data_percentage: Fraction of the file kept in memory as
            training data.
    """

    # Sentence boundary markers expected by translate().
    START_TOKEN = '<START>'
    END_TOKEN = '<END>'

    def __init__(self, file_path, batch_size=32, max_words=None, shuffle=True, tokenizer_data_percentage = .05, training_data_percentage = .05):
        super().__init__()
        self.file_path = file_path
        self.batch_size = batch_size
        self.max_words = max_words
        self.shuffle = shuffle
        self.tokenizer_data_percentage = tokenizer_data_percentage
        self.training_data_percentage = training_data_percentage

        # Count the lines once; every sample size below is derived from it.
        with open(self.file_path, "r", encoding="utf-8") as f:
            self.total_lines = sum(1 for _ in f)

        num_training_lines = int(self.total_lines * self.training_data_percentage)
        with open(self.file_path, "r", encoding="utf-8") as f:
            self.lines = reservoir_sampling(f, 0, self.total_lines, num_training_lines)

        # filters="" and lower=False: the text is already cleaned, and the
        # upper-case <START>/<END> markers must be kept verbatim.
        self.english_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words, filters="", lower=False)
        self.spanish_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words, filters="", lower=False)

        self._fit_tokenizers()

    def _wrap_target(self, spanish):
        """Surround a cleaned Spanish sentence with the start/end markers."""
        return f"{self.START_TOKEN} {spanish} {self.END_TOKEN}"

    def _reservoir_sampling_indices(self):
        """Return a random sample of line indices covering the training fraction.

        No other method of this class calls it; it is kept so the class
        interface stays unchanged.
        """
        num_lines_to_use = int(self.total_lines * self.training_data_percentage)
        reservoir_indices = list(range(num_lines_to_use))

        for i in range(num_lines_to_use, self.total_lines):
            j = np.random.randint(0, i + 1)
            if j < num_lines_to_use:
                reservoir_indices[j] = i

        return reservoir_indices

    def _fit_tokenizers(self):
        """Fit both tokenizers on a reservoir sample of the corpus file."""
        num_lines_to_use = int(self.total_lines * self.tokenizer_data_percentage)
        selected_lines = [None] * num_lines_to_use

        with open(self.file_path, "r", encoding="utf-8") as f:
            for line_idx, line in tqdm(enumerate(f), total = self.total_lines, desc = 'Reservoir sampling'):
                if line_idx < num_lines_to_use:
                    selected_lines[line_idx] = line
                else:
                    r = random.randint(0, line_idx)
                    if r < num_lines_to_use:
                        selected_lines[r] = line

        # The cleaning is pure-Python regex work that holds the GIL, so a
        # thread pool adds overhead without parallelism; a plain loop is used.
        english_segments, spanish_segments = [], []
        for line in tqdm(selected_lines, desc="Processing lines"):
            if line is None:
                # Slot left unfilled because the sample size exceeds the file.
                continue
            pair = process_line_fit_tokenizer(line)
            if pair is not None:
                english_segments.append(pair[0])
                spanish_segments.append(self._wrap_target(pair[1]))

        self.english_tokenizer.fit_on_texts(english_segments)
        self.spanish_tokenizer.fit_on_texts(spanish_segments)

    def __len__(self):
        """Number of full batches available from the in-memory sample."""
        # The sample is already in memory; the file is not re-read on every
        # call.
        return len(self.lines) // self.batch_size

    def __getitem__(self, index):
        """Build batch ``index`` from the in-memory lines.

        Returns:
            ``([english, spanish[:, :-1]], spanish[:, 1:])`` where both arrays
            are post-padded id sequences; the Spanish array is split into the
            decoder input and the one-step-shifted target.
        """
        start = index * self.batch_size
        end = (index + 1) * self.batch_size
        # self.lines holds the raw lines drawn by reservoir_sampling in __init__.
        selected_lines = self.lines[start:end]

        english_batch, spanish_batch = [], []
        for line in selected_lines:
            if line is None:
                # Slot left unfilled because the sample size exceeds the file.
                continue
            eng, spa = process_line_getitem(line)
            if eng is not None and spa is not None:
                english_batch.append(eng)
                spanish_batch.append(self._wrap_target(spa))

        english_sequences = self.english_tokenizer.texts_to_sequences(english_batch)
        spanish_sequences = self.spanish_tokenizer.texts_to_sequences(spanish_batch)

        padded_english_sequences = tf.keras.preprocessing.sequence.pad_sequences(english_sequences, padding="post")
        padded_spanish_sequences = tf.keras.preprocessing.sequence.pad_sequences(spanish_sequences, padding="post")

        # The categorical target data must be used with a 'categorical_crossentropy' loss function.
        #target_data = tf.keras.utils.to_categorical(padded_spanish_sequences, num_classes=self.max_words + 1)

        return [padded_english_sequences, padded_spanish_sequences[:, :-1]], padded_spanish_sequences[:, 1:]

    def on_epoch_end(self):
        """Shuffle the in-memory training lines between epochs.

        Batches are built from ``self.lines``, so that list is what gets
        shuffled; the corpus file on disk is left untouched.
        """
        if self.shuffle:
            random.shuffle(self.lines)

In [23]:
def translate(model, english_sentence, data_generator, max_length=100):
    """Greedily decode a Spanish translation for one English sentence.

    Args:
        model: Object whose ``predict([encoder_input, decoder_input],
            verbose=0)`` returns an array indexable as ``[0, -1]`` to obtain
            the scores of the next token.
        english_sentence: Source sentence, passed to the English tokenizer
            unchanged. NOTE(review): the generators in this file fit their
            tokenizers on lower-cased text with punctuation removed, so the
            sentence presumably needs the same cleaning - confirm with callers.
        data_generator: Object exposing ``english_tokenizer`` and
            ``spanish_tokenizer``.
        max_length: Maximum number of Spanish tokens to generate (100 by
            default).

    Returns:
        The decoded Spanish sentence as produced by
        ``spanish_tokenizer.sequences_to_texts``.

    Raises:
        KeyError: If ``<START>`` or ``<END>`` is missing from the Spanish
            tokenizer vocabulary.
    """
    spanish_tokenizer = data_generator.spanish_tokenizer
    word_index = spanish_tokenizer.word_index

    # Resolve both markers once, up front, and fail with a clear message.
    missing = [token for token in ('<START>', '<END>') if token not in word_index]
    if missing:
        raise KeyError(
            "Spanish tokenizer vocabulary lacks required token(s): " + ", ".join(missing)
        )
    start_token_index = word_index['<START>']
    end_token_index = word_index['<END>']

    # Tokenize the English sentence. With a single sequence there is nothing
    # to pad against, so the id list is converted to a (1, length) array
    # directly.
    english_sequence = data_generator.english_tokenizer.texts_to_sequences([english_sentence])
    padded_english_sequence = np.array(english_sequence, dtype=np.int32)

    # The decoder input starts as just the <START> token.
    spanish_sequence = np.array([[start_token_index]], dtype=np.int32)
    spanish_tokens = []

    # Generate one token per step until <END> is predicted or the cap is hit.
    while len(spanish_tokens) < max_length:
        # verbose=0 keeps Keras from printing a progress bar for every token.
        output_tokens = model.predict([padded_english_sequence, spanish_sequence], verbose=0)

        # Greedy choice: most probable token at the last decoder position.
        predicted_token_index = int(np.argmax(output_tokens[0, -1]))
        if predicted_token_index == end_token_index:
            break

        spanish_tokens.append(predicted_token_index)
        next_step = np.array([[predicted_token_index]], dtype=np.int32)
        spanish_sequence = np.hstack([spanish_sequence, next_step])

    return spanish_tokenizer.sequences_to_texts([spanish_tokens])[0]


In [19]:
#file_path = '.\EN-ES.txt\EN-ES.txt'

#batch_size = 32
#max_sequence_length = 50
#max_words = 1000

#data_gen = DataGenerator(file_path, batch_size=batch_size, max_sequence_length=max_sequence_length, max_words=max_words, tokenizer_data_percentage= .01, training_data_percentage = .01)

In [20]:
#first_batch = data_gen.__getitem__(0)

In [21]:
#display(first_batch)