In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tf_text
from sklearn.model_selection import train_test_split

2024-02-05 16:18:25.459124: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-05 16:18:25.491214: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-05 16:18:25.491247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-05 16:18:25.492013: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-05 16:18:25.496961: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-05 16:18:25.497820: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
def get_raw_data(path, filename):
    """
    Pull raw data from the provided filepath
    """
    filepath = os.path.join(path, filename)
    with open(filepath, "r") as f:
        sentences = f.readlines()
    return sentences

In [3]:
def split_data(raw_data, train_size=0.8, test_size=0.2, val_size=0.2):
    """
    Input: raw_data(array), train_size, test_size, val_size(Percentage)
    Shuffles the data and splits it into (input, output).
    Returns train_data, test_data, val_data
    """
    raw_data = np.array([sen.split('\t') for sen in raw_data])
    np.random.shuffle(raw_data)
    X, y = raw_data[:,0], raw_data[:,1]

    X, X_test, y, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size)
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, test_size=val_size)
    
    return X_train, y_train, X_test, y_test, X_val, y_val

In [4]:
def context_text_preprocess(context):
    context = tf.ragged.constant(context)
    context = tf.strings.lower(context)
    context = tf.strings.regex_replace(context, "[^a-z0-9.?!,¿ ]", "")
    context = tf.strings.regex_replace(context, "[.?!,¿]", r" \0 ")
    context = tf.strings.strip(context)
    return context
    
def target_text_preprocess(target):
    target = tf.ragged.constant(target)
    target = tf.strings.lower(target)
    target = tf.strings.regex_replace(target, "[^א-ת0-9a-z.?!,¿ ]", "")
    target = tf.strings.regex_replace(target, "[.?!,¿]", r" \0 ")
    target = tf.strings.strip(target)

    target = tf.strings.join(["[SOS]", target, "[EOS]"], separator=' ' )
    
    target = target.numpy()
    target = [sen.decode('utf-8') for sen in target]
    
    return target

In [5]:
raw_data = get_raw_data("./datasets/sentences", "heb.txt")
X_train, y_train, X_test, y_test, X_val, y_val = split_data(raw_data)

In [6]:
context = context_text_preprocess(X_train)
target = target_text_preprocess(y_train)

2024-02-05 16:18:28.435762: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-05 16:18:28.448937: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
context_vectorization = keras.layers.TextVectorization(max_tokens=5000)
target_vectorization = keras.layers.TextVectorization(max_tokens=5000)

context_vectorization.adapt(context)
target_vectorization.adapt(target)

In [8]:
class Encoder(keras.layers.Layer):
    def __init__(self, units, text_processor, **kwargs):
        super().__init__(**kwargs)

        self.units = units
        self.text_processor = text_processor
        self.vocab = self.text_processor.get_vocabulary()
        
        
        self.embedding_layer = keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.units, mask_zero=True)
        self.rnn = keras.layers.LSTM(self.units, return_sequences=True, return_state=True)

    def call(self, inputs):
        x = inputs
        x = self.text_processor(x)
        self.embedding = self.embedding_layer(x)
        self.context, self.hidden_state, self.cell_state = self.rnn(self.embedding)
        return self.context

In [9]:
class CrossAttention(keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.attention = keras.layers.Attention()
        self.layer_norm = keras.layers.LayerNormalization()
        self.add = keras.layers.Add()
    
    def call(self, value, query):

        attention_output, attention_scores = self.attention([query, value], return_attention_scores=True)
        self.attention_weights = attention_scores
        x = self.add([query, attention_output])
        x = self.layer_norm(x)

        return x
    

In [10]:
class Decoder(keras.layers.Layer):
    def __init__(self, units, text_processor,**kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.text_processor = text_processor
        self.vocab = self.text_processor.get_vocabulary()
        
        self.embedding_layer = keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.units, mask_zero=True)
        self.rnn = keras.layers.LSTM(self.units, return_sequences=True, return_state=True)
        
        self.attention = CrossAttention()
        self.output_layer = keras.layers.Dense(len(self.vocab))
    
    def call(self, context, x):
        x = self.text_processor(x)
        embedding = self.embedding_layer(x)
        x, f_hidden_states, f_cell_states = self.rnn(embedding)
        x = self.attention(context, x)
        logits = self.output_layer(x)
        return logits
        

In [11]:
encoder = Encoder(16, context_vectorization)
decoder = Decoder(16, target_vectorization)

In [12]:
context = encoder(context[:5000])
logits = decoder(context, target[:5000])

2024-02-05 16:18:34.568177: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2900000000 exceeds 10% of free system memory.
2024-02-05 16:18:35.195490: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2900000000 exceeds 10% of free system memory.


In [15]:
logits

<tf.Tensor: shape=(5000, 29, 5000), dtype=float32, numpy=
array([[[ 0.01501542, -0.02215884,  0.03292631, ..., -0.00316633,
          0.03249658,  0.01084893],
        [-0.01315241, -0.02400041,  0.02878876, ..., -0.01211426,
          0.02269103,  0.01042926],
        [ 0.01843226, -0.00509004,  0.0372501 , ...,  0.00710234,
          0.02691102,  0.00563984],
        ...,
        [-0.00019369,  0.00490617,  0.02298385, ..., -0.00897576,
          0.01258669,  0.01089346],
        [-0.00019369,  0.00490617,  0.02298385, ..., -0.00897576,
          0.01258669,  0.01089346],
        [-0.00019369,  0.00490617,  0.02298385, ..., -0.00897576,
          0.01258669,  0.01089346]],

       [[ 0.01870219, -0.01824124,  0.02909059, ..., -0.01494344,
          0.03485141,  0.02876124],
        [ 0.00853089, -0.0220951 , -0.00837404, ..., -0.02269654,
          0.02480217,  0.03579467],
        [-0.0010935 , -0.01995786,  0.00864671, ..., -0.02249029,
          0.03681054,  0.02755852],
        .

In [18]:
np.array([[[1,2,3],[1,2,3]],[[1,2,3],[1,2,3]],[[1,2,3],[1,2,3]],[[1,2,3],[1,2,3]]]).shape

(4, 2, 3)