## Recreating the following for classification problem
https://www.tensorflow.org/text/tutorials/transformer

Need tensorflow-text for using the BERT model tokenizer

In [2]:
# Install the nightly version of TensorFlow to use the improved
# masking support for `tf.keras.layers.MultiHeadAttention`.
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q -U tensorflow-text tensorflow

Import the necessary modules:

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_text
from sklearn.utils import shuffle 

Doing necessary data preprocessing

In [4]:
os.listdir('.')

['.config',
 'twitter_validation.csv',
 'twitter_training.csv',
 'ted_hrlr_translate_pt_en_converter.zip',
 'ted_hrlr_translate_pt_en_converter',
 'sample_data']

In [5]:
training_data_loc = "./twitter_training.csv"
val_data_loc = "./twitter_validation.csv"

In [6]:
df = pd.read_csv(training_data_loc)
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [7]:
df = df[df.columns[2:]]

In [8]:
df = df.rename(columns={"Positive":"Sentiment", "im getting on borderlands and i will murder you all ,":"sentence"})

In [9]:
df['Sentiment'].value_counts()

Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: Sentiment, dtype: int64

In [10]:
df = df[df.apply(lambda x: (x["Sentiment"] == "Positive") or (x["Sentiment"] == "Negative"), axis=1)]

In [11]:
df['Sentiment'].value_counts()

Negative    22542
Positive    20831
Name: Sentiment, dtype: int64

In [12]:
df['Sentiment'] = df['Sentiment'].apply(lambda x: 1 if x == "Positive" else 0)

In [13]:
df = shuffle(df)

df

Unnamed: 0,Sentiment,sentence
4646,0,SBMM is seriously ruining the fun gaming shoul...
11238,1,Can't wait to play
36923,0,
40633,0,"I have noticed that the streamers I watch, who..."
17784,1,Oh cool...
...,...,...
48730,1,As much when people don’t like the Amazon busi...
26039,1,Watching the leaked gameplay of . . It's not ...
4729,1,Amazon has the coolest shit I never thought I ...
34639,1,Rue it's so beautiful.....


In [14]:
df.isna().value_counts()

Sentiment  sentence
False      False       43012
           True          361
dtype: int64

In [15]:
df.dropna(inplace=True) # dropping empty sentences

In [16]:
df

Unnamed: 0,Sentiment,sentence
4646,0,SBMM is seriously ruining the fun gaming shoul...
11238,1,Can't wait to play
36923,0,
40633,0,"I have noticed that the streamers I watch, who..."
17784,1,Oh cool...
...,...,...
48730,1,As much when people don’t like the Amazon busi...
26039,1,Watching the leaked gameplay of . . It's not ...
4729,1,Amazon has the coolest shit I never thought I ...
34639,1,Rue it's so beautiful.....


In [17]:
labels = df['Sentiment']
features = df['sentence']

## Tokenizer
using subword tokenizer implementation from Bert model

In [18]:
model_name = 'ted_hrlr_translate_pt_en_converter'
tf.keras.utils.get_file(
    f'{model_name}.zip',
    f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',
    cache_dir='.', cache_subdir='', extract=True
)

'./ted_hrlr_translate_pt_en_converter.zip'

In [19]:
tokenizers = tf.saved_model.load(model_name)

The `tf.saved_model` contains two text tokenizers, one for English and one for Portuguese. We just need the English one

In [20]:
[item for item in dir(tokenizers.en) if not item.startswith('_')]

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

In [21]:
sample = features.sample(n=5)
sample

57799    @ Rainbow6Game Please FIX YOUR GAME! I'm from ...
11023    August 14. There are still no release dates or...
32244    Ruining my life by learning... Twitch.tv / quirky
71250    @ GhostRecon Almost all carry handles for back...
40170    @ EA _ DICE Please make a mobile version of Ba...
Name: sentence, dtype: object

In [22]:
text_sample = sample.iloc[0]
print(text_sample)

@ Rainbow6Game Please FIX YOUR GAME! I'm from PlayStation the servers are so gy!


In [23]:
encoded = tokenizers.en.tokenize(sample)
# print(encoded)
print('Padded and encoded embeddings:')
for row in encoded.to_list():
  print(row)

Padded and encoded embeddings:
[2, 31, 1470, 6401, 1163, 5140, 1572, 1102, 1381, 135, 1112, 4, 45, 9, 49, 109, 3372, 2982, 71, 6307, 88, 86, 82, 43, 240, 4, 3]
[2, 4349, 1850, 15, 96, 86, 235, 154, 1903, 4658, 113, 3669, 15, 2171, 1745, 74, 296, 95, 119, 157, 193, 2301, 3456, 4312, 3746, 2418, 470, 15, 154, 297, 13, 6762, 1722, 2096, 85, 71, 599, 14, 130, 15, 15, 1765, 261, 14, 1004, 1947, 310, 4, 15, 31, 3372, 2982, 31, 60, 3896, 138, 230, 3]
[2, 54, 6013, 1547, 99, 183, 118, 507, 15, 15, 15, 56, 1325, 5514, 15, 1433, 16, 53, 6013, 5180, 240, 3]
[2, 31, 43, 5502, 773, 966, 6407, 384, 101, 1808, 2664, 88, 85, 193, 5540, 88, 86, 262, 4414, 13, 172, 190, 375, 142, 2242, 13, 2221, 79, 86, 349, 95, 6157, 81, 77, 71, 1149, 323, 13, 8, 166, 45, 127, 106, 73, 125, 121, 74, 116, 142, 773, 1640, 75, 810, 79, 73, 102, 6539, 135, 193, 5540, 73, 71, 193, 74, 71, 142, 773, 15, 3]
[2, 31, 41, 303, 35, 40, 3168, 1102, 161, 37, 1148, 1695, 74, 2312, 6438, 21, 13, 76, 127, 97, 71, 1345, 1148, 43, 2199,

The padded 2 and 3 stands for [START] and [END]

In [25]:
encoded[:, :128]

<tf.RaggedTensor [[2, 31, 1470, 6401, 1163, 5140, 1572, 1102, 1381, 135, 1112, 4, 45, 9, 49,
  109, 3372, 2982, 71, 6307, 88, 86, 82, 43, 240, 4, 3]                    ,
 [2, 4349, 1850, 15, 96, 86, 235, 154, 1903, 4658, 113, 3669, 15, 2171,
  1745, 74, 296, 95, 119, 157, 193, 2301, 3456, 4312, 3746, 2418, 470, 15,
  154, 297, 13, 6762, 1722, 2096, 85, 71, 599, 14, 130, 15, 15, 1765, 261,
  14, 1004, 1947, 310, 4, 15, 31, 3372, 2982, 31, 60, 3896, 138, 230, 3]  ,
 [2, 54, 6013, 1547, 99, 183, 118, 507, 15, 15, 15, 56, 1325, 5514, 15,
  1433, 16, 53, 6013, 5180, 240, 3]                                    ,
 [2, 31, 43, 5502, 773, 966, 6407, 384, 101, 1808, 2664, 88, 85, 193, 5540,
  88, 86, 262, 4414, 13, 172, 190, 375, 142, 2242, 13, 2221, 79, 86, 349,
  95, 6157, 81, 77, 71, 1149, 323, 13, 8, 166, 45, 127, 106, 73, 125, 121,
  74, 116, 142, 773, 1640, 75, 810, 79, 73, 102, 6539, 135, 193, 5540, 73,
  71, 193, 74, 71, 142, 773, 15, 3]                                        ,
 [2, 31, 4

In [26]:
round_trip = tokenizers.en.detokenize(encoded[:, :128])

print('Original text:')
for line in round_trip.numpy():
  print(line.decode('utf-8'))

Original text:
@ rainbow6game please fix your game ! i ' m from playstation the servers are so gy !
august 14 . there are still no release dates or prices . hidden lack of information about how well backward compatibility works . no real , informative videos for the line - up . . worst next - generation marketing ever ! . @ playstation @ xboxinge
ruining my life by learning . . . twitch . tv / quirky
@ ghostrecon almost all carry handles for backpacks are too fluid , even most light vests , hopefully you are thinking about fixing this in the near future , & also i would like to see more of these vest types that allow you to attach your backpack to the back of the vest .
@ ea _ dice please make a mobile version of battlefield 4 , it would be the greatest mobile gaming has ever achieved


## Set up a data pipeline with `tf.data`

The following function takes batches of text as input, and converts them to a format suitable for training. 

1. It tokenizes them into ragged batches.
2. It trims each to be no longer than `MAX_TOKENS`.
3. It splits the output (English) tokens into inputs and labels. THese shifted by one step so that the `label` at each location is the id of the next token.
4. It converts the `RaggedTensor`s to padded dense `Tensor`s.
5. It returns an `(inputs, labels)` pair.

NOTE: We will bypass this method in the batch making area and utilize our own method since we are not using `tf.data`

# **Problem here!!!**
Cannot prepare batches as required. I don't know how to fix it.

In [27]:
MAX_TOKENS=128
def prepare_batch(en): # not actually preparing batches
    en = tokenizers.en.tokenize(en)
    en = en[:, :(MAX_TOKENS+1)] # pads with `.` if the sentence is less than 128 words and strips if more than 128 words
    # print(en)
    en_inputs = en[:, :-1].to_tensor()
    return en

In [28]:
tokenized_sentences = prepare_batch(features)

In [29]:
list_tokenized = tokenized_sentences.to_list()

In [30]:
len(list_tokenized), len(labels)

(43012, 43012)

In [31]:
for i in range(len(list_tokenized)-43000):
  print(list_tokenized[i], labels.iloc[i])

[2, 55, 6399, 504, 80, 2277, 54, 6013, 1547, 71, 994, 43, 2199, 138, 234, 97, 15, 192, 86, 1213, 343, 77, 4116, 138, 92, 5320, 138, 81, 30, 4, 15, 15, 745, 89, 76, 100, 37, 43, 4096, 113, 93, 1113, 3826, 42, 1192, 88, 15, 3] 0
[2, 94, 9, 56, 1009, 73, 528, 3] 1
[2, 3] 0
[2, 45, 89, 1486, 75, 71, 3694, 343, 45, 732, 13, 136, 110, 528, 2663, 155, 171, 2312, 6438, 58, 13, 86, 121, 105, 2626, 6572, 72, 166, 89, 37, 241, 5763, 15, 3] 0
[2, 525, 875, 15, 15, 15, 3] 1
[2, 107, 271, 15, 56, 271, 16, 19, 5949, 4562, 684, 677, 107, 271, 15, 56, 271, 16, 20, 303, 1223, 787, 504, 1374, 484, 90, 71, 115, 1283, 89, 83, 317, 73, 71, 6273, 43, 2365, 92, 71, 5840, 56, 1548, 3916, 258, 30, 81, 80, 192, 91, 269, 9, 56, 106, 85, 271, 4568, 1013, 13, 83, 3499, 93, 167, 75, 86, 269, 9, 56, 1528, 15, 107, 271, 15, 56, 271, 16, 20, 303, 1223, 787, 504, 1374, 484, 3] 0
[2, 31, 41, 303, 31, 41, 2489, 5934, 269, 1192, 392, 3310, 99, 55, 4895, 2199, 230, 6237, 3917, 80, 51, 1699, 1012, 1240, 3] 0
[2, 202, 79, 31,

## Copied code 
we need to change it up a bit so we can use same definitions for our own classification problem

### Define the Embedding and Positional encoding

In [32]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)
  
  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

In [33]:
temp_encoding = positional_encoding(length=2048, depth=512)

In [34]:
temp_encoding[tf.newaxis, :128, :].shape

TensorShape([1, 128, 512])

In [35]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x # (batch_size, length-of-signal, 512)


In [36]:
x = np.zeros((400, 128, 512))
y = np.zeros((1, 128, 512))

In [38]:
(x + y).shape

(400, 128, 512)

In [39]:
tf.cast(512, tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=512.0>

In [40]:
tokenizers.en.get_vocab_size()

<tf.Tensor: shape=(), dtype=int32, numpy=7010>

In [42]:
temp_layer = tf.keras.layers.Embedding(7010, 512, mask_zero=True)
# help(temp_layer)

In [43]:
temp_model = tf.keras.Sequential()
temp_layer = tf.keras.layers.Embedding(1000, 64, input_length=10, mask_zero=True)
temp_model.add(temp_layer)
# The model will take as input an integer matrix of size (batch,
# input_length), and the largest integer (i.e. word index) in the input
# should be no larger than 999 (vocabulary size).
# Now model.output_shape is (None, 10, 64), where `None` is the batch
# dimension.
input_array = np.random.randint(1000, size=(1, 10))
temp_model.compile('rmsprop', 'mse')
output_array = temp_model.predict(input_array)
# print(output_array)
print(output_array.shape)
#(1, 10, 64)

(1, 10, 64)


In [66]:
# print(temp_layer.compute_mask(input_array, mask_zero=True)) # would not work as this is just for forwarding the compute_mask

In [44]:
embed_en = PositionalEmbedding(vocab_size=tokenizers.en.get_vocab_size(), d_model=512)
embed_en

<__main__.PositionalEmbedding at 0x7fd55d6e60d0>

In [65]:
encoded.to_tensor()

<tf.Tensor: shape=(5, 69), dtype=int64, numpy=
array([[   2,   31, 1470, 6401, 1163, 5140, 1572, 1102, 1381,  135, 1112,
           4,   45,    9,   49,  109, 3372, 2982,   71, 6307,   88,   86,
          82,   43,  240,    4,    3,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   2, 4349, 1850,   15,   96,   86,  235,  154, 1903, 4658,  113,
        3669,   15, 2171, 1745,   74,  296,   95,  119,  157,  193, 2301,
        3456, 4312, 3746, 2418,  470,   15,  154,  297,   13, 6762, 1722,
        2096,   85,   71,  599,   14,  130,   15,   15, 1765,  261,   14,
        1004, 1947,  310,    4,   15,   31, 3372, 2982,   31,   60, 3896,
         138,  230,    3,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   

In [60]:
# This is where the error pops up. I assume it is because of the way we are preparing batches.

# embed_en(encoded)
embed_en(encoded.to_tensor()).shape

TensorShape([5, 69, 512])

### Define the feed forward network

Define a function for the point-wise feed-forward network that you'll use later.

The network consists of two linear layers (`tf.keras.layers.Dense`) with a ReLU activation in-between:

In [62]:
def point_wise_feed_forward_network(
  d_model, # Input/output dimensionality.
  dff # Inner-layer dimensionality.
  ):

  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # Shape `(batch_size, seq_len, dff)`.
      tf.keras.layers.Dense(d_model)  # Shape `(batch_size, seq_len, d_model)`.
  ])

### Define the encoder layer

In [63]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*,
               d_model, # Input/output dimensionality.
               num_attention_heads,
               dff, # Inner-layer dimensionality.
               dropout_rate=0.1
               ):
    super().__init__()


    # Multi-head self-attention.
    self.mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_attention_heads,
        key_dim=d_model, # Size of each attention head for query Q and key K.
        dropout=dropout_rate,
        )
    # Point-wise feed-forward network.
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    # Layer normalization.
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    # Dropout for the point-wise feed-forward network.
    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):

    # A boolean mask.
    if mask is not None:
      mask1 = mask[:, :, None]
      mask2 = mask[:, None, :]
      attention_mask = mask1 & mask2
    else:
      attention_mask = None

    # Multi-head self-attention output (`tf.keras.layers.MultiHeadAttention `).
    attn_output = self.mha(
        query=x,  # Query Q tensor.
        value=x,  # Value V tensor.
        key=x,  # Key K tensor.
        attention_mask=attention_mask, # A boolean mask that prevents attention to certain positions.
        training=training, # A boolean indicating whether the layer should behave in training mode.
        )

    # Multi-head self-attention output after layer normalization and a residual/skip connection.
    out1 = self.layernorm1(x + attn_output)  # Shape `(batch_size, input_seq_len, d_model)`

    # Point-wise feed-forward network output.
    ffn_output = self.ffn(out1)  # Shape `(batch_size, input_seq_len, d_model)`
    ffn_output = self.dropout1(ffn_output, training=training)
    # Point-wise feed-forward network output after layer normalization and a residual skip connection.
    out2 = self.layernorm2(out1 + ffn_output)  # Shape `(batch_size, input_seq_len, d_model)`.

    return out2

In [67]:
sample_encoder_layer = EncoderLayer(d_model=512, num_attention_heads=8, dff=2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((2, 3, 512)), training=False, mask=None)

# Print the shape.
print(sample_encoder_layer_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

(2, 3, 512)


### Define the encoder

In [78]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self,
               *,
               num_layers,
               d_model, # Input/output dimensionality.
               num_attention_heads,
               dff, # Inner-layer dimensionality.
               input_vocab_size, # Input (Portuguese) vocabulary size.
               dropout_rate=0.1
               ):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # Embeddings + Positional encoding
    self.pos_embedding = PositionalEmbedding(input_vocab_size, d_model)

    # Encoder layers.
    self.enc_layers = [
        EncoderLayer(
          d_model=d_model,
          num_attention_heads=num_attention_heads,
          dff=dff,
          dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    # Dropout.
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  # Masking.
  def compute_mask(self, x, previous_mask=None):
    return self.pos_embedding.compute_mask(x, previous_mask)

  def call(self, x, training):

    seq_len = tf.shape(x)[1]

    # Sum up embeddings and positional encoding.
    mask = self.compute_mask(x)
    x = self.pos_embedding(x)  # Shape `(batch_size, input_seq_len, d_model)`.
    # Add dropout.
    x = self.dropout(x, training=training)

    # N encoder layers.
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

    # experimental thing, may not work
    x = tf.keras.layers.Dense(256, activation='sigmoid')(x)
    x = tf.keras.layers.Dense(32, activation='sigmoid')(x)
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    return x  # Shape `(batch_size, input_seq_len, d_model)`.

In [80]:
# Instantiate the encoder.
sample_encoder = Encoder(
    num_layers=8,
    d_model=512,
    num_attention_heads=8,
    dff=2048,
    input_vocab_size=8500)

# Set the test input.
sample_encoder_output = sample_encoder(encoded.to_tensor(),
                                       training=False)

# Print the shape.
print(encoded.to_tensor().shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

(5, 69)
(5, 69, 1)


In [100]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(129)),
    sample_encoder
])

In [101]:
model.summary()

Model: "sequential_35"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_8 (Encoder)         multiple                  88377344  
                                                                 
Total params: 88,377,344
Trainable params: 88,377,344
Non-trainable params: 0
_________________________________________________________________


In [102]:
[item for item in dir(model) if not item.startswith('_')]

['activity_regularizer',
 'add',
 'add_loss',
 'add_metric',
 'add_update',
 'add_variable',
 'add_weight',
 'build',
 'built',
 'call',
 'compile',
 'compiled_loss',
 'compiled_metrics',
 'compute_dtype',
 'compute_loss',
 'compute_mask',
 'compute_metrics',
 'compute_output_shape',
 'compute_output_signature',
 'count_params',
 'distribute_strategy',
 'dtype',
 'dtype_policy',
 'dynamic',
 'evaluate',
 'evaluate_generator',
 'finalize_state',
 'fit',
 'fit_generator',
 'from_config',
 'get_config',
 'get_input_at',
 'get_input_mask_at',
 'get_input_shape_at',
 'get_layer',
 'get_output_at',
 'get_output_mask_at',
 'get_output_shape_at',
 'get_weight_paths',
 'get_weights',
 'history',
 'inbound_nodes',
 'input',
 'input_mask',
 'input_names',
 'input_shape',
 'input_spec',
 'inputs',
 'layers',
 'load_weights',
 'losses',
 'make_predict_function',
 'make_test_function',
 'make_train_function',
 'metrics',
 'metrics_names',
 'name',
 'name_scope',
 'non_trainable_variables',
 'non_tra

In [98]:
model.input_shape

(None, 128)

In [99]:
tokenized_sentences.to_tensor().shape # iske batches bnane hain...

TensorShape([43012, 129])