## Recreating the following for classification problem
https://www.tensorflow.org/text/tutorials/transformer

Need tensorflow-text for using the BERT model tokenizer

In [1]:
# Install the nightly version of TensorFlow to use the improved
# masking support for `tf.keras.layers.MultiHeadAttention`.
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q -U tensorflow-text tensorflow

[K     |████████████████████████████████| 5.9 MB 18.8 MB/s 
[K     |████████████████████████████████| 578.0 MB 16 kB/s 
[K     |████████████████████████████████| 438 kB 70.8 MB/s 
[K     |████████████████████████████████| 1.7 MB 51.3 MB/s 
[K     |████████████████████████████████| 5.9 MB 57.6 MB/s 
[?25h

Import the necessary modules:

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_text
from sklearn.utils import shuffle 

Doing necessary data preprocessing

In [3]:
os.listdir('.')

['.config', 'twitter_validation.csv', 'twitter_training.csv', 'sample_data']

In [4]:
training_data_loc = "./twitter_training.csv"
val_data_loc = "./twitter_validation.csv"

In [5]:
df = pd.read_csv(training_data_loc)
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [6]:
df = df[df.columns[2:]]

In [7]:
df = df.rename(columns={"Positive":"Sentiment", "im getting on borderlands and i will murder you all ,":"sentence"})

In [8]:
df['Sentiment'].value_counts()

Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: Sentiment, dtype: int64

In [9]:
df = df[df.apply(lambda x: (x["Sentiment"] == "Positive") or (x["Sentiment"] == "Negative"), axis=1)]

In [10]:
df['Sentiment'].value_counts()

Negative    22542
Positive    20831
Name: Sentiment, dtype: int64

In [11]:
def index_helper(x):
  if x == "Positive":
    return 1
  # if x == "Neutral":
  #   return 1
  return 2

In [12]:
df['Sentiment'] = df['Sentiment'].apply(lambda x: index_helper(x))

In [13]:
df = shuffle(df)

df

Unnamed: 0,Sentiment,sentence
30725,2,
73826,2,Fuck Microsoft and screw Nvidia I’m losing my ...
37897,1,Nothing has united the Hearthstone community m...
7840,1,K I does this skin
69801,1,THE WOW.
...,...,...
12358,1,<unk> i got fucking shoes bud.
20224,1,<unk>
33965,1,Both look great
42352,2,


In [14]:
df.isna().value_counts()

Sentiment  sentence
False      False       43012
           True          361
dtype: int64

In [15]:
df.dropna(inplace=True) # dropping empty sentences

In [17]:
df.to_csv("preprocessed_training_data.csv")

In [19]:
labels = df['Sentiment']
features = df['sentence']

## Tokenizer
using subword tokenizer implementation from Bert model

In [18]:
model_name = 'ted_hrlr_translate_pt_en_converter'
tf.keras.utils.get_file(
    f'{model_name}.zip',
    f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',
    cache_dir='.', cache_subdir='', extract=True
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip


'./ted_hrlr_translate_pt_en_converter.zip'

In [20]:
tokenizers = tf.saved_model.load(model_name)

The `tf.saved_model` contains two text tokenizers, one for English and one for Portuguese. We just need the English one

In [21]:
[item for item in dir(tokenizers.en) if not item.startswith('_')]

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

# **Tokenizer**
Cannot prepare batches as required. I don't know how to fix it.

In [22]:
MAX_TOKENS=128
def prepare_batch(en): # not actually preparing batches
    en = tokenizers.en.tokenize(en)
    en = en[:, :128] # pads with `.` if the sentence is less than 128 words and strips if more than 128 words
    # print(en)
    # en_inputs = en[:, :-1].to_tensor()
    return en

In [23]:
tokenized_sentences = prepare_batch(features) # batch_size, 128

In [24]:
new_features = tokenized_sentences.to_tensor()
new_features

<tf.Tensor: shape=(43012, 128), dtype=int64, numpy=
array([[   2,    3,    0, ...,    0,    0,    0],
       [   2,   42, 2713, ...,    0,    0,    0],
       [   2,  456,  144, ...,    0,    0,    0],
       ...,
       [   2,    1, 6363, ...,    0,    0,    0],
       [   2,  421,  176, ...,    0,    0,    0],
       [   2,   42, 2713, ...,    0,    0,    0]])>

## Copied code 
we need to change it up a bit so we can use same definitions for our own classification problem

### Define the Embedding and Positional encoding

In [25]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)
  
  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

In [26]:
temp_encoding = positional_encoding(length=2048, depth=512)
temp_encoding

<tf.Tensor: shape=(2048, 512), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.84147096,  0.8218562 ,  0.8019618 , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.9092974 ,  0.9364147 ,  0.95814437, ...,  1.        ,
         1.        ,  1.        ],
       ...,
       [ 0.17589758, -0.18608274, -0.7070546 , ...,  0.9741639 ,
         0.97595036,  0.97761387],
       [-0.7333133 ,  0.7014913 ,  0.1447375 , ...,  0.9741387 ,
         0.97592694,  0.97759205],
       [-0.9683193 ,  0.98535496,  0.8799798 , ...,  0.9741135 ,
         0.9759035 ,  0.9775702 ]], dtype=float32)>

In [27]:
temp_encoding[tf.newaxis, :128, :].shape

TensorShape([1, 128, 512])

In [42]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x # (batch_size, length-of-signal, 512)


### random junk

In [None]:
tokenizers.en.get_vocab_size()

<tf.Tensor: shape=(), dtype=int32, numpy=7010>

In [None]:
temp_layer = tf.keras.layers.Embedding(7010, 512, mask_zero=True)
# help(temp_layer)

In [None]:
temp_model = tf.keras.Sequential()
temp_layer = tf.keras.layers.Embedding(1000, 64, input_length=10, mask_zero=True)
temp_model.add(temp_layer)
# The model will take as input an integer matrix of size (batch,
# input_length), and the largest integer (i.e. word index) in the input
# should be no larger than 999 (vocabulary size).
# Now model.output_shape is (None, 10, 64), where `None` is the batch
# dimension.
input_array = np.random.randint(1000, size=(1, 10))
temp_model.compile('rmsprop', 'mse')
output_array = temp_model.predict(input_array)
# print(output_array)
print(output_array.shape)
#(1, 10, 64)

(1, 10, 64)


In [None]:
# print(temp_layer.compute_mask(input_array, mask_zero=True)) # would not work as this is just for forwarding the compute_mask

In [None]:
embed_en = PositionalEmbedding(vocab_size=tokenizers.en.get_vocab_size(), d_model=512)
embed_en

<__main__.PositionalEmbedding at 0x7f5b1c16c910>

### Define the feed forward network

Define a function for the point-wise feed-forward network that you'll use later.

The network consists of two linear layers (`tf.keras.layers.Dense`) with a ReLU activation in-between:

In [43]:
def point_wise_feed_forward_network(
  d_model, # Input/output dimensionality.
  dff # Inner-layer dimensionality.
  ):

  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # Shape `(batch_size, seq_len, dff)`.
      tf.keras.layers.Dense(d_model)  # Shape `(batch_size, seq_len, d_model)`.
  ])

### Define the encoder layer

In [63]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*,
               d_model, # Input/output dimensionality.
               num_attention_heads,
               dff, # Inner-layer dimensionality.
               dropout_rate=0.1
               ):
    super().__init__()


    # Multi-head self-attention.
    self.mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_attention_heads,
        key_dim=d_model, # Size of each attention head for query Q and key K.
        dropout=dropout_rate,
        )
    # Point-wise feed-forward network.
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    # Layer normalization.
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    # Dropout for the point-wise feed-forward network.
    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask=None):

    # A boolean mask.
    # if mask is not None:
    #   mask1 = mask[:, :, None]
    #   mask2 = mask[:, None, :]
    #   attention_mask = mask1 & mask2
    # else:
    #   attention_mask = None
    # attention_mask = None

    # Multi-head self-attention output (`tf.keras.layers.MultiHeadAttention `).
    attn_output = self.mha(
        query=x,  # Query Q tensor.
        value=x,  # Value V tensor.
        key=x,  # Key K tensor.
        # attention_mask=attention_mask, # A boolean mask that prevents attention to certain positions.
        training=training, # A boolean indicating whether the layer should behave in training mode.
        )

    # Multi-head self-attention output after layer normalization and a residual/skip connection.
    out1 = self.layernorm1(x + attn_output)  # Shape `(batch_size, input_seq_len, d_model)`

    # Point-wise feed-forward network output.
    ffn_output = self.ffn(out1)  # Shape `(batch_size, input_seq_len, d_model)`
    ffn_output = self.dropout1(ffn_output, training=training)
    # Point-wise feed-forward network output after layer normalization and a residual skip connection.
    out2 = self.layernorm2(out1 + ffn_output)  # Shape `(batch_size, input_seq_len, d_model)`.

    return out2

In [64]:
sample_encoder_layer = EncoderLayer(d_model=512, num_attention_heads=8, dff=2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((2, 3, 512)), training=False, mask=None)

# Print the shape.
print(sample_encoder_layer_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

(2, 3, 512)


### Define the encoder

In [71]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self,
               *,
               num_layers,
               d_model, # Input/output dimensionality.
               num_attention_heads,
               dff, # Inner-layer dimensionality.
               input_vocab_size, # Input vocabulary size.
               dropout_rate=0.1
               ):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # Embeddings + Positional encoding
    self.pos_embedding = PositionalEmbedding(input_vocab_size, d_model)

    # Encoder layers.
    self.enc_layers = [
        EncoderLayer(
          d_model=d_model,
          num_attention_heads=num_attention_heads,
          dff=dff,
          dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    # Dropout.
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.flatten_layer = tf.keras.layers.GlobalAveragePooling1D()
    self.dense_layer_1 = tf.keras.layers.Dense(256, activation='relu')
    self.dense_layer_2 = tf.keras.layers.Dense(32, activation='relu')
    self.final_layer = tf.keras.layers.Dense(1, activation='sigmoid')

  # Masking.
  # def compute_mask(self, x, previous_mask=None):
  #   return self.pos_embedding.compute_mask(x, previous_mask)

  def call(self, x, training):
    seq_len = tf.shape(x)[1]

    # Sum up embeddings and positional encoding.
    # mask = self.compute_mask(x)
    x = self.pos_embedding(x)  # Shape `(batch_size, input_seq_len, d_model)`.
    # Add dropout.
    x = self.dropout(x, training=training)
    # N encoder layers.
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training) #, mask)
    # experimental thing, may not work
    x = self.flatten_layer(x)
    x = self.dense_layer_1(x)
    x = self.dense_layer_2(x)
    # x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = self.final_layer(x)
    return x  # Shape `(batch_size, input_seq_len, d_model)`.

In [72]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(128)),
    Encoder(num_layers = 4, d_model = 128, dff = 512, num_attention_heads = 8, input_vocab_size=tokenizers.en.get_vocab_size())
])

In [73]:
input = np.random.rand(5, 128)
# print(input.shape)
# target = tf.constant([[1,2,3, 0]])
print(input.shape)
x = model((input))
x.shape

(5, 128)


TensorShape([5, 1])

In [74]:
model.summary()

Model: "sequential_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_5 (Encoder)         (None, 1)                 3577409   
                                                                 
Total params: 3,577,409
Trainable params: 3,577,409
Non-trainable params: 0
_________________________________________________________________


In [None]:
def custom_loss_func(label, pred):
  print(label.shape, pred.shape)
  return 1.0

In [None]:
loss = tf.keras.losses.BinaryCrossentropy(axis=1)


In [None]:
y_true = [[0], [1], [0], [0]]
y_pred = [[0.6], [0.51], [0.94], [0.8]]

In [None]:
loss(y_true, y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=1.5031203>

In [83]:
# compiling the model
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(), 
    metrics=["accuracy"]
)

In [81]:
new_features.shape, labels.shape

(TensorShape([43012, 128]), (43012,))

In [77]:
new_labels = (labels -1).to_numpy().reshape(-1, 1)

In [78]:
new_labels.shape

(43012, 1)

### Training

In [85]:
# error!
history = model.fit(
    x=new_features,
    y=new_labels,
    epochs=20,
    batch_size=256,
    validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
  3/135 [..............................] - ETA: 50s - loss: nan - accuracy: 0.4792

KeyboardInterrupt: ignored

# Another copied code

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = PositionalEmbedding(vocab_size=vocab_size, d_model=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [None]:
maxlen = 128
vocab_size = tokenizers.en.get_vocab_size()
embed_dim = 512  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 256  # Hidden layer size in feed forward network inside transformer

inputs = tf.keras.layers.Input(shape=(maxlen,))
embedding_layer = PositionalEmbedding(vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(20, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(2, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
new_features

<tf.Tensor: shape=(43012, 128), dtype=int64, numpy=
array([[   2,   54, 3562, ...,    0,    0,    0],
       [   2,   81,   80, ...,    0,    0,    0],
       [   2,   55,  631, ...,    0,    0,    0],
       ...,
       [   2,   76,    9, ...,    0,    0,    0],
       [   2,   81,   80, ...,    0,    0,    0],
       [   2, 1186, 2367, ...,    0,    0,    0]])>

In [None]:
labels.shape

(43012,)

In [None]:
history = model.fit(
    new_features, labels, batch_size=32, epochs=4, validation_split=0.2
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# model is shit

In [None]:
df = pd.read_csv(val_data_loc)
df

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [None]:
df = df[df.columns[2:]]

In [None]:
df = df.rename(columns={"Irrelevant":"Sentiment", "I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣":"sentence"})
df

Unnamed: 0,Sentiment,sentence
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...
994,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,Positive,Today sucked so it’s time to drink wine n play...
997,Positive,Bought a fraction of Microsoft today. Small wins.


In [None]:
df = df[df.apply(lambda x: (x["Sentiment"] == "Positive") or (x["Sentiment"] == "Negative"), axis=1)]

In [None]:
df['Sentiment'] = df['Sentiment'].apply(lambda x: index_helper(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df.isna().value_counts()

Sentiment  sentence
False      False       543
dtype: int64

In [None]:
labels = df['Sentiment']
features = df['sentence']

In [None]:
test_features = prepare_batch(features)
test_features = test_features.to_tensor()
test_features

<tf.Tensor: shape=(543, 128), dtype=int64, numpy=
array([[   2,   31, 5759, ...,    0,    0,    0],
       [   2,   39,   88, ...,    0,    0,    0],
       [   2, 2265,   31, ...,    0,    0,    0],
       ...,
       [   2,  205,   92, ...,    0,    0,    0],
       [   2,  208,   55, ...,    0,    0,    0],
       [   2, 2013,   37, ...,    0,    0,    0]])>

In [None]:
results = model.evaluate(x=test_features, y=labels)



In [None]:
tokenizers.en.detokenize(test_features[:3].numpy())

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'@ microsoft why do i pay for word when it functions so poorly on my @ samsungus chromebook ? [UNK]',
       b"csgo matchmaking is so full of closet hacking , it ' s a truly awful game .",
       b'hi @ eahelp i \xe2\x80\x99 ve had madeleine mccann in my cellar for the past 13 years and the little sneaky thing just escaped whilst i was loading up some fifa points , she took my card and i \xe2\x80\x99 m having to use my paypal account but it isn \xe2\x80\x99 t working , can you help me resolve it please ?'],
      dtype=object)>

In [None]:
model(test_features[:3])

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[0.52231073, 0.47768924],
       [0.52231073, 0.47768924],
       [0.52231073, 0.47768924]], dtype=float32)>