# Text classification with Transformer

**Author:** [Apoorv Nandan](https://twitter.com/NandanApoorv)<br>
**Date created:** 2020/05/10<br>
**Last modified:** 2024/01/18<br>
**Description:** Implement a Transformer block as a Keras layer and use it for text classification.

## Setup

In [1]:
import keras
from keras import ops
from keras import layers

## Implement a Transformer block as a layer

In [2]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

## Implement embedding layer

Two  embedding layers, one for tokens, one for token index (positions).

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

## Download and prepare dataset

In [4]:
import pandas as pd
import re
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk import PorterStemmer
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()

ps = PorterStemmer()

en_stopwords = set(stopwords.words("english"))
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in en_stopwords]
    return ' '.join(filtered_text)

df = pd.read_csv("./SMSCollection.txt", sep='\t')
df.columns=["type", "content"]
cleaned_text = []
for text in df.content:
    text  = text.lower()
    text  = re.sub('[^a-zA-Z!\?]', ' ', text)
    text  = re.sub('\s{2,}', ' ', text)
    text  = remove_stopwords(text)
    words = [ps.stem(word) for word in word_tokenize(text)]
    text  = TreebankWordDetokenizer().detokenize(words)
    cleaned_text.append(text)

df['content'] = cleaned_text

df.replace('', np.nan, inplace=True)
df.replace(' ', np.nan, inplace=True)
df.dropna(axis=0,inplace=True)
df.drop_duplicates(inplace=True)
df['type'] = lbl.fit_transform(df['type'])
df.head()

Unnamed: 0,type,content
0,0,ok lar joke wif u oni
1,1,free entri wkli comp win fa cup final tkt st m...
2,0,u dun say earli hor u c alreadi say
3,0,nah think goe usf live around though
4,1,freemsg hey darl week word back! like fun stil...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5062 entries, 0 to 5570
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   type     5062 non-null   int32 
 1   content  5062 non-null   object
dtypes: int32(1), object(1)
memory usage: 98.9+ KB


In [6]:
lbl.classes_

array(['ham', 'spam'], dtype=object)

In [7]:
maxlen = max([len(word_tokenize(i)) for i in list(df.content)])
maxlen

77

In [8]:
list(df.content)[0]

'ok lar joke wif u oni'

In [9]:
vocab_size = 20000  # Only consider the top 20k words

from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(df.content)


tokenizer_path = 'tokenizer.json'
tokenizer_json = tokenizer.to_json()
with open(tokenizer_path, 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

In [10]:
tokenized_content =pad_sequences(tokenizer.texts_to_sequences(df.content), maxlen=maxlen, padding='post', truncating='post')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(tokenized_content,list(df.type),test_size=0.1)

print(len(X_train), "Training sequences")
print(len(X_test), "Validation sequences")

4555 Training sequences
507 Validation sequences


## Create classifier model using transformer layer

Transformer layer outputs one vector for each time step of our input sequence.
Here, we take the mean across all time steps and
use a feed forward network on top of it to classify text.

In [12]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)




## Train and Evaluate

In [13]:
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history = model.fit(
    np.array(X_train), np.array(y_train), batch_size=50, epochs=5, validation_data=(np.array(X_test), np.array(y_test))
)

Epoch 1/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - accuracy: 0.8523 - loss: 0.3839 - val_accuracy: 0.8935 - val_loss: 0.2073
Epoch 2/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.9020 - loss: 0.2187 - val_accuracy: 0.9744 - val_loss: 0.0922
Epoch 3/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.9734 - loss: 0.0967 - val_accuracy: 0.9921 - val_loss: 0.0445
Epoch 4/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.9932 - loss: 0.0247 - val_accuracy: 0.9822 - val_loss: 0.0469
Epoch 5/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.9970 - loss: 0.0160 - val_accuracy: 0.9941 - val_loss: 0.0270


In [14]:
model.save('transformer.keras')

In [15]:
print(f'X_train shape: {np.array(X_train).shape}')
print(f'y_train shape: {np.array(y_train).shape}')
print(f'X_test shape: {np.array(X_test).shape}')
print(f'y_test shape: {np.array(y_test).shape}')


X_train shape: (4555, 77)
y_train shape: (4555,)
X_test shape: (507, 77)
y_test shape: (507,)


In [16]:
np.argmax(model(X_test)[0])

0

In [17]:
y_test[0]

0