In [17]:
import tensorflow as tf
import pandas as pd
import re
import string
from tensorflow.keras.utils import register_keras_serializable

In [2]:
dataset = pd.read_csv('IMDB Dataset.csv')
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def custom_standard(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, '[%s]' % re.escape(string.punctuation), '')

sequnece_length = 1500
vocab_size = 10000
split = int((len(dataset['sentiment'])) * 0.8)

train_data = dataset[: split]
val_data = dataset[split: ]

In [4]:
vector_layer = tf.keras.layers.TextVectorization(max_tokens = vocab_size, output_sequence_length = sequnece_length, standardize = custom_standard, output_mode = 'int')
vector_layer.adapt(dataset['review'])

In [5]:
def create_dataset(sen, label):
    label = tf.where(label == 'positive', 1, 0)
    sen = vector_layer(sen)

    return sen, label

In [6]:
train_ds = tf.data.Dataset.from_tensor_slices((list(train_data['review']), list(train_data['sentiment'])))
val_ds = tf.data.Dataset.from_tensor_slices((list(val_data['review']), list(val_data['sentiment'])))

train_ds = (
    train_ds.map(create_dataset, num_parallel_calls = tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    val_ds.map(create_dataset, num_parallel_calls = tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [7]:
for s, l in train_ds.take(1):
    sen = s[0, : ].numpy()
    lab = l[0]
    vocab = vector_layer.get_vocabulary()
    text = [vocab[i] for i in sen]
    text_tensor = tf.constant(text)
    print(tf.strings.reduce_join(text_tensor, separator = ' ').numpy().decode('utf-8'))
    print(lab)

one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and [UNK] scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or [UNK] this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the [UNK] given to the [UNK] maximum security state [UNK] it focuses mainly on [UNK] city an experimental section of the prison where all the cells have glass [UNK] and face [UNK] so [UNK] is not high on the agenda em city is home to [UNK] muslims [UNK] [UNK] christians italians irish and [UNK] [UNK] death stares dodgy [UNK] and shady [UNK] are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictures painted for mainstream audiences forget 

In [8]:
class Transformer(tf.keras.layers.Layer):
    def __init__(self, eb_dim, num_head, nue_num, rate = 0.1):
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads = num_head, key_dim = eb_dim)
        self.feed_fwd = tf.keras.Sequential([
            tf.keras.layers.Dense(nue_num, activation = 'relu'),
            tf.keras.layers.Dense(eb_dim)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs):
        att_out = self.att(inputs, inputs)
        att_out = self.dropout1(att_out)
        out = self.layernorm1(inputs + att_out)
        dense_out = self.feed_fwd(out)
        dense_out = self.dropout2(dense_out)
        return self.layernorm2(dense_out + out)

In [9]:
class embeding(tf.keras.layers.Layer):
    def __init__(self, max_len, vocab_size, eb_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = eb_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim = max_len, output_dim = eb_dim)

    def call(self, x):
        max_len = tf.keras.ops.shape(x)[-1]
        pos = tf.keras.ops.arange(start = 0, stop = max_len, step = 1)
        pos = self.pos_emb(pos)
        x = self.token_emb(x)

        return x + pos

In [10]:
inp = tf.keras.layers.Input(shape = (1500, ))
embeding_layer = embeding(1500, vocab_size, 32)
x = embeding_layer(inp)
transformer_blk = Transformer(32, 2, 31)
x = transformer_blk(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(20, activation = 'relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
out = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(inputs = inp, outputs = out)
model.summary()

In [12]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy']
)
history = model.fit(
    train_ds, batch_size = 32, epochs = 10, validation_data = val_ds
)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 64ms/step - accuracy: 0.5051 - loss: 0.6976 - val_accuracy: 0.4995 - val_loss: 0.6864
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 65ms/step - accuracy: 0.5035 - loss: 0.6866 - val_accuracy: 0.5070 - val_loss: 0.6742
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 66ms/step - accuracy: 0.5576 - loss: 0.6502 - val_accuracy: 0.8514 - val_loss: 0.3296
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 65ms/step - accuracy: 0.8702 - loss: 0.3140 - val_accuracy: 0.8859 - val_loss: 0.2652
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 69ms/step - accuracy: 0.9028 - loss: 0.2453 - val_accuracy: 0.8991 - val_loss: 0.2531
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 68ms/step - accuracy: 0.9181 - loss: 0.2144 - val_accuracy: 0.9004 - val_loss: 0.2506
E

In [13]:
prediction_model = tf.keras.Sequential([
    vector_layer,
    model,
    tf.keras.layers.Activation('sigmoid')
])

In [15]:
sen = tf.constant(["The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."])
print(prediction_model.predict(sen))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[[0.9980768 ]
 [0.00156766]
 [0.00142235]]


In [16]:
prediction_model.save('transformer_model.keras')

In [None]:
@register_keras_serializable()
def custom_standard(inp):
    lower = tf.strings.lower(inp)
    lower = tf.strings.regex_replace(lower, '<br />', ' ')
    return tf.strings.regex_replace(lower,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [None]:
model_loaded = tf.keras.models.load_model(r"D:\NLP\text_classifiaction\models\prediction_model.keras", custom_objects = {"custom_standard": custom_standard})