In [1]:
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from tensorflow.keras.saving import register_keras_serializable
import pandas as pd
import re
import string

In [2]:
dataset = pd.read_csv(os.path.join('dataset', 'IMDB Dataset.csv'))
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
split = int(len(dataset['review']) * 0.8)
train_data = dataset[: split]
valid_data = dataset[split: ]
batch_size = 32
seed = 42
sequence_length = 1500
vocab_size = 10000

In [4]:
@register_keras_serializable()
def custom_standard(inp):
    lower = tf.strings.lower(inp)
    lower = tf.strings.regex_replace(lower, '<br />', ' ')
    return tf.strings.regex_replace(lower,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize = custom_standard,
    max_tokens = vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length
)

vectorize_layer.adapt(dataset['review'])

In [5]:
def create_data(inp, label):
    label = tf.where(label == 'positive', 1, 0)
    inp = vectorize_layer(inp)

    return inp, label

In [6]:
train_Data = tf.data.Dataset.from_tensor_slices(
    (list(train_data['review']), list(train_data['sentiment']))
)
valid_Data = tf.data.Dataset.from_tensor_slices(
    (list(valid_data['review']), list(list(valid_data['sentiment'])))
)

valid_Data = (
    valid_Data.map(create_data, num_parallel_calls = tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
    .padded_batch(batch_size = batch_size)
)
train_Data = (
    train_Data.map(create_data, num_parallel_calls = tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
    .padded_batch(batch_size = batch_size)
)

In [7]:
for batch in train_Data.take(1):
    text, label = batch[0][0], batch[1][1]
    # label = tf.strings.reduce_join(label).numpy().decode('utf-8')
    print(label)
    print(text)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([28  5  2 ...  0  0  0], shape=(1500,), dtype=int64)


In [8]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vectorize_layer.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy']
)

model.summary()

In [9]:
history = model.fit(train_Data, epochs=10, validation_data=valid_Data)

Epoch 1/10
[1m   6/1250[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24:13[0m 1s/step - accuracy: 0.5451 - loss: 0.6937

KeyboardInterrupt: 

In [10]:
prediction_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    tf.keras.layers.Activation('sigmoid')
])

prediction_model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy'])

In [9]:
sen = tf.constant(["The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."])
print(model.predict(sen))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[[0.64604044]
 [0.36157236]
 [0.37635726]]


In [6]:
@register_keras_serializable()
def custom_standard(inp):
    lower = tf.strings.lower(inp)
    lower = tf.strings.regex_replace(lower, '<br />', ' ')
    return tf.strings.regex_replace(lower,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [7]:
model = tf.keras.models.load_model(r"D:\NLP\text_classifiaction\models\prediction_model.keras", custom_objects = {"custom_standard": custom_standard})

  saveable.load_own_variables(weights_store.get(inner_path))
