In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [2]:
imdb_train, ds_info = tfds.load(name="imdb_reviews", split="train", 
                                with_info=True, as_supervised=True)
imdb_test = tfds.load(name="imdb_reviews", split="test", 
                      as_supervised=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteT2TUA9/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteT2TUA9/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteT2TUA9/imdb_reviews-unsupervised.tfrec…

Dataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [3]:
train_texts = []
train_labels = []
for example,label in imdb_train:
  train_texts.append(example.numpy().decode('utf-8'))
  train_labels.append(label.numpy())

In [4]:
test_texts = []
test_labels = []

for example,label in imdb_test:
  test_texts.append(example.numpy().decode('utf-8'))
  test_labels.append(label.numpy())

In [5]:
train_texts = np.array(train_texts)
train_labels = np.array(train_labels)

In [6]:
from sklearn.metrics import accuracy_score , precision_recall_fscore_support
def calculate_results(y_true,y_pred):
  model_accuracy = accuracy_score(y_true,y_pred) * 100
  model_prec,model_recall,model_f1, _ = precision_recall_fscore_support(y_true,y_pred,average="weighted")
  model_results = {
      "accuracy":model_accuracy,
      "precision":model_prec,
      "recall":model_recall,
      "f1":model_f1
  }
  return model_results

####NAIVE BAYES

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [8]:
model_nb = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('multinb',MultinomialNB())
])
model_nb.fit(train_texts,train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('multinb', MultinomialNB())])

In [9]:
y_train_pred = model_nb.predict(train_texts)
print("training scores",'-'*80)
print(calculate_results(train_labels,y_train_pred))
y_test_pred = model_nb.predict(test_texts)
print("testing scores",'-'*80)
print(calculate_results(test_labels,y_test_pred))

training scores --------------------------------------------------------------------------------
{'accuracy': 90.892, 'precision': 0.9096045205889325, 'recall': 0.90892, 'f1': 0.908881931454117}
testing scores --------------------------------------------------------------------------------
{'accuracy': 82.956, 'precision': 0.8343036261758116, 'recall': 0.82956, 'f1': 0.828953229782028}


###UTILS

In [10]:
import os
import datetime
def create_tf_callback(save_dir):
  dt = datetime.datetime.today().strftime("%Y-%b-%d-%H-%M-%S")
  checkpoint_dir = './training_checkpoints/' + save_dir + '/' + dt
  print(checkpoint_dir)
  checkpoint_prefix = os.path.join(checkpoint_dir,"checkpoint_epoch")
  checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True)
  return checkpoint_callback

In [11]:
MAX_LENGTH = 150
MAX_VOCAB_LENGTH = 50000
OUTPUT_DIM = 128

In [12]:
from tensorflow.keras.layers import TextVectorization
text_vectorizer = TextVectorization(output_mode="int",
                                    max_tokens=MAX_VOCAB_LENGTH,
                                    output_sequence_length=MAX_LENGTH)
text_vectorizer.adapt(train_texts)
vocabulary = text_vectorizer.get_vocabulary()

In [13]:
embedding = tf.keras.layers.Embedding(input_dim=MAX_VOCAB_LENGTH,input_length=MAX_LENGTH,output_dim=OUTPUT_DIM)

###Simple Dense Model

In [None]:
from tensorflow.keras import layers

In [None]:
inputs = layers.Input(shape=(1,),dtype="string")
text_vectors = text_vectorizer(inputs)
text_embeddings = embedding(text_vectors)
x = tf.keras.layers.Flatten()(text_embeddings)
outputs = tf.keras.layers.Dense(1,activation="sigmoid")(x)
model_simple_dense = tf.keras.Model(inputs,outputs,name="simple_dense_model")

In [None]:
model_simple_dense.summary()

Model: "simple_dense_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 150)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 150, 128)          6400000   
                                                                 
 flatten (Flatten)           (None, 19200)             0         
                                                                 
 dense (Dense)               (None, 1)                 19201     
                                                                 
Total params: 6,419,201
Trainable params: 6,419,201
Non-trainable params: 0
______________________________________

In [None]:
model_simple_dense.compile(loss="binary_crossentropy",optimizer=tf.keras.optimizers.Adam(),metrics=["accuracy"])

In [None]:
model_simple_dense.fit(x=train_texts,y=train_labels,epochs=1,callbacks=[create_tf_callback("simple_dense_model")])

./training_checkpoints/simple_dense_model/2022-Nov-17-11-50-00


<keras.callbacks.History at 0x7fd2341b7290>

In [None]:
y_simple_dense_preds_test = model_simple_dense.predict(test_texts)
y_simple_dense_preds_test = tf.squeeze(tf.round(y_simple_dense_preds_test),axis=1)



In [None]:
y_simple_dense_preds_train = model_simple_dense.predict(train_texts)
y_simple_dense_preds_train = tf.squeeze(tf.round(y_simple_dense_preds_train),axis=1)



In [None]:
print("training scores",'-'*80)
print(calculate_results(train_labels,y_simple_dense_preds_train))
print("testing scores",'-'*80)
print(calculate_results(test_labels,y_simple_dense_preds_test))

training scores --------------------------------------------------------------------------------
{'accuracy': 95.756, 'precision': 0.9580081327445896, 'recall': 0.95756, 'f1': 0.9575496162286057}
testing scores --------------------------------------------------------------------------------
{'accuracy': 84.576, 'precision': 0.8464209512210831, 'recall': 0.84576, 'f1': 0.8456863946323556}


###LSTM

In [None]:
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,),dtype="string")
vectors = text_vectorizer(inputs)
embeddings = embedding(vectors)
x = layers.Bidirectional(layers.LSTM(128))(embeddings)
outputs = layers.Dense(1,activation="sigmoid")(x)
model_lstm = tf.keras.Model(inputs,outputs)


In [None]:
model_lstm.compile(metrics=['accuracy'],loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam())

In [None]:
model_lstm.fit(x=train_texts,y=train_labels,epochs=5,callbacks=[create_tf_callback("lstm_model")])

./training_checkpoints/lstm_model/2022-Nov-17-11-56-25
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd237fddb10>

In [None]:
y_lstm_preds_test = model_lstm.predict(test_texts)
y_lstm_preds_test = tf.squeeze(tf.round(y_lstm_preds_test),axis=1)



In [None]:
y_lstm_preds_train = model_lstm.predict(train_texts)
y_lstm_preds_train = tf.squeeze(tf.round(y_lstm_preds_train),axis=1)



In [None]:
print("training scores",'-'*80)
print(calculate_results(train_labels,y_lstm_preds_train))
print("testing scores",'-'*80)
print(calculate_results(test_labels,y_lstm_preds_test))

training scores --------------------------------------------------------------------------------
{'accuracy': 99.896, 'precision': 0.99896127734087, 'recall': 0.99896, 'f1': 0.9989599993343997}
testing scores --------------------------------------------------------------------------------
{'accuracy': 80.67999999999999, 'precision': 0.8068430149721596, 'recall': 0.8068, 'f1': 0.8067932287982138}


###Universal Sentence Encoder

In [None]:
import tensorflow_hub as hub
from tensorflow.keras import layers
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",input_shape=[],dtype="string",name="UniversalSE")


In [None]:
model_use_encoder = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(128,activation="relu"),
    layers.Dense(1,activation="sigmoid")
],name="use_model")

In [None]:
model_use_encoder.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(),metrics=['accuracy'])

In [None]:
model_use_encoder.fit(x=train_texts,y=train_labels,epochs=5,callbacks=[create_tf_callback("use_encoder_model")])

./training_checkpoints/use_encoder_model/2022-Nov-17-12-13-54
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd1c767dbd0>

In [None]:
y_use_encoder_preds_test = model_use_encoder.predict(test_texts)
y_use_encoder_preds_test = tf.squeeze(tf.round(y_use_encoder_preds_test),axis=1)



In [None]:
y_use_encoder_preds_train = model_use_encoder.predict(train_texts)
y_use_encoder_preds_train = tf.squeeze(tf.round(y_use_encoder_preds_train),axis=1)



In [None]:
print("training scores",'-'*80)
print(calculate_results(train_labels,y_use_encoder_preds_train))
print("testing scores",'-'*80)
print(calculate_results(test_labels,y_use_encoder_preds_test))

training scores --------------------------------------------------------------------------------
{'accuracy': 87.66000000000001, 'precision': 0.877181960987756, 'recall': 0.8766, 'f1': 0.8765523825999192}
testing scores --------------------------------------------------------------------------------
{'accuracy': 85.87599999999999, 'precision': 0.859574709994918, 'recall': 0.85876, 'f1': 0.8586799506712582}


###Transfer Learning BERT

In [14]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 17.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 72.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [15]:
from transformers import BertTokenizer,TFBertForSequenceClassification

In [17]:
bert_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_name,
                                          add_special_tokens=True,
                                          do_lower_case=False,
                                          max_length=150,
                                          pad_to_max_length=True)

In [19]:
def bert_encoder(review):
  encoded = tokenizer.encode_plus(review,
                                  add_special_tokens=True,
                                  max_length=150,
                                  pad_to_max_length=True,
                                  return_attention_mask=True,
                                  return_token_type_ids=True,
                                  truncation=True)
  return encoded['input_ids'], encoded['token_type_ids'],encoded['attention_mask']

In [20]:
bert_train = np.array([bert_encoder(text) for text in train_texts])
bert_label = tf.keras.utils.to_categorical(train_labels,num_classes=2)



In [22]:
tr_reviews, tr_segments, tr_masks = np.split(bert_train, 3, axis=1)

In [23]:
tr_reviews = tr_reviews.squeeze()
tr_segments = tr_segments.squeeze()
tr_masks = tr_masks.squeeze()

In [24]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
  return {"input_ids": input_ids,
          "attention_mask": attention_masks,
          "token_type_ids": token_type_ids},y

In [25]:
train_ds = tf.data.Dataset.from_tensor_slices((tr_reviews, tr_masks, tr_segments, bert_label)).map(example_to_features).shuffle(100).batch(16)

In [26]:
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [28]:
print("Fine-tuning BERT on IMDB")
bert_history = bert_model.fit(train_ds, epochs=5)

Fine-tuning BERT on IMDB
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
bert_model.save("/content/drive/MyDrive/NLP/sentiment_analysis/imdb_sentiment_analysis/saved_model_tf",save_format="tf")



In [30]:
bert_test = np.array([bert_encoder(r) for r in test_texts])
bert_test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=2)



In [32]:
ts_reviews, ts_segments, ts_masks = np.split(bert_test, 3, axis=1)
ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()

test_ds = tf.data.Dataset.from_tensor_slices((ts_reviews, ts_masks, ts_segments, bert_test_labels)).map(example_to_features).shuffle(100).batch(16)

In [33]:
bert_model.evaluate(test_ds)



[0.4671766757965088, 0.8831599950790405]

In [82]:
bert_test_labels

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [118]:
from IPython.core.display import display, HTML
def predict(input_sentence):
  sentence_texts = [input_sentence]
  sentence = np.array([bert_encoder(text) for text in sentence_texts])
  label = tf.keras.utils.to_categorical([0],num_classes=2)
  p_reviews, p_segments, p_masks = np.split(sentence, 3, axis=1)
  p_reviews = np.array([p_reviews.squeeze()])
  p_segments = np.array([p_segments.squeeze()])
  p_masks = np.array([p_masks.squeeze()])
  test_p = tf.data.Dataset.from_tensor_slices((p_reviews, p_masks, p_segments, label)).map(example_to_features).shuffle(100).batch(16)
  pred = bert_model.predict(test_p)["logits"][0] 
  if np.argmax(pred) == 0:
    prediction = "negative"
    color = "red"
  else:
    prediction = "positive"
    color = "green"
  display(HTML(f'<p>review:{input_sentence}</p><p>prediction is: <strong style="color:{color};">{prediction}</strong></p>'))





In [119]:
predict("This film has got to be the epitome of terrible writing and should be a classroom example of 'what not to do' when writing a screenplay. Why would Joshua take on (clearly) amateur writer Adam Gaines script is beyond me. Even his good directing and excellent cinematography could not save this disaster.")





In [120]:
predict("It is no wonder that the film has such a high rating, it is quite literally breathtaking. What can I say that hasn't said before? Not much, it's the story, the acting, the premise, but most of all, this movie is about how it makes you feel. Sometimes you watch a film, and can't remember it days later, this film loves with you, once you've seen it, you don't forget.The ultimate story of friendship, of hope, and of life, and overcoming adversity.I understand why so many class this as the best film of all time, it isn't mine, but I get it. If you haven't seen it, or haven't seen it for some time, you need to watch it, it's amazing. 10/10.")



In [121]:
predict("awful. a dreadful disgrace. the film is the most cliched in cinematic history and is worthy of no respect. even worse is how it is so flawless yet it still is of no quality. i do not know how this film can be revered over others such as psycho, apocalypse now, raging bull, easy rider, the godfather, the killing fields, some like it hot and casablanca amongst others. shockingly horrendous")

