# Get helper function

In [1]:
# Download helper functions script
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-06-11 19:21:20--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: 'helper_functions.py'

     0K ..........                                            100%  338K=0.03s

2024-06-11 19:21:21 (338 KB/s) - 'helper_functions.py' saved [10246/10246]



In [2]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves
from helper_functions import compare_historys

# Download the text dataset

In [3]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2024-06-11 19:23:07--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.27.27, 142.250.204.155, 172.217.27.59, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.27.27|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: 'nlp_getting_started.zip'

     0K .......... .......... .......... .......... ..........  8%  224K 2s
    50K .......... .......... .......... .......... .......... 16%  311K 2s
   100K .......... .......... .......... .......... .......... 25%  919K 1s
   150K .......... .......... .......... .......... .......... 33%  738K 1s
   200K .......... .......... .......... .......... .......... 42%  917K 1s
   250K .......... .......... .......... .......... .......... 50% 1.19M 1s
   300K .......... .......... .......... .......... .......... 59% 1.38M 0s
   350K .......... .......... .......... .

# Visualizing a text dataset

In [4]:
import pandas as pd


train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train_df_shuffled = train_df.sample(
    frac=1,
    random_state=42,
)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [6]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [8]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [9]:
# How many samples total?
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [10]:
# Let's visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5) # create random indexes not higher than the total number of samples
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 0 (not real disaster)
Text:
I'm battling monsters I'm pulling you out of the burning buildings and you say I'll give you anything but you never come through.

---

Target: 0 (not real disaster)
Text:
@PerkPearl that's just not on. I'd be traumatised are you OK? The car has gone and now for #GBBO and relax.....

---

Target: 1 (real disaster)
Text:
Stay vigilent. Civil liberties are under constant attack. #nativehuman #myreligion  https://t.co/WWu070Tjej

---

Target: 0 (not real disaster)
Text:
Businesses are deluged with invoices. Make yours stand out with colour or shape and it's likely to ris. togthe top of the pay' pile.

---

Target: 1 (real disaster)
Text:
Beat:B2 MOTOR VEHICLE COLLISION at N 35 ST / FREMONT AV N reported on 8/5/2015 6:52 PM Call# 15000270364

---



# Split data into training and validation sets

In [12]:
from sklearn.model_selection import train_test_split


# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42) # random state for reproducibility

In [13]:
# Check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [14]:
# View the first 10 training sentences and their labels
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

# Converting text into numbers (tokenization)

In [16]:
import tensorflow as tf
from keras.layers import TextVectorization


text_vectorizer = TextVectorization(max_tokens=None,  # how many words in the vocabulary (all of the different words in your text)
                                   standardize="lower_and_strip_punctuation",  # how to process text,
                                   split="whitespace",  # how to split tokens
                                   ngrams=None,  # createa group of n words
                                   output_mode='int',  # how to map tokens to numbers
                                   output_sequence_length=None)  # how long should the output sequence of tokens be?
                                   # pad_to_max_tokens=True)  # not valid if using max_tokens=None

In [17]:
# Find average number of tokens (words) in training Tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [18]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [19]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [20]:
# Create sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [21]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[4001,   14, 2423, 1655, 2062, 1681, 1563, 1814, 2578, 1094, 1112,
        1110,    0,    0,    0]], dtype=int64)>

In [22]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


# Create an Embedding using an Embedding Layer

In [23]:
from keras import layers


tf.random.set_seed(42)

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

embedding

<keras.layers.core.embedding.Embedding at 0x1467599fd90>

In [24]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
Tube strike = absolute pandemonium      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.02195776, -0.04408492, -0.00264193, ...,  0.01023369,
         -0.04462695,  0.00698755],
        [ 0.01419356,  0.00209508, -0.04668857, ..., -0.00961052,
          0.01354796,  0.01690483],
        [-0.01322901,  0.04277768, -0.04807333, ...,  0.02299713,
          0.01204659, -0.04559833],
        ...,
        [-0.0438178 ,  0.03353414, -0.03639407, ..., -0.04533163,
         -0.01515562,  0.03444636],
        [-0.0438178 ,  0.03353414, -0.03639407, ..., -0.04533163,
         -0.01515562,  0.03444636],
        [-0.0438178 ,  0.03353414, -0.03639407, ..., -0.04533163,
         -0.01515562,  0.03444636]]], dtype=float32)>

In [26]:
# Check out a single token's embedding
sample_embed[0][0]  # each token has the shape of 128 length vector

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.02195776, -0.04408492, -0.00264193,  0.01697295,  0.01030587,
       -0.010657  , -0.04144052,  0.0495186 , -0.04548081,  0.00380947,
        0.0004392 , -0.04304041,  0.00479056, -0.01241476, -0.03919909,
        0.04720372, -0.0290094 ,  0.00553279, -0.04732583, -0.04549764,
        0.00394911, -0.03393529, -0.02948035, -0.02874172,  0.03149453,
       -0.02743732,  0.03827704, -0.03615243, -0.02170496, -0.03993218,
       -0.02705944, -0.0067757 , -0.00732018,  0.04915169,  0.01471293,
        0.00813898,  0.01571507, -0.03922136,  0.01070917,  0.03185648,
        0.03497685, -0.03801596,  0.04931749,  0.03117741,  0.03102731,
        0.0008598 ,  0.04705645,  0.0147854 ,  0.01180649, -0.01200588,
        0.01181237, -0.00216386, -0.02633499, -0.04541038, -0.00519886,
        0.0102348 ,  0.02265454,  0.01546599, -0.00718049, -0.00076743,
       -0.03704584,  0.04150945,  0.04520968, -0.00886046, -0.02637147,
       -0.039361

# Modeling a text dataset

## Model 0: Naive Bayes

### Construct the model

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # convert word to number using tfidf
    ("clf", MultinomialNB())  # model the text
])

model_0.fit(train_sentences, train_labels)

In [29]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


### Creating an evaluation function for our model experiment
* Accuracy
* Precision
* Recall
* F1-score

In [30]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def calculate_results(y_true, y_pred):
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    return {
        "Accuracy": accuracy_score(y_true, y_pred) * 100,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

In [33]:
# Get baseline results
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'Accuracy': 79.26509186351706,
 'Precision': 0.8111390004213173,
 'Recall': 0.7926509186351706,
 'F1 Score': 0.7862189758049549}

## Model 1: A simple dense model

In [34]:
# Create tensorboard callback (need to create a new one for each model)
from helper_functions import create_tensorboard_callback

# Create directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [35]:
# Build model with the Functional API
from tensorflow.keras import layers


inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

In [36]:
# Compile model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [37]:
# Get a summary of the model
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [38]:
# Fit the model
model_1_history = model_1.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR, 
                                                                     experiment_name="simple_dense_model")])

Saving TensorBoard log files to: model_logs/simple_dense_model/20240612-204020
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
# Check the results
model_1.evaluate(val_sentences, val_labels)



[0.476077675819397, 0.787401556968689]

In [40]:
embedding.weights

[<tf.Variable 'embedding_1/embeddings:0' shape=(10000, 128) dtype=float32, numpy=
 array([[-0.02806053,  0.05429746, -0.01912697, ..., -0.02976452,
         -0.03476851,  0.05545815],
        [ 0.01847897, -0.04428495,  0.00932562, ..., -0.01920958,
         -0.00406261, -0.0316462 ],
        [-0.02286938, -0.0090783 ,  0.01303513, ..., -0.00271997,
         -0.01061927,  0.02704275],
        ...,
        [-0.01755588, -0.03382739, -0.04134742, ...,  0.00881166,
         -0.01017302,  0.03170576],
        [ 0.06069821, -0.01166783,  0.02188563, ...,  0.02277098,
         -0.04345196,  0.01859548],
        [ 0.07306253,  0.0521339 ,  0.02740375, ...,  0.0764282 ,
         -0.04888595,  0.09834759]], dtype=float32)>]

In [41]:
embed_weights = model_1.get_layer("embedding_1").get_weights()[0]
print(embed_weights.shape)

(10000, 128)


In [42]:
# Make predictions (these come back in the form of probabilities)
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[:10] # only print out the first 10 prediction probabilities



array([[0.40901214],
       [0.7461467 ],
       [0.9977737 ],
       [0.11054722],
       [0.10911109],
       [0.936977  ],
       [0.9150944 ],
       [0.99277014],
       [0.97000825],
       [0.27399358]], dtype=float32)

In [43]:
# Turn prediction probabilities into single-dimension tensor of floats
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs)) # squeeze removes single dimensions
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [44]:
# Calculate model_1 metrics
model_1_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_1_preds)
model_1_results

{'Accuracy': 78.74015748031496,
 'Precision': 0.7914920592553047,
 'Recall': 0.7874015748031497,
 'F1 Score': 0.7846966492209201}

In [45]:
# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  for key, value in baseline_results.items():
    print(f"Baseline {key}: {value:.2f}, New {key}: {new_model_results[key]:.2f}, Difference: {new_model_results[key]-value:.2f}")

compare_baseline_to_new_results(baseline_results=baseline_results, 
                                new_model_results=model_1_results)

Baseline Accuracy: 79.27, New Accuracy: 78.74, Difference: -0.52
Baseline Precision: 0.81, New Precision: 0.79, Difference: -0.02
Baseline Recall: 0.79, New Recall: 0.79, Difference: -0.01
Baseline F1 Score: 0.79, New F1 Score: 0.78, Difference: -0.00


### Exercise for model 1

In [50]:
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
# x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
x = layers.Flatten()(x)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1_excercise = tf.keras.Model(inputs, outputs, name="model_1_dense_excercise") # construct the model

In [51]:
# Compile model
model_1_excercise.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [52]:
model_1_excercise.summary()

Model: "model_1_dense_excercise"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense_2 (Dense)             (None, 1)                 1921      
                                                                 
Total params: 1,281,921
Trainable params: 1,281,921
Non-trainable params: 0
_________________________________

In [53]:
# Fit the model
model_1_history = model_1_excercise.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR, 
                                                                     experiment_name="simple_dense_model_excersise")])

Saving TensorBoard log files to: model_logs/simple_dense_model_excersise/20240612-204759
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Visualize learned embeddings

In [54]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [55]:
# Get the weight matrix of embedding layer 
# (these are the numerical patterns between the text in the training dataset the model has learned)
embed_weights = model_1.get_layer("embedding_1").get_weights()[0]
print(embed_weights.shape) # same size as vocab size and embedding_dim (each word is a embedding_dim size vector)

(10000, 128)


In [56]:
# # Code below is adapted from: https://www.tensorflow.org/tutorials/text/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk
import io

# Create output writers
out_v = io.open("embedding_vectors.tsv", "w", encoding="utf-8")
out_m = io.open("embedding_metadata.tsv", "w", encoding="utf-8")

# Write embedding vectors and words to file
for num, word in enumerate(words_in_vocab):
  if num == 0: 
     continue # skip padding token
  vec = embed_weights[num]
  out_m.write(word + "\n") # write words to file
  out_v.write("\t".join([str(x) for x in vec]) + "\n") # write corresponding word vector to file
out_v.close()
out_m.close()

# Download files locally to upload to Embedding Projector
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download("embedding_vectors.tsv")
  files.download("embedding_metadata.tsv")

# Recurrent Network

## Model 2: LSTM (Long-Short Term Memory)

In [62]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)


model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")


# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)


In [63]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [64]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [65]:
# Fit model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 
                                                                     "LSTM")])

Saving TensorBoard log files to: model_logs/LSTM/20240612-213502
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [66]:
# Make predictions on the validation dataset
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs.shape, model_2_pred_probs[:10] # view the first 10



((762, 1),
 array([[0.04785112],
        [0.75113183],
        [0.9987698 ],
        [0.07021218],
        [0.00588337],
        [0.99883825],
        [0.722011  ],
        [0.9995346 ],
        [0.99919516],
        [0.33259225]], dtype=float32))

In [67]:
# Round out predictions and reduce to 1-dimensional array
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [68]:
# Calculate LSTM model results
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

{'Accuracy': 75.59055118110236,
 'Precision': 0.7569182665945199,
 'Recall': 0.7559055118110236,
 'F1 Score': 0.753827541903972}

In [69]:
# Compare model 2 to baseline
compare_baseline_to_new_results(baseline_results, model_2_results)

Baseline Accuracy: 79.27, New Accuracy: 75.59, Difference: -3.67
Baseline Precision: 0.81, New Precision: 0.76, Difference: -0.05
Baseline Recall: 0.79, New Recall: 0.76, Difference: -0.04
Baseline F1 Score: 0.79, New F1 Score: 0.75, Difference: -0.03


## Model 3: GRU (Gated Recurrent Unit)

In [70]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)

model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_3")

# Build an RNN using the GRU cell
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
# x = layers.GRU(64, return_sequences=True) # stacking recurrent cells requires return_sequences=True
x = layers.GRU(64)(x) 
# x = layers.Dense(64, activation="relu")(x) # optional dense layer after GRU cell
outputs = layers.Dense(1, activation="sigmoid")(x)

model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [71]:
# Compile GRU model
model_3.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [72]:
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [73]:
# Fit model
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "GRU")])

Saving TensorBoard log files to: model_logs/GRU/20240612-214449
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [74]:
# Make predictions on the validation data
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs.shape, model_3_pred_probs[:10]



((762, 1),
 array([[0.27890548],
        [0.86471534],
        [0.9970368 ],
        [0.14751235],
        [0.01219409],
        [0.98965645],
        [0.8200518 ],
        [0.9962935 ],
        [0.99679273],
        [0.4353803 ]], dtype=float32))

In [75]:
# Convert prediction probabilities to prediction classes
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [76]:
# Calcuate model_3 results
model_3_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_3_preds)
model_3_results

{'Accuracy': 77.69028871391076,
 'Precision': 0.7771036524564976,
 'Recall': 0.7769028871391076,
 'F1 Score': 0.7757784765658782}

In [77]:
# Compare to baseline
compare_baseline_to_new_results(baseline_results, model_3_results)

Baseline Accuracy: 79.27, New Accuracy: 77.69, Difference: -1.57
Baseline Precision: 0.81, New Precision: 0.78, Difference: -0.03
Baseline Recall: 0.79, New Recall: 0.78, Difference: -0.02
Baseline F1 Score: 0.79, New F1 Score: 0.78, Difference: -0.01


## Model 4: Bidirectional RNN model

In [78]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)

model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_4")

# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [79]:
# Compile
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [80]:
model_4.summary()

Model: "model_4_Bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_4 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_6 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,3

In [81]:
# Fit the model (takes longer because of the bidirectional layers)
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "bidirectional_RNN")])

Saving TensorBoard log files to: model_logs/bidirectional_RNN/20240612-214828
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [82]:
# Make predictions with bidirectional RNN on the validation data
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[0.0584543 ],
       [0.9058534 ],
       [0.9991893 ],
       [0.17715803],
       [0.0059738 ],
       [0.99821585],
       [0.9757966 ],
       [0.99950767],
       [0.9995926 ],
       [0.3643317 ]], dtype=float32)

In [83]:
# Convert prediction probabilities to labels
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [84]:
# Calculate bidirectional RNN model results
model_4_results = calculate_results(val_labels, model_4_preds)
model_4_results

{'Accuracy': 76.24671916010499,
 'Precision': 0.7620868493595442,
 'Recall': 0.7624671916010499,
 'F1 Score': 0.7617981947844736}

In [85]:
# Check to see how the bidirectional model performs against the baseline
compare_baseline_to_new_results(baseline_results, model_4_results)

Baseline Accuracy: 79.27, New Accuracy: 76.25, Difference: -3.02
Baseline Precision: 0.81, New Precision: 0.76, Difference: -0.05
Baseline Recall: 0.79, New Recall: 0.76, Difference: -0.03
Baseline F1 Score: 0.79, New F1 Score: 0.76, Difference: -0.02


# Convolutional Neural Networks for Text

## Model 5: Conv1D

In [86]:
# Test out the embedding, 1D convolutional and max pooling
embedding_test = embedding(text_vectorizer(["this is a test sentence"])) # turn target sentence into embedding
conv_1d = layers.Conv1D(filters=32, kernel_size=5, activation="relu") # convolve over target sequence 5 words at a time
conv_1d_output = conv_1d(embedding_test) # pass embedding through 1D convolutional layer
max_pool = layers.GlobalMaxPool1D() 
max_pool_output = max_pool(conv_1d_output) # get the most important features
embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))

In [87]:
# See the outputs of each layer
embedding_test[:1], conv_1d_output[:1], max_pool_output[:1]

(<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
 array([[[-0.00880439,  0.03312613,  0.0045855 , ..., -0.05370551,
          -0.00727392, -0.00203434],
         [ 0.02922052,  0.00248175,  0.04676467, ...,  0.01209346,
           0.02807192,  0.00223805],
         [-0.02661892,  0.08548723,  0.0130639 , ...,  0.02278049,
          -0.04110915,  0.00594871],
         ...,
         [-0.02609645,  0.06355414, -0.02558475, ..., -0.02033889,
          -0.03947812,  0.0635902 ],
         [-0.02609645,  0.06355414, -0.02558475, ..., -0.02033889,
          -0.03947812,  0.0635902 ],
         [-0.02609645,  0.06355414, -0.02558475, ..., -0.02033889,
          -0.03947812,  0.0635902 ]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 11, 32), dtype=float32, numpy=
 array([[[0.        , 0.        , 0.09847955, 0.        , 0.00720844,
          0.12232445, 0.        , 0.        , 0.        , 0.03854788,
          0.05784045, 0.        , 0.05246978, 0.03624558, 0.        ,
          0.09893265, 0.

In [88]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)

model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_5")

# Create 1-dimensional convolutional layer to model sequences
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

# Compile Conv1D model
model_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of our 1D convolution model
model_5.summary()

Model: "model_5_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 32)            20512     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 32)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_7 (Dense)             (None, 1)              

In [89]:
# Fit the model
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 
                                                                     "Conv1D")])

Saving TensorBoard log files to: model_logs/Conv1D/20240612-215700
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [90]:
# Make predictions with model_5
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]



array([[0.5564974 ],
       [0.7489508 ],
       [0.99976486],
       [0.07544567],
       [0.01627248],
       [0.99562067],
       [0.93693984],
       [0.9976725 ],
       [0.9994978 ],
       [0.16003236]], dtype=float32)

In [91]:
# Convert model_5 prediction probabilities to labels
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [92]:
# Calculate model_5 evaluation metrics 
model_5_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_5_preds)
model_5_results

{'Accuracy': 78.4776902887139,
 'Precision': 0.7862392297712013,
 'Recall': 0.7847769028871391,
 'F1 Score': 0.7830611097687449}

In [93]:
# Calculate model_5 evaluation metrics 
model_5_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_5_preds)
model_5_results

{'Accuracy': 78.4776902887139,
 'Precision': 0.7862392297712013,
 'Recall': 0.7847769028871391,
 'F1 Score': 0.7830611097687449}

# Using pretrained Embeddings (transfer learning for NLP)

## Model 6: Tensorflow Hub Pretrained Sentence Encoder (Not working on yet)