# Natural Language Processing

In [1]:
# Importing TF and checking the version
import tensorflow as tf

print(tf.__version__)

2.10.1


In [2]:
# Importing helper functions
from DanielBourke_HelperFunctions import create_tensorboard_callback, plot_loss_curves, compare_historys

### Analysing text dataset

In [3]:
# Loading data
import pandas as pd

train_df = pd.read_csv("NLP_text/train.csv")
test_df = pd.read_csv("NLP_text/test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Shuffling training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
# Checking test dataframe
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# Checking number of training records
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
# Checking total number of samples
len(train_df), len(test_df)

(7613, 3263)

In [8]:
# Visualising random samples
import random

random_index = random.randint(0, len(train_df) - 5)

for row in train_df_shuffled[["text", "target"]][random_index : random_index + 5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}")
    print("---")

Target: 1 (real disaster)
Text:
Sad that biker beatdown derailed his pro-democracy work as @NYPDnews undercover: http://t.co/iHHRKG4V1S. http://t.co/aryU5qNgJJ
---
Target: 0 (not real disaster)
Text:
#BBShelli seems pretty sure she's the one that's going to stay! #BB17
---
Target: 0 (not real disaster)
Text:
@engineshed Great atmosphere at the British Lion gig tonight. Hearing is wrecked. http://t.co/oMNBAtJEAO
---
Target: 1 (real disaster)
Text:
The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/aueZxZA5ak
---
Target: 1 (real disaster)
Text:
24 killed in two simultaneous rail crash as acute floods derail the two trains #India #mumbai... http://t.co/b0ZwI0qPTU
---


### Creating validation data

In [9]:
# Splitting dataset
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1, # Allocating 10% to validation data
    random_state=42
)

In [10]:
# Checking dataset length
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [11]:
# Checking the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

### Converting text to numbers

In [12]:
# Using text vectorisation
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False
)

In [13]:
# Finding the average number of tokens
round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))

15

In [14]:
# Setting up text vectorisation variables
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)

In [15]:
# Fitting text vectorisation to the training dataset
text_vectorizer.adapt(train_sentences)

In [16]:
# Create a sample sentnence and tokenise it
sample_sentence = "There is a flood in my street"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [17]:
# Tokenising random sentence from the training set
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n{random_sentence},\ntokenised version:\n{text_vectorizer([random_sentence])}"
)

Original text:
FAAN orders evacuation of abandoned aircraft at MMA: FAAN noted that the action had become necessary due to re... http://t.co/ZUqgvJnEQA,
tokenised version:
[[1679 1268  245    6 1441  661   17 2041 1679 5022   16    2  866   94
   791]]


In [18]:
# Getting unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]

len(words_in_vocab), top_5_words, bottom_5_words

(10000,
 ['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

### Creating Embedding layer

In [19]:
# Defining the layer
from tensorflow.keras import layers

embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length
)
embedding

<keras.layers.core.embedding.Embedding at 0x26899a724f0>

In [20]:
# Get a random sentence
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n{random_sentence},\nembedded version:"
)

# Embed random sentence
embed_sentence = embedding(text_vectorizer([random_sentence]))
embed_sentence

Original text:
The Martyrs Who Kept Udhampur Terrorists at Bay Averted a Massacre: It was two youngÛ_ http://t.co/nux5XfPV2d SPSå¨,
embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04659598, -0.01593012, -0.04225612, ...,  0.02433849,
         -0.04441274, -0.0047609 ],
        [-0.0398303 ,  0.0433728 ,  0.01470615, ...,  0.02150333,
          0.04934101,  0.04634135],
        [-0.02800899, -0.01728942,  0.02363263, ..., -0.01828502,
          0.01993917,  0.00403327],
        ...,
        [ 0.02714442,  0.0467861 , -0.03147028, ...,  0.03364256,
         -0.01096548,  0.04719095],
        [-0.00249439, -0.04945217,  0.03234044, ...,  0.00763072,
         -0.02132593, -0.03810417],
        [ 0.0090407 ,  0.02881757,  0.04324982, ...,  0.02212656,
          0.00751988, -0.02301532]]], dtype=float32)>

In [21]:
# Checking single token's embedding
embed_sentence[0][0], embed_sentence[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.04659598, -0.01593012, -0.04225612,  0.00725685,  0.04245068,
        -0.03796609, -0.04970623,  0.02677557,  0.04829607, -0.02060782,
        -0.02805123, -0.01698004,  0.03496093, -0.01372444,  0.04992953,
         0.0311836 , -0.02447726,  0.04033473, -0.04035126, -0.02185392,
         0.00309285, -0.02325898,  0.02069867,  0.04724659,  0.02922959,
        -0.0373191 , -0.04904573,  0.00363798,  0.03871217,  0.02440611,
        -0.03709758,  0.03995449, -0.0329757 ,  0.03792864, -0.02959371,
         0.02402164, -0.03080671, -0.02450016, -0.01905632,  0.0023818 ,
         0.01369375,  0.02132415, -0.03098136,  0.03727745,  0.01019982,
         0.00921239,  0.04298681,  0.01907262, -0.02483143, -0.00242788,
         0.04751453, -0.02078112,  0.02486658,  0.04702964,  0.02970495,
        -0.03114016, -0.02835112,  0.00191686,  0.0052084 , -0.02857051,
         0.02991304, -0.0223396 , -0.01663087,  0.0491769 ,  0.02129206,
  

### Building base model

In [22]:
# Using SKLearn to build base model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenisation and modelling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # Convert words to numbers
    ("clf", MultinomialNB()) # Model the text
])

# Fit the pipeline to training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [23]:
# Evaluating baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Achieved accuracy: {baseline_score * 100:.2f}%")

Achieved accuracy: 79.27%


In [24]:
# Making predictions
baseline_predictions = model_0.predict(val_sentences)
baseline_predictions[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

### Function to evaluate model performance

In [25]:
# Importing SKLearn functions
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Developing function to evaluate accuracy, precision, recall and F1 scor
def calculate_results(y_true, y_pred):
    """
    Evaluate binary classification model
    """
    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and F1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1": model_f1
    }
    return model_results

In [26]:
# Getting baseline results
baseline_results = calculate_results(
    y_true=val_labels,
    y_pred=baseline_predictions
)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

### Model 1: a simple dense model

In [27]:
# Creating tensorboard callback
from DanielBourke_HelperFunctions import create_tensorboard_callback

# Creating log directory
SAVE_DIR = "model_logs"

In [28]:
# Building model with Functional API
from tensorflow.keras import layers

inputs = layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs) # Turn the input text into numbers
x = embedding(x) # Create embedding of numberised inputs
x = layers.GlobalAveragePooling1D()(x) # Condence the feature vector
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [29]:
# Getting model summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [30]:
# Compiling model
model_1.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [31]:
# Fitting the model
history_1 = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_1_dense"
    )]
)

Saving TensorBoard log files to: model_logs/model_1_dense/20230113-231416
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Checking the results
model_1.evaluate(val_sentences, val_labels)



[0.48084497451782227, 0.787401556968689]

In [33]:
# Making predictions
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape



(762, 1)

In [34]:
# First 10 predictions
model_1_pred_probs[:10]

array([[0.40274784],
       [0.7934085 ],
       [0.99762815],
       [0.07621262],
       [0.11843665],
       [0.9315513 ],
       [0.90787256],
       [0.9928179 ],
       [0.9659077 ],
       [0.18009627]], dtype=float32)

In [35]:
# Converting model predictions to label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [36]:
# Calculating model 1 results
model_1_results = calculate_results(
    y_true=val_labels,
    y_pred=model_1_preds
)
model_1_results

{'accuracy': 78.74015748031496,
 'precision': 0.7932296029485675,
 'recall': 0.7874015748031497,
 'f1': 0.7841130596930417}

In [37]:
# Retrieving baseline results
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [38]:
# Comparing model results
import numpy as np

np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

### Visualising learned embeddings

In [39]:
# Getting vocabulary from the text vectorisation layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [40]:
# Checking model 1 details
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [41]:
# Getting the weights
embed_weights = model_1.get_layer("embedding").get_weights()
embed_weights, embed_weights[0].shape

([array([[-0.01260132, -0.02912213, -0.01750636, ..., -0.00884185,
           0.05371519,  0.04908132],
         [-0.0499537 ,  0.04859574,  0.0076521 , ...,  0.01239407,
           0.06055391,  0.05562896],
         [-0.05615652, -0.00983749, -0.05030296, ...,  0.01532918,
          -0.03331523,  0.00353365],
         ...,
         [-0.02765653,  0.01126231, -0.00529394, ...,  0.00481617,
           0.04849471,  0.04487426],
         [-0.06867731,  0.08982918, -0.01022235, ..., -0.00427938,
           0.06332158,  0.01206129],
         [-0.0603082 ,  0.06891461, -0.0374227 , ..., -0.04026163,
           0.09135257,  0.08985952]], dtype=float32)],
 (10000, 128))

In [42]:
# Creating embedding files (sample from TF Word Embedding documentation)
import io

out_v = io.open("vectors.tsv", "w", encoding="utf-8")
out_m = io.open("metadata.tsv", "w", encoding="utf-8")

for index, word in enumerate(words_in_vocab):
    if index == 0:
        continue # Skip 0, as it's a padding
    vec = embed_weights[0][index]
    out_v.write("\t".join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

out_v.close()
out_m.close()

Generated vectors.tsv and metadata.tsv can be uploaded into Tensorflow Embedding Projector tool at https://projector.tensorflow.org/ to visualise weights in 3D space.

### Model 2 - RNN (Recurrent Neural Network) with LSTM (Long Short-Term Memory)

In [43]:
# CReating an RNN model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
# x = layers.LSTM(units=64, return_sequences=True)(x) # Sequences required when you stack LSTM layers
# print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
# x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)


In [44]:
# Checking model's summary
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [45]:
# Compiling the model
model_2.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [46]:
# Fittting the model
history_2 = model_2.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_2_LSTM"
    )]
)

Saving TensorBoard log files to: model_logs/model_2_LSTM/20230113-231447
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
# Meking predictions
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[7.3442520e-03],
       [8.8728398e-01],
       [9.9965590e-01],
       [3.2201298e-02],
       [2.6737017e-04],
       [9.9846673e-01],
       [8.4208000e-01],
       [9.9981773e-01],
       [9.9961007e-01],
       [6.0151535e-01]], dtype=float32)

In [48]:
# Converting pred probs to label format
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [49]:
#Calculate model 2 results
model_2_results = calculate_results(
    y_true=val_labels,
    y_pred=model_2_preds
)
model_2_results

{'accuracy': 77.95275590551181,
 'precision': 0.7800661437025787,
 'recall': 0.7795275590551181,
 'f1': 0.7782139444386276}

### Model 3 - RNN with GRU (gated recurrent unit)

In [50]:
# Building the model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.GRU(64, return_sequences=True)(x) # Sequences, if GRU are stuck on each other
x = layers.GRU(64)(x)
# x = layers.Dense(64, activation="relu")(x)
# x = layers.GlobalAveragePooling1D()(x) # If we want to consolidate GRU layer with sequences
outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [51]:
# Getting model summary
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [52]:
# Compiling the model
model_3.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [53]:
# Fitting the model
history_3 = model_3.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_3_GRU"
    )]
)

Saving TensorBoard log files to: model_logs/model_3_GRU/20230113-231536
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
# Meking predictions
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs



array([[2.24695238e-03],
       [9.31467831e-01],
       [9.99912560e-01],
       [1.12920970e-01],
       [1.24091821e-04],
       [9.99471247e-01],
       [3.05579334e-01],
       [9.99956489e-01],
       [9.99906123e-01],
       [7.81440794e-01],
       [8.54744168e-04],
       [7.90591538e-01],
       [7.48686201e-04],
       [2.12362528e-01],
       [9.02552638e-05],
       [9.83669981e-03],
       [9.45435779e-04],
       [1.72430242e-03],
       [3.62405442e-02],
       [9.99589145e-01],
       [9.99454677e-01],
       [1.98438342e-04],
       [9.98781800e-01],
       [4.00665915e-03],
       [9.99913454e-01],
       [9.99920905e-01],
       [3.58351227e-03],
       [5.60379913e-03],
       [4.53075598e-04],
       [6.02238119e-01],
       [9.19585764e-01],
       [1.33129908e-02],
       [4.43103850e-01],
       [1.40487188e-02],
       [3.67412508e-01],
       [4.15496796e-01],
       [9.99824762e-01],
       [2.91292101e-01],
       [3.79499234e-02],
       [9.99941647e-01],


In [55]:
# Converting model 3 pred probs to label format
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 0., 1., 1., 1.], dtype=float32)>

In [56]:
# Calculate model 3 results
model_3_results = calculate_results(
    y_true=val_labels,
    y_pred=model_3_preds
)
model_3_results

{'accuracy': 77.16535433070865,
 'precision': 0.7732961359962456,
 'recall': 0.7716535433070866,
 'f1': 0.7695827090439606}

### Model 4 - Bidirectional RNN

In [57]:
# Create a model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_bidir")

In [58]:
# Checking model details
model_4.summary()

Model: "model_4_bidir"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,378,945
N

In [59]:
# Compiling the model
model_4.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [60]:
# Fitting the model
history_5 = model_4.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_4_bidir"
    )]
)

Saving TensorBoard log files to: model_logs/model_4_bidir/20230113-231612
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [61]:
# Making predictions
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[4.0618695e-02],
       [8.1061995e-01],
       [9.9994409e-01],
       [2.3289736e-01],
       [8.4356689e-05],
       [9.9965870e-01],
       [9.1179651e-01],
       [9.9998355e-01],
       [9.9995202e-01],
       [9.8412627e-01]], dtype=float32)

In [62]:
# Adjusting model's probs to the label format
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [63]:
# Checking results of predictions
model_4_results = calculate_results(
    y_true=val_labels,
    y_pred=model_4_preds
)
model_4_results

{'accuracy': 76.77165354330708,
 'precision': 0.7676331847037146,
 'recall': 0.7677165354330708,
 'f1': 0.766696834597351}

### Model 5 - Conv1D

In [68]:
# Testing embedding layer
from tensorflow.keras import layers

embedding_test = embedding(text_vectorizer("This is a test sentence"))
# print(embedding_test.shape)
embedding_test = tf.expand_dims(embedding_test, axis=0)
# print(embedding_test.shape)
conv_1d = layers.Conv1D(
    filters=32,
    kernel_size=5,
    activation="relu",
    padding="valid"
)
conv_1d_output = conv_1d(embedding_test)
max_pool = layers.GlobalMaxPool1D()
max_pool_output = max_pool(conv_1d_output)

embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))