# Natural Language Processing

In [1]:
# Importing TF and checking the version
import tensorflow as tf

print(tf.__version__)

2.10.1


In [2]:
# Importing helper functions
from DanielBourke_HelperFunctions import create_tensorboard_callback, plot_loss_curves, compare_historys

### Analysing text dataset

In [3]:
# Loading data
import pandas as pd

train_df = pd.read_csv("NLP_text/train.csv")
test_df = pd.read_csv("NLP_text/test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Shuffling training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
# Checking test dataframe
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# Checking number of training records
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
# Checking total number of samples
len(train_df), len(test_df)

(7613, 3263)

In [8]:
# Visualising random samples
import random

random_index = random.randint(0, len(train_df) - 5)

for row in train_df_shuffled[["text", "target"]][random_index : random_index + 5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}")
    print("---")

Target: 0 (not real disaster)
Text:
Officially skipping out on #FantasticFour/#Fant4stic/whatever the hashtag is. It's getting ANNIHILATED in reviews. Bummer.
---
Target: 0 (not real disaster)
Text:
I liked a @YouTube video http://t.co/5fR41TPzte Thorin's Thoughts - Riot and Sandbox Mode (LoL)
---
Target: 0 (not real disaster)
Text:
Parents are taking their kids to Burning Man and one 11 year old thinks it's 'better than... http://t.co/wp6V1BHhoQ
---
Target: 0 (not real disaster)
Text:
Learn How I Gained Access To The Secrets Of The Top Earners &amp; Used Them To Explode My Home Business Here: http://t.co/8rABhQrTh5 Please #RT
---
Target: 0 (not real disaster)
Text:
HereÛªs how media in Pakistan covered the capture of terrorist Mohammed Naved http://t.co/f7WqpCEkg2
---


### Creating validation data

In [9]:
# Splitting dataset
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1, # Allocating 10% to validation data
    random_state=42
)

In [10]:
# Checking dataset length
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [11]:
# Checking the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

### Converting text to numbers

In [12]:
# Using text vectorisation
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False
)

In [13]:
# Finding the average number of tokens
round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))

15

In [14]:
# Setting up text vectorisation variables
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)

In [15]:
# Fitting text vectorisation to the training dataset
text_vectorizer.adapt(train_sentences)

In [16]:
# Create a sample sentnence and tokenise it
sample_sentence = "There is a flood in my street"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [17]:
# Tokenising random sentence from the training set
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n{random_sentence},\ntokenised version:\n{text_vectorizer([random_sentence])}"
)

Original text:
320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/yNXnvVKCDA | @djicemoon | #Dubstep #TrapMusic #DnB #EDM #Dance #IcesÛ_ http://t.co/weQPesENku,
tokenised version:
[[2582 2420 2428  966    1 2490 2133 2249 2138 1685 1307 2427    1    0
     0]]


In [18]:
# Getting unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]

len(words_in_vocab), top_5_words, bottom_5_words

(10000,
 ['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

### Creating Embedding layer

In [19]:
# Defining the layer
from tensorflow.keras import layers

embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length
)
embedding

<keras.layers.core.embedding.Embedding at 0x19ca5941c40>

In [20]:
# Get a random sentence
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n{random_sentence},\nembedded version:"
)

# Embed random sentence
embed_sentence = embedding(text_vectorizer([random_sentence]))
embed_sentence

Original text:
@RosemaryTravale Do we all use the same weapon? 'cause we might be screwed XD,
embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.02209502,  0.04833481,  0.02057297, ..., -0.0417004 ,
          0.01724714,  0.00334507],
        [ 0.0044282 ,  0.02959353, -0.02985519, ..., -0.00032227,
          0.03369289, -0.0301164 ],
        [ 0.02713615, -0.00702845, -0.00376241, ..., -0.01400626,
         -0.0199459 , -0.00800296],
        ...,
        [-0.00922608, -0.04281855,  0.00149045, ..., -0.01199614,
          0.04132703, -0.00422592],
        [ 0.02289363, -0.01219679, -0.04508603, ...,  0.01462224,
          0.0351765 , -0.02655279],
        [ 0.04544933, -0.00372183,  0.03552462, ..., -0.03026444,
         -0.0371375 ,  0.02856531]]], dtype=float32)>

In [21]:
# Checking single token's embedding
embed_sentence[0][0], embed_sentence[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.02209502,  0.04833481,  0.02057297,  0.032789  ,  0.04318057,
        -0.03431219,  0.01098975,  0.03514931,  0.03978893,  0.01736709,
        -0.02824736, -0.02608452, -0.00248503, -0.02237294,  0.0012821 ,
        -0.00635846, -0.01773519,  0.03112418,  0.01703538, -0.02766299,
         0.03034944,  0.03847185, -0.00869383, -0.04557556, -0.04246776,
        -0.02699394,  0.03543986,  0.01170706, -0.02990152, -0.04815678,
         0.01021283,  0.01171887, -0.02144183,  0.04833951,  0.02251032,
         0.00594647, -0.0025661 , -0.04159676, -0.04176854, -0.00442631,
         0.02404336, -0.04970976, -0.00243689,  0.03470664, -0.02492545,
         0.00792212,  0.03716299, -0.02514007,  0.04799393,  0.04344555,
         0.0493938 , -0.04462188, -0.02766787,  0.04958842, -0.01133138,
         0.01706865,  0.04976422,  0.02401492, -0.0471971 , -0.00576581,
        -0.00843783, -0.0460158 , -0.0047438 , -0.02876973, -0.01069454,
  

### Building base model

In [22]:
# Using SKLearn to build base model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenisation and modelling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # Convert words to numbers
    ("clf", MultinomialNB()) # Model the text
])

# Fit the pipeline to training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [23]:
# Evaluating baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Achieved accuracy: {baseline_score * 100:.2f}%")

Achieved accuracy: 79.27%


In [24]:
# Making predictions
baseline_predictions = model_0.predict(val_sentences)
baseline_predictions[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

### Function to evaluate model performance

In [25]:
# Importing SKLearn functions
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Developing function to evaluate accuracy, precision, recall and F1 scor
def calculate_results(y_true, y_pred):
    """
    Evaluate binary classification model
    """
    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and F1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1": model_f1
    }
    return model_results

In [26]:
# Getting baseline results
baseline_results = calculate_results(
    y_true=val_labels,
    y_pred=baseline_predictions
)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

### Model 1: a simple dense model

In [27]:
# Creating tensorboard callback
from DanielBourke_HelperFunctions import create_tensorboard_callback

# Creating log directory
SAVE_DIR = "model_logs"

In [28]:
# Building model with Functional API
from tensorflow.keras import layers

inputs = layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs) # Turn the input text into numbers
x = embedding(x) # Create embedding of numberised inputs
x = layers.GlobalAveragePooling1D()(x) # Condence the feature vector
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [29]:
# Getting model summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [30]:
# Compiling model
model_1.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [31]:
# Fitting the model
history_1 = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_1_dense"
    )]
)

Saving TensorBoard log files to: model_logs/model_1_dense/20230116-103108
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Checking the results
model_1.evaluate(val_sentences, val_labels)



[0.48205941915512085, 0.7847769260406494]

In [33]:
# Making predictions
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape



(762, 1)

In [34]:
# First 10 predictions
model_1_pred_probs[:10]

array([[0.3532249 ],
       [0.75160396],
       [0.99763644],
       [0.09741288],
       [0.09976958],
       [0.93916947],
       [0.91113394],
       [0.9932434 ],
       [0.9641323 ],
       [0.26156282]], dtype=float32)

In [35]:
# Converting model predictions to label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [36]:
# Calculating model 1 results
model_1_results = calculate_results(
    y_true=val_labels,
    y_pred=model_1_preds
)
model_1_results

{'accuracy': 78.4776902887139,
 'precision': 0.790955383689072,
 'recall': 0.7847769028871391,
 'f1': 0.7812916448740085}

In [37]:
# Retrieving baseline results
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [38]:
# Comparing model results
import numpy as np

np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

### Visualising learned embeddings

In [39]:
# Getting vocabulary from the text vectorisation layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [40]:
# Checking model 1 details
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [41]:
# Getting the weights
embed_weights = model_1.get_layer("embedding").get_weights()
embed_weights, embed_weights[0].shape

([array([[ 0.03117545, -0.0198781 ,  0.04902324, ..., -0.04492381,
          -0.05209893,  0.04254517],
         [ 0.02417207, -0.01064174,  0.00071459, ..., -0.01533931,
          -0.0388462 ,  0.04234987],
         [-0.0505834 , -0.02030688,  0.00483891, ..., -0.03868733,
          -0.0411087 ,  0.04079808],
         ...,
         [ 0.04937395, -0.0193275 ,  0.01589176, ...,  0.01257899,
           0.04665932, -0.00394964],
         [ 0.01028025, -0.03333158,  0.07804684, ..., -0.06535052,
          -0.07028744,  0.03675681],
         [-0.02590463, -0.08515803,  0.05049384, ..., -0.10260323,
          -0.10310195,  0.11368717]], dtype=float32)],
 (10000, 128))

In [42]:
# Creating embedding files (sample from TF Word Embedding documentation)
import io

out_v = io.open("vectors.tsv", "w", encoding="utf-8")
out_m = io.open("metadata.tsv", "w", encoding="utf-8")

for index, word in enumerate(words_in_vocab):
    if index == 0:
        continue # Skip 0, as it's a padding
    vec = embed_weights[0][index]
    out_v.write("\t".join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

out_v.close()
out_m.close()

Generated vectors.tsv and metadata.tsv can be uploaded into Tensorflow Embedding Projector tool at https://projector.tensorflow.org/ to visualise weights in 3D space.

### Model 2 - RNN (Recurrent Neural Network) with LSTM (Long Short-Term Memory)

In [43]:
# CReating an RNN model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
# x = layers.LSTM(units=64, return_sequences=True)(x) # Sequences required when you stack LSTM layers
# print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
# x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)


In [44]:
# Checking model's summary
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [45]:
# Compiling the model
model_2.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [46]:
# Fittting the model
history_2 = model_2.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_2_LSTM"
    )]
)

Saving TensorBoard log files to: model_logs/model_2_LSTM/20230116-103129
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
# Meking predictions
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[1.08379915e-01],
       [7.43641078e-01],
       [9.99641776e-01],
       [2.65806187e-02],
       [2.02441894e-04],
       [9.99169886e-01],
       [9.18037057e-01],
       [9.99792099e-01],
       [9.99619246e-01],
       [6.23820305e-01]], dtype=float32)

In [48]:
# Converting pred probs to label format
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [49]:
#Calculate model 2 results
model_2_results = calculate_results(
    y_true=val_labels,
    y_pred=model_2_preds
)
model_2_results

{'accuracy': 77.95275590551181,
 'precision': 0.7799065773530309,
 'recall': 0.7795275590551181,
 'f1': 0.7783167829714759}

### Model 3 - RNN with GRU (gated recurrent unit)

In [50]:
# Building the model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.GRU(64, return_sequences=True)(x) # Sequences, if GRU are stuck on each other
x = layers.GRU(64)(x)
# x = layers.Dense(64, activation="relu")(x)
# x = layers.GlobalAveragePooling1D()(x) # If we want to consolidate GRU layer with sequences
outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [51]:
# Getting model summary
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [52]:
# Compiling the model
model_3.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [53]:
# Fitting the model
history_3 = model_3.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_3_GRU"
    )]
)

Saving TensorBoard log files to: model_logs/model_3_GRU/20230116-103150
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
# Meking predictions
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs



array([[4.65900637e-03],
       [7.43340492e-01],
       [9.99801517e-01],
       [1.59227267e-01],
       [2.54200248e-04],
       [9.99493241e-01],
       [6.09049141e-01],
       [9.99903500e-01],
       [9.99839962e-01],
       [5.78665257e-01],
       [8.11867532e-04],
       [8.48473966e-01],
       [6.93586306e-04],
       [1.41044393e-01],
       [2.36695734e-04],
       [8.22120905e-03],
       [2.15882156e-03],
       [6.05901878e-04],
       [5.87642156e-02],
       [9.99629200e-01],
       [9.98125613e-01],
       [1.13471207e-04],
       [9.99618411e-01],
       [7.02270400e-03],
       [9.99822915e-01],
       [9.99863327e-01],
       [5.19790500e-03],
       [2.83036754e-03],
       [1.12335628e-03],
       [3.29441369e-01],
       [9.54154134e-01],
       [5.31938858e-03],
       [6.24567866e-01],
       [4.92135668e-03],
       [5.11452019e-01],
       [1.45058215e-01],
       [9.99590993e-01],
       [3.01710695e-01],
       [4.24492545e-02],
       [9.99869883e-01],


In [55]:
# Converting model 3 pred probs to label format
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [56]:
# Calculate model 3 results
model_3_results = calculate_results(
    y_true=val_labels,
    y_pred=model_3_preds
)
model_3_results

{'accuracy': 77.42782152230971,
 'precision': 0.7757380419380466,
 'recall': 0.7742782152230971,
 'f1': 0.7723566516531356}

### Model 4 - Bidirectional RNN

In [57]:
# Create a model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_bidir")

In [58]:
# Checking model details
model_4.summary()

Model: "model_4_bidir"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,378,945
N

In [59]:
# Compiling the model
model_4.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [60]:
# Fitting the model
history_4 = model_4.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_4_bidir"
    )]
)

Saving TensorBoard log files to: model_logs/model_4_bidir/20230116-103211
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [61]:
# Making predictions
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[3.2413623e-01],
       [6.3701016e-01],
       [9.9989742e-01],
       [1.3428456e-01],
       [1.6744674e-05],
       [9.9727756e-01],
       [8.9310348e-01],
       [9.9995303e-01],
       [9.9972731e-01],
       [9.6627390e-01]], dtype=float32)

In [62]:
# Adjusting model's probs to the label format
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [63]:
# Checking results of predictions
model_4_results = calculate_results(
    y_true=val_labels,
    y_pred=model_4_preds
)
model_4_results

{'accuracy': 77.42782152230971,
 'precision': 0.7745797612274115,
 'recall': 0.7742782152230971,
 'f1': 0.7730386111374632}

### Model 5 - Conv1D

In [64]:
# Testing embedding layer
from tensorflow.keras import layers

embedding_test = embedding(text_vectorizer("This is a test sentence"))
# print(embedding_test.shape)
embedding_test = tf.expand_dims(embedding_test, axis=0)
# print(embedding_test.shape)
conv_1d = layers.Conv1D(
    filters=32,
    kernel_size=5,
    activation="relu",
    padding="valid"
)
conv_1d_output = conv_1d(embedding_test)
max_pool = layers.GlobalMaxPool1D()
max_pool_output = max_pool(conv_1d_output)

embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))

### Model 5 - 1D Convolutional Network layer

In [65]:
# Creating a model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string, name="input_layer")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(
    filters=64,
    kernel_size=5,
    strides=1,
    activation="relu",
    padding="valid"
)(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_conv1d")

In [66]:
# Checking model details
model_5.summary()

Model: "model_5_conv1d"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 1)              

In [67]:
# Compiling the model
model_5.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [68]:
# Fitting the model
history_5 = model_5.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_5_conv1d"
    )]
)

Saving TensorBoard log files to: model_logs/model_5_conv1d/20230116-103237
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [69]:
# Making predictions
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]



array([[1.89870685e-01],
       [8.83843005e-01],
       [9.99887943e-01],
       [1.09054685e-01],
       [1.31184677e-07],
       [9.96619225e-01],
       [9.56514895e-01],
       [9.99980628e-01],
       [9.99997675e-01],
       [8.88097584e-01]], dtype=float32)

In [70]:
# Converting probs into label format
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [71]:
# Evaluating model's predictions
model_5_results = calculate_results(
    y_true=val_labels,
    y_pred=model_5_preds
)
model_5_results

{'accuracy': 75.7217847769029,
 'precision': 0.7576149234046069,
 'recall': 0.7572178477690289,
 'f1': 0.7555947144839635}

### Model 6 - TensorflowHub pretrained model

In [72]:
# Reusing USE (Universal Sentence Encoder) model
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embed_samples = embed([
    sample_sentence,
    "When you apple USE to text, it converts it into numbers"
])
print(embed_samples[0][:50])

tf.Tensor(
[-0.01602832  0.01068848  0.02425467 -0.01405769  0.01434426  0.08292625
  0.01963372  0.06160142 -0.00352702 -0.01216412  0.00978647 -0.01248495
  0.01232345  0.09748451  0.06141113 -0.03728353  0.01860884 -0.04669856
  0.00413913 -0.06363907 -0.02469898  0.02713691  0.02284444 -0.00210026
 -0.00630592 -0.03964961  0.02220404  0.00115077 -0.03132177  0.00119527
 -0.0401255   0.04561892 -0.01530598 -0.00175918  0.02173131 -0.08450424
  0.03340026  0.04604554 -0.0248025  -0.08681665  0.00702694 -0.00770478
 -0.01434539  0.07814164 -0.10676058 -0.05152997 -0.00858155 -0.03232232
 -0.03871097  0.02581467], shape=(50,), dtype=float32)


In [73]:
# Sampling shape
embed_samples[0].shape

TensorShape([512])

In [74]:
# Creating layer with USE pretrained model
sentence_encoder_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    input_shape=[],
    dtype=tf.string,
    trainable=False,
    name="USE"
)





In [75]:
# Building a new model with pretrained layer
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    # layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")
], name="model_6_use")

In [76]:
# Compiling the model
model_6.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [77]:
# Fitting the model
history_6 = model_6.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_6_use"
    )]
)

Saving TensorBoard log files to: model_logs/model_6_use/20230116-103312
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
# Checking the model details
model_6.summary()

Model: "model_6_use"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_5 (Dense)             (None, 1)                 513       
                                                                 
Total params: 256,798,337
Trainable params: 513
Non-trainable params: 256,797,824
_________________________________________________________________


In [79]:
# Making predictions
model_6_pred_probs = model_6.predict(val_sentences)
model_6_pred_probs[:10]



array([[0.37535816],
       [0.68599105],
       [0.858224  ],
       [0.33574876],
       [0.64923936],
       [0.72946644],
       [0.81911504],
       [0.83147806],
       [0.75724036],
       [0.20284742]], dtype=float32)

In [80]:
# Converting prediction probs into label format
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 1., 1., 1., 1., 1., 0.], dtype=float32)>

In [81]:
# Calculating model's performance metrics
model_6_results = calculate_results(
    y_true=val_labels,
    y_pred=model_6_preds
)
model_6_results

{'accuracy': 78.60892388451444,
 'precision': 0.7863431959164349,
 'recall': 0.7860892388451444,
 'f1': 0.7850582651599072}

### Model 7 - Pretrained USE with 10% of training data

In [91]:
## THIS IS WRONG WAY TO SPLIT, because of potential data leak,
##  i.e. some data may end up in validation seubset

# Creating subset of 10% training data
train_data_10p = train_df_shuffled[["text", "target"]].sample(frac=0.1, random_state=42)
train_sentences_10p = train_data_10p["text"].to_list()
train_labels_10p = train_data_10p["target"].to_list()
len(train_sentences_10p), len(train_labels_10p)

(761, 761)

In [92]:
## MAKING A BETTER SPLIT

# Creating 10% subset
train_10p_split = int(0.1 * len(train_sentences))
train_sentences_10p = train_sentences[:train_10p_split]
train_labels_10p = train_labels[:train_10p_split]
len(train_sentences_10p), len(train_labels_10p)

(685, 685)

In [93]:
# Checking the length of original dataset
len(train_df_shuffled)

7613

In [94]:
# Checking the number of targets - 10%
train_data_10p["target"].value_counts()

0    413
1    348
Name: target, dtype: int64

In [95]:
# Checking the number of targets - Full set
train_df_shuffled["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [96]:
# Cloning model 6
model_7 = tf.keras.models.clone_model(model_6)

# Compiling the model
model_7.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

# Checking the model details
model_7.summary()

Model: "model_6_use"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_5 (Dense)             (None, 1)                 513       
                                                                 
Total params: 256,798,337
Trainable params: 513
Non-trainable params: 256,797,824
_________________________________________________________________


In [97]:
# Fitting the model
history_7 = model_7.fit(
    train_sentences_10p,
    train_labels_10p,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_7_use"
    )]
)

Saving TensorBoard log files to: model_logs/model_7_use/20230116-184029
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [98]:
# Making predictions
model_7_pred_probs = model_7.predict(val_sentences)
model_7_pred_probs[:10]



array([[0.46963853],
       [0.5124494 ],
       [0.57981694],
       [0.47460523],
       [0.50114006],
       [0.51902497],
       [0.54269725],
       [0.49833354],
       [0.52545536],
       [0.45071054]], dtype=float32)

In [99]:
# Converting pred probs to label format
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
model_7_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 1., 1., 1., 0., 1., 0.], dtype=float32)>

In [100]:
# Evaluating model predictions
model_7_results = calculate_results(
    y_true=val_labels,
    y_pred=model_7_preds
)
model_7_results

{'accuracy': 71.91601049868767,
 'precision': 0.730288919910709,
 'recall': 0.7191601049868767,
 'f1': 0.7101609145220378}