# Natural Language Processing

In [1]:
# Importing TF and checking the version
import tensorflow as tf

print(tf.__version__)

2.10.1


In [2]:
# Importing helper functions
from DanielBourke_HelperFunctions import create_tensorboard_callback, plot_loss_curves, compare_historys

### Analysing text dataset

In [3]:
# Loading data
import pandas as pd

train_df = pd.read_csv("NLP_text/train.csv")
test_df = pd.read_csv("NLP_text/test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Shuffling training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
# Checking test dataframe
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# Checking number of training records
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
# Checking total number of samples
len(train_df), len(test_df)

(7613, 3263)

In [8]:
# Visualising random samples
import random

random_index = random.randint(0, len(train_df) - 5)

for row in train_df_shuffled[["text", "target"]][random_index : random_index + 5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}")
    print("---")

Target: 1 (real disaster)
Text:
RT twit_san_diego 'Possible sinkhole disrupts trolley service: A depression in a portion of asphalt in downtown SaÛ_ http://t.co/ANrIOMbHQN'
---
Target: 1 (real disaster)
Text:
Fukushima: The Story of a Nuclear Disaster http://t.co/ikpnGs3dTi http://t.co/lJHgSdRAEZ
---
Target: 1 (real disaster)
Text:
Today (August 6th) is the 70th anniversary of A-Bomb 'Little Boy' been dropped on Hiroshima.70000 killed outright as the city was flattened
---
Target: 1 (real disaster)
Text:
ÛÏRichmond Coaches were devastated to hear of the death of their second driver Mr Chance who was sittingÛ_: Jam... http://t.co/y5Yhbb0hkf
---
Target: 1 (real disaster)
Text:
Legionnaires' Disease: What's Being Done to Stop Deadly Outbreak: The worst-ever outbreak of Legio... http://t.co/0ubG9wFyge #losangeles
---


### Creating validation data

In [9]:
# Splitting dataset
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1, # Allocating 10% to validation data
    random_state=42
)

In [10]:
# Checking dataset length
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [11]:
# Checking the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

### Converting text to numbers

In [12]:
# Using text vectorisation
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False
)

In [13]:
# Finding the average number of tokens
round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))

15

In [14]:
# Setting up text vectorisation variables
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)

In [15]:
# Fitting text vectorisation to the training dataset
text_vectorizer.adapt(train_sentences)

In [16]:
# Create a sample sentnence and tokenise it
sample_sentence = "There is a flood in my street"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [17]:
# Tokenising random sentence from the training set
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n{random_sentence},\ntokenised version:\n{text_vectorizer([random_sentence])}"
)

Original text:
Thankkk U all Soo much for Flooding my NotificationsU my Fella ParShOlics r Superb &amp; Jus soo awesomeLove Love U All always #FrvrGrateful ??,
tokenised version:
[[7688  142   44 4573  187   10  231   13    1   13 5656 9937  996 7928
    35]]


In [18]:
# Getting unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]

len(words_in_vocab), top_5_words, bottom_5_words

(10000,
 ['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

### Creating Embedding layer

In [19]:
# Defining the layer
from tensorflow.keras import layers

embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length
)
embedding

<keras.layers.core.embedding.Embedding at 0x2af12285760>

In [20]:
# Get a random sentence
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n{random_sentence},\nembedded version:"
)

# Embed random sentence
embed_sentence = embedding(text_vectorizer([random_sentence]))
embed_sentence

Original text:
Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her
 
#FARRAKHAN #QUOTE,
embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.00608536, -0.00503293, -0.02216567, ...,  0.0098791 ,
          0.01424113,  0.02051328],
        [-0.04540942, -0.00792285, -0.03182157, ...,  0.00136117,
          0.01096286, -0.03136213],
        [ 0.03209788,  0.03825185,  0.03018017, ..., -0.045817  ,
          0.00730542, -0.00556413],
        ...,
        [-0.00711836,  0.00727273, -0.02788265, ..., -0.04154006,
          0.01462967, -0.01049597],
        [-0.04205652,  0.00976213,  0.02948016, ...,  0.04375932,
          0.00569487, -0.00429461],
        [-0.04540942, -0.00792285, -0.03182157, ...,  0.00136117,
          0.01096286, -0.03136213]]], dtype=float32)>

In [21]:
# Checking single token's embedding
embed_sentence[0][0], embed_sentence[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 0.00608536, -0.00503293, -0.02216567, -0.02680713, -0.00641276,
         0.01329494,  0.0266765 , -0.02425584, -0.00162367, -0.03157058,
         0.02839017, -0.04271286, -0.01867538, -0.00430597,  0.04000229,
        -0.04060602, -0.0223161 , -0.01322109, -0.01011483, -0.00418919,
        -0.01454941,  0.04685266, -0.02425578, -0.04425795, -0.02262899,
         0.0468311 , -0.01057034, -0.03612819,  0.00404163,  0.00798503,
         0.00630431,  0.03340546,  0.00674457,  0.03875986, -0.01270526,
        -0.02588558,  0.04607414,  0.00977917,  0.02422232,  0.02819394,
        -0.00213978, -0.01924468,  0.01634521,  0.03741642, -0.0004102 ,
        -0.02740305, -0.03851508,  0.00550057, -0.04975512,  0.04109016,
         0.03859312, -0.04802132, -0.02959552,  0.00598424, -0.00128083,
        -0.04303014,  0.04797379,  0.01304528,  0.0491508 , -0.0334924 ,
         0.00263034, -0.04045767,  0.0016011 ,  0.04986833,  0.04733514,
  

### Building base model

In [22]:
# Using SKLearn to build base model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenisation and modelling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # Convert words to numbers
    ("clf", MultinomialNB()) # Model the text
])

# Fit the pipeline to training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [23]:
# Evaluating baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Achieved accuracy: {baseline_score * 100:.2f}%")

Achieved accuracy: 79.27%


In [24]:
# Making predictions
baseline_predictions = model_0.predict(val_sentences)
baseline_predictions[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

### Function to evaluate model performance

In [25]:
# Importing SKLearn functions
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Developing function to evaluate accuracy, precision, recall and F1 scor
def calculate_results(y_true, y_pred):
    """
    Evaluate binary classification model
    """
    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and F1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1": model_f1
    }
    return model_results

In [26]:
# Getting baseline results
baseline_results = calculate_results(
    y_true=val_labels,
    y_pred=baseline_predictions
)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

### Model 1: a simple dense model

In [27]:
# Creating tensorboard callback
from DanielBourke_HelperFunctions import create_tensorboard_callback

# Creating log directory
SAVE_DIR = "model_logs"

In [40]:
# Building model with Functional API
from tensorflow.keras import layers

inputs = layers.Input(shape=(1, ), dtype=tf.string)
x = text_vectorizer(inputs) # Turn the input text into numbers
x = embedding(x) # Create embedding of numberised inputs
x = layers.GlobalAveragePooling1D()(x) # Condence the feature vector
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [41]:
# Getting model summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [42]:
# Compiling model
model_1.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [43]:
# Fitting the model
history_1 = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_1_dense"
    )]
)

Saving TensorBoard log files to: model_logs/model_1_dense/20230111-181054
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [44]:
# Checking the results
model_1.evaluate(val_sentences, val_labels)



[0.4776519238948822, 0.7795275449752808]

In [45]:
# Making predictions
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape



(762, 1)

In [47]:
# First 10 predictions
model_1_pred_probs[:10]

array([[0.3747982 ],
       [0.74696624],
       [0.9968072 ],
       [0.16036092],
       [0.12743133],
       [0.93404263],
       [0.89537424],
       [0.99109286],
       [0.9682489 ],
       [0.2974318 ]], dtype=float32)

In [48]:
# Converting model predictions to label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [49]:
# Calculating model 1 results
model_1_results = calculate_results(
    y_true=val_labels,
    y_pred=model_1_preds
)
model_1_results

{'accuracy': 77.95275590551181,
 'precision': 0.7822644211580037,
 'recall': 0.7795275590551181,
 'f1': 0.7771404562571971}

In [50]:
# Retrieving baseline results
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [51]:
# Comparing model results
import numpy as np

np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

### Visualising learned embeddings

In [52]:
# Getting vocabulary from the text vectorisation layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [53]:
# Checking model 1 details
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [57]:
# Getting the weights
embed_weights = model_1.get_layer("embedding").get_weights()
embed_weights, embed_weights[0].shape

([array([[-0.04355889,  0.07691865, -0.0752024 , ...,  0.03945495,
          -0.0449415 , -0.02386122],
         [-0.02140341,  0.02942503,  0.02315108, ...,  0.0652633 ,
          -0.03192571,  0.0557309 ],
         [-0.03241491,  0.01681036, -0.03956967, ..., -0.0030516 ,
           0.05613579,  0.03536177],
         ...,
         [-0.04974708,  0.04461678, -0.04555272, ...,  0.03378544,
          -0.03652395, -0.02117385],
         [ 0.04074499,  0.08140358,  0.01490805, ..., -0.0390877 ,
          -0.06289218,  0.07332011],
         [-0.14592956,  0.04438657, -0.14998132, ..., -0.07840644,
          -0.1561044 ,  0.15838961]], dtype=float32)],
 (10000, 128))

In [61]:
# Creating embedding files (sample from TF Word Embedding documentation)
import io

out_v = io.open("vectors.tsv", "w", encoding="utf-8")
out_m = io.open("metadata.tsv", "w", encoding="utf-8")

for index, word in enumerate(words_in_vocab):
    if index == 0:
        continue # Skip 0, as it's a padding
    vec = embed_weights[0][index]
    out_v.write("\t".join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

out_v.close()
out_m.close()

Generated vectors.tsv and metadata.tsv can be uploaded into Tensorflow Embedding Projector tool at https://projector.tensorflow.org/ to visualise weights in 3D space.

### Model 2 - RNN (Recurrent Neural Network) with LSTM (Long Short-Term Memory)

In [66]:
# CReating an RNN model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
# x = layers.LSTM(units=64, return_sequences=True)(x) # Sequences required when you stack LSTM layers
# print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
# x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)


In [67]:
# Checking model's summary
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [68]:
# Compiling the model
model_2.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [69]:
# Fittting the model
history_2 = model_2.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(
        dir_name=SAVE_DIR,
        experiment_name="model_2_LSTM"
    )]
)

Saving TensorBoard log files to: model_logs/model_2_LSTM/20230111-211105
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [70]:
# Meking predictions
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[4.0355236e-03],
       [5.2466840e-01],
       [9.9978209e-01],
       [6.3779508e-03],
       [2.4754304e-04],
       [9.9986190e-01],
       [9.3356472e-01],
       [9.9992144e-01],
       [9.9989933e-01],
       [8.0523074e-02]], dtype=float32)

In [71]:
# Converting pred probs to label format
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [73]:
#Calculate model 2 results
model_2_results = calculate_results(
    y_true=val_labels,
    y_pred=model_2_preds
)
model_2_results

{'accuracy': 76.9028871391076,
 'precision': 0.7697342660172047,
 'recall': 0.7690288713910761,
 'f1': 0.7674269881907728}

### Model 3 - RNN with GRU (gated recurrent unit)