# TensorFlow Natural Language Processing

NLP has the goal of deriving information out of a language data

## Get helper functions

In [1]:
# Import series helper functions for the notebook
from helper_functions import create_tensorboard_callback, plot_loss_curves, compare_historys

## Get a test dataset

The dataset we're going to be using is Kaggle's introduction to NLP dataset

## Becoming one with the data

Visualizing a text dataset

In [2]:
import pandas as pd

In [4]:
train_df = pd.read_csv('data/train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Shuffle training dataframe
shuffled_train_df = train_df.sample(frac=1)
shuffled_train_df

Unnamed: 0,id,keyword,location,text,target
268,389,annihilation,,THANKS!!!!! @COUNT DANTE. :) DO JOIN US BY F...,0
6266,8953,storm,mind ya business,@Jenniferarri_ comeeeee! ...but why is it bout...,1
5156,7355,obliterate,United Kingdom,@klavierstuk doesn't so LVG is forced into the...,0
3738,5313,fire,,Morganite Gemstone White Fire Opal 925 Sterlin...,0
1066,1538,bomb,keli x,HALSEY AND TROYE COLLAB WOULD BE BOMB,0
...,...,...,...,...,...
1756,2527,collision,"SEATTLE, WA USA",On I-405 southbound at Coal Creek Pkwy there i...,1
6837,9794,trapped,,Hollywood Movie About Trapped Miners Released ...,1
3360,4810,evacuated,"Gold Coast, Australia",Tram travellers evacuated after powerlines com...,1
3638,5187,fatalities,"Youngstown, OH",OSP concerned about mounting fatalities http:/...,1


In [11]:
# What does the test dataframe look like
test_df = pd.read_csv('data/test.csv')
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [12]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [13]:
# How many total samples?
len(train_df)

7613

## Split data into training and validation sets

we will use `sklearn.model_selection.train_test_split`

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    shuffled_train_df['text'].to_numpy(),
    shuffled_train_df['target'].to_numpy(),
    test_size = 0.1
)

In [16]:
train_sentences

array(['Remembering Pittsburgh Eyewitness History of Steel City by Len Barcousky PB Penn http://t.co/dhGAVw8bSW http://t.co/0lMhEAEX9k',
       'master0fsloths has a crush: http://t.co/SZX6v0bbjF',
       'I feel like a tornado http://t.co/iZJK6kpWiZ', ...,
       'Ways so archetype a bleeding well-grounded readiness: FpOJ http://t.co/WXbrArc7p3',
       '@suelinflower there is no words to describe the physical painthey ripped you apart while you screamed for dear lifeits like been engulfed',
       'Repulsive! Refugees-Victimiser-#Dutton Evangelical-Liar-#Abbott c/o #LNP on a dupe the press overdrive; #CHOPPERGATE!#BRONWYNBISHOP!#AUSPOL'],
      dtype=object)

In [17]:
train_labels

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

## Converting text to numbers

When dealing with a text problem, one of the first things you'll have to do before you can build a model is to convert text into numbers namely:
* Tokenization
* Embedding

In [19]:
import tensorflow as tf

In [23]:
from tensorflow.keras.layers import TextVectorization

# Use the default TextVectorization parameters
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize='lower_and_strip_punctuation',
                                    split='whitespace',
                                    ngrams=None,
                                    output_mode='int',
                                    output_sequence_length=None,
                                    #pad_to_max_tokens=True
                                   )

In [24]:
# Setup text vectorization variables
max_vocab_length = 1000 # Max number of words to have in our vocabulary
max_length = 15 # max length our sequences wil be
text_vect = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode='int',
    output_sequence_length = max_length
)

In [26]:
# Fit the text vectorizer to the training text
text_vect.adapt(train_sentences)

In [29]:
# Create a sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vect([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[275,   3, 217,   4,  13, 762,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [32]:
# Choose a random sentence from the training dataset and tokenize it
import random
random_sentence = random.choice(train_sentences)
print(f'Original tex\n{random_sentence}\n\nVactorized version:{text_vect([random_sentence])}')

Original tex
? High Skies - Burning Buildings ? http://t.co/uVq41i3Kx2 #nowplaying

Vactorized version:[[320   1  90  98   1 685   0   0   0   0   0   0   0   0   0]]


In [35]:
# Get all of the unique words in the vocabulary
words_in_vocab = text_vect.get_vocabulary()
top_5_most_used_words = words_in_vocab[:5]
print(f'Number of words in vocab: {len(words_in_vocab)}')
print(f'5 most common words: {top_5_most_used_words}')

Number of words in vocab: 1000
5 most common words: ['', '[UNK]', 'the', 'a', 'in']


In [71]:
## Creating and embedding layer
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length, output_dim=128, input_length=max_length)

embedding(text_vect([random.choice(train_sentences)]))

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.00647839, -0.0470009 , -0.04536194, ..., -0.01928971,
         -0.02454203,  0.04426506],
        [ 0.04635367, -0.01618565, -0.02664793, ..., -0.004468  ,
         -0.02035745, -0.03338497],
        [ 0.00647839, -0.0470009 , -0.04536194, ..., -0.01928971,
         -0.02454203,  0.04426506],
        ...,
        [-0.01785592, -0.01576645,  0.02917823, ...,  0.00204301,
          0.00384064,  0.04561888],
        [-0.01785592, -0.01576645,  0.02917823, ...,  0.00204301,
          0.00384064,  0.04561888],
        [-0.01785592, -0.01576645,  0.02917823, ...,  0.00204301,
          0.00384064,  0.04561888]]], dtype=float32)>

## Modelling a text dataset
Now we've got way to turn our text sequences into numbers, it's time to start building a series of modelling experiments.

We'll start with a baseline and move on from there.

* Model 0: Naive Bayes (baseline), this is a form of Sklearn ML
* Model 1: Feed-forward neural network (dense model)
* Model 2: LSTM model (RNN)
* Model 3: GRU model (RNN)
* Model 4: Bidirectional_LSTM model (RNN)
* Model 5: 1D Convolutional Neural Network (CNN)
* Model 6: TensorFlow Hub Pretrained Feature Extractor (using transfer learning for NLP)
* Model 7: Same as model 6 with 10% of training data

### Model 1: Feed-Forward Neural Network

In [72]:
# Create the model
model1 = tf.keras.Sequential([
    text_vect,
    tf.keras.layers.Embedding(
        max_vocab_length,
        output_dim= 128,
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [73]:
# Compile the model
model1.compile(
    loss = 'binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [74]:
# Get summary
model1.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_6 (Embedding)     (None, 15, 128)           128000    
                                                                 
 global_average_pooling1d_2  (None, 128)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_6 (Dense)             (None, 1)                 129       
                                                                 
Total params: 128129 (500.50 KB)
Trainable params: 128129 (500.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [75]:
# fit the model
model1.fit(
    x = train_sentences,
    y = train_labels,
    epochs = 10,
    validation_data=(val_sentences, val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22319ed72d0>

In [57]:
result1 = model1.layers[0](train_sentences)
result1

<tf.Tensor: shape=(6851, 15), dtype=int64, numpy=
array([[  1,   1, 580, ...,   1,   1,   0],
       [  1,  42,   3, ...,   0,   0,   0],
       [  8, 227,  25, ...,   0,   0,   0],
       ...,
       [  1,  28,   1, ...,   0,   0,   0],
       [  1,  73,   9, ...,   1, 206,  12],
       [  1,   1,   1, ...,   0,   0,   0]], dtype=int64)>

In [61]:
result3 = model1.layers[2](result2)
result3

<tf.Tensor: shape=(6851, 15, 1), dtype=float32, numpy=
array([[[0.4879135 ],
        [0.4879135 ],
        [0.5212862 ],
        ...,
        [0.4879135 ],
        [0.4879135 ],
        [0.5169261 ]],

       [[0.4879135 ],
        [0.48872462],
        [0.4961161 ],
        ...,
        [0.5169261 ],
        [0.5169261 ],
        [0.5169261 ]],

       [[0.49710265],
        [0.501212  ],
        [0.47803405],
        ...,
        [0.5169261 ],
        [0.5169261 ],
        [0.5169261 ]],

       ...,

       [[0.4879135 ],
        [0.5014943 ],
        [0.4879135 ],
        ...,
        [0.5169261 ],
        [0.5169261 ],
        [0.5169261 ]],

       [[0.4879135 ],
        [0.49291182],
        [0.49931878],
        ...,
        [0.4879135 ],
        [0.50083894],
        [0.5027638 ]],

       [[0.4879135 ],
        [0.4879135 ],
        [0.4879135 ],
        ...,
        [0.5169261 ],
        [0.5169261 ],
        [0.5169261 ]]], dtype=float32)>

## Model 2: LSTM

In [99]:
# Create the model
model2 = tf.keras.Sequential([
    text_vect,
    tf.keras.layers.Embedding(
        input_dim = max_vocab_length,
        output_dim = 128
    ),
    tf.keras.layers.LSTM(units=64, return_sequences=True),
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [100]:
# Get summary
model2.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_10 (Embedding)    (None, 15, 128)           128000    
                                                                 
 lstm_2 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 210497 (822.25 KB)
Trainable params: 210497 (822.25 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [101]:
# Compile the model
model2.compile(
    loss='binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [102]:
# Fit the model
history2 = model2.fit(
    x = train_sentences,
    y = train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Visualize our model with https://projector.tensorflow.org

In [89]:
weights = model2.get_layer('embedding_8').get_weights()[0]
vocab = text_vect.get_vocabulary()

In [90]:
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

## Model 3: Using GRU

In [105]:
# Create the model
model3 = tf.keras.Sequential([
    text_vect,
    tf.keras.layers.Embedding(
        input_dim = max_vocab_length,
        output_dim = 128
    ),
    tf.keras.layers.GRU(64, return_sequences=True),
    tf.keras.layers.GRU(64),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [106]:
# Get summary
model3.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_12 (Embedding)    (None, 15, 128)           128000    
                                                                 
 gru_1 (GRU)                 (None, 15, 64)            37248     
                                                                 
 gru_2 (GRU)                 (None, 64)                24960     
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 190273 (743.25 KB)
Trainable params: 190273 (743.25 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [107]:
# Compile the model
model3.compile(
    loss='binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [108]:
# Fit the model
history3 = model3.fit(
    x = train_sentences,
    y = train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Model 3: Using GRU

In [105]:
# Create the model
model3 = tf.keras.Sequential([
    text_vect,
    tf.keras.layers.Embedding(
        input_dim = max_vocab_length,
        output_dim = 128
    ),
    tf.keras.layers.GRU(64, return_sequences=True),
    tf.keras.layers.GRU(64),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [106]:
# Get summary
model3.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_12 (Embedding)    (None, 15, 128)           128000    
                                                                 
 gru_1 (GRU)                 (None, 15, 64)            37248     
                                                                 
 gru_2 (GRU)                 (None, 64)                24960     
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 190273 (743.25 KB)
Trainable params: 190273 (743.25 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [107]:
# Compile the model
model3.compile(
    loss='binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [108]:
# Fit the model
history3 = model3.fit(
    x = train_sentences,
    y = train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Model 4: Bidirectional LSTM x GRU

In [117]:
# Create the model
model4 = tf.keras.Sequential([
    text_vect,
    tf.keras.layers.Embedding(
        input_dim = max_vocab_length,
        output_dim=128
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=64, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(units=64)
    ),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [118]:
model4.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_15 (Embedding)    (None, 15, 128)           128000    
                                                                 
 bidirectional_4 (Bidirecti  (None, 15, 128)           98816     
 onal)                                                           
                                                                 
 bidirectional_5 (Bidirecti  (None, 128)               74496     
 onal)                                                           
                                                                 
 dense_14 (Dense)            (None, 1)                 129       
                                                     

In [119]:
# Compile the model
model4.compile(
    loss='binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [120]:
# Fit the model
history4 = model4.fit(
    x = train_sentences,
    y = train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Model 5: CNN1D

In [125]:
# Create the model
model5 = tf.keras.Sequential([
    text_vect,
    tf.keras.layers.Embedding(
        input_dim = max_vocab_length,
        output_dim=128
    ),
    tf.keras.layers.Conv1D(10, 3),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [126]:
model5.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_18 (Embedding)    (None, 15, 128)           128000    
                                                                 
 conv1d_2 (Conv1D)           (None, 13, 10)            3850      
                                                                 
 global_average_pooling1d_3  (None, 10)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_17 (Dense)            (None, 1)                 11        
                                                                 
Total params: 131861 (515.08 KB)
Trainable params: 13

In [127]:
# Compile the model
model5.compile(
    loss='binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [129]:
# Fit the model
history5 = model5.fit(
    x = train_sentences,
    y = train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Transfer Learning Feature Extractions

In [None]:
import tensorflow_hub as hub
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [None]:
# Create the model
model6 = tf.keras.Sequential([
    hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                  input_shape=[],
                  dtype=tf.string,
                  trainable=False),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Compile the model
model6.compile(
    loss='binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [None]:
# Fit the model
history6 = model6.fit(
    x = train_sentences,
    y = train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels)
)