# Section 10

## NLP Fundamentals in TensorFlow

In [1]:
import tensorflow as tf

# Report the installed TensorFlow release, then how many GPUs it can see
print("TensorFlow Version: ", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

TensorFlow Version:  2.9.0
Num GPUs Available:  1


In [2]:
from utils import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

### Load and visualize data

In [3]:
import pandas as pd

# Both CSVs live in the same dataset folder, so name it once
dataset_dir = "../datasets/nlp-getting-started"
train_df = pd.read_csv(f"{dataset_dir}/train.csv")
test_df = pd.read_csv(f"{dataset_dir}/test.csv")

# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Shuffle every row (frac=1); the fixed seed makes the new ordering repeatable.
# NOTE: this rebinds `train_df`, so re-running the cell shuffles the already-shuffled frame.
train_df = train_df.sample(frac=1, random_state=42)

# Number of examples per target class
train_df["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for validation; the fixed seed keeps the split repeatable
tweet_texts = train_df["text"].to_numpy()
tweet_targets = train_df["target"].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)

# Sizes of the two splits
len(train_sentences), len(val_sentences)

(6851, 762)

In [6]:
# Peek at the first five raw training sentences
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [7]:
# Mean number of whitespace-separated tokens per training sentence, rounded to an integer
token_counts = [len(sentence.split()) for sentence in train_sentences]
round(sum(token_counts) / len(train_sentences))

15

In [8]:
from tensorflow.keras.layers import TextVectorization

# Limits reused by both the text vectorizer and the embedding layer
max_vocab_length = 10000  # upper bound on the number of entries kept in the vocabulary
max_length = 15  # number of token ids emitted per sentence (matches the rounded mean sentence length above)

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",  # emit integer token ids
    output_sequence_length=max_length,
)

In [9]:
# Build the vectorizer's vocabulary from the training sentences only (validation text is not used)
text_vectorizer.adapt(train_sentences)

In [10]:
# Sanity check: vectorize one hand-written sentence; the result has max_length ids, with 0 used as padding
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [11]:
# Inspect the adapted vocabulary: its size plus its first and last five entries
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_word, bottom_5_word = words_in_vocab[:5], words_in_vocab[-5:]
print(len(words_in_vocab))
print(top_5_word)
print(bottom_5_word)

10000
['', '[UNK]', 'the', 'a', 'in']
['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Create an Embedding using an Embedding Layer

In [12]:
from tensorflow.keras import layers

# Lookup table that maps each token id to a 128-dimensional vector
embedding = layers.Embedding(
    input_dim=max_vocab_length,  # one row per vocabulary slot
    output_dim=128,  # length of each embedding vector
    input_length=max_length,  # token ids per vectorized sentence
)

embedding

<keras.layers.core.embedding.Embedding at 0x23bc769ad90>

In [13]:
import random

# Pick one training sentence at random (unseeded, so it differs between runs)
random_sentence = random.choice(train_sentences)

# Single-line f-string: the previous backslash continuation inside the literal
# pulled the next line's indentation into the printed text as trailing spaces.
print(f"Original text:\n {random_sentence}\n\nEmbedded version:")

# Embed the random sentence: text -> token ids -> one vector per token id
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
 #Turkish couple decided to feed 4000 #Syrian #refugees as part of their #wedding celebrations http://t.co/EHLq3ZSPTd http://t.co/DjX5eLbrv1      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.03760554,  0.00534957, -0.01924459, ...,  0.01344767,
         -0.00310113, -0.00246859],
        [-0.03080246,  0.00241169,  0.01142901, ...,  0.00113864,
          0.01700186, -0.01691245],
        [-0.01705037,  0.04608745, -0.01928799, ...,  0.04403783,
          0.03297207,  0.01617283],
        ...,
        [-0.04622445,  0.03091918, -0.00319333, ..., -0.01852305,
          0.00046947, -0.04913305],
        [-0.01067124, -0.03106203,  0.00611861, ...,  0.01351634,
         -0.00092933,  0.00196908],
        [ 0.02833349, -0.03605727, -0.04865879, ...,  0.048621  ,
         -0.04865059,  0.03496059]]], dtype=float32)>

### Model 0: **Naive Bayes**

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [15]:
# Baseline pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier
model_0 = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

# Train the whole pipeline on the raw training sentences and their labels
model_0.fit(train_sentences, train_labels)

In [16]:
# Evaluate the baseline model
# `score` is a fraction in [0, 1]; it is multiplied by 100 below to print a percentage
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Accuracy score: {baseline_score*100:.2f}%")

Accuracy score: 79.27%


In [17]:
# Predicted class labels for the validation sentences; show the first 20
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [18]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Score predicted labels against the true labels.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Note the mixed units: accuracy is a percentage (the score multiplied by 100),
  while precision, recall and f1 are the unscaled "weighted"-average values.
  """
  # Accuracy, scaled to a percentage
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # "weighted" average across classes; the fourth return value (support) is discarded
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return dict(accuracy=accuracy_pct,
              precision=precision,
              recall=recall,
              f1=f1)

In [19]:
# Baseline metrics on the validation split, kept for comparison with later models
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

### Model 1: **Feed-forward neural network**

In [20]:
# Import the TensorBoard callback factory (a fresh callback is created for each model at fit time)
from utils import create_tensorboard_callback

# Name of the directory the TensorBoard logs are saved under (this cell only sets the name; nothing is created here)
SAVE_DIR = "model_logs"

In [28]:
# Build model with the Functional API
from tensorflow.keras import layers
# NOTE: `text_vectorizer` and `embedding` are the layer instances created in earlier cells,
# so this model uses (and training it updates) that same shared embedding layer.
inputs = layers.Input(shape=(1,), dtype=tf.string) # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numberized inputs
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding: (None, 15, 128) -> (None, 128)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layers, sigmoid activation function for binary outputs

model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")


In [29]:
# Print each layer's output shape and parameter count
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0

In [30]:
# Compile model
# Binary cross-entropy pairs with the single sigmoid output unit; Adam is used with its default settings
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [31]:
# Fit the model
# Trains for 5 epochs on the raw training sentences and evaluates on the validation split.
# The callback comes from the local `utils` module; judging by its printed message it writes
# TensorBoard logs under SAVE_DIR/<experiment_name>/<timestamp> (confirm in utils).
model_1_history = model_1.fit(x=train_sentences, 
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_1_dense")])

Saving TensorBoard log files to: model_logs/model_1_dense/20230402-221013
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Returns [loss, accuracy] on the validation split (the compiled loss followed by the compiled metric)
model_1.evaluate(val_sentences, val_labels)



[0.4823460280895233, 0.7860892415046692]

In [36]:
# Sigmoid outputs for the validation sentences: one probability per sentence, shape (n_samples, 1)
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape, model_1_pred_probs[0]



((762, 1), array([0.3708497], dtype=float32))

In [39]:
# Convert model prediction probabilities to label format
# tf.round maps each probability to 0. or 1.; tf.squeeze drops the size-1 axis so the result is 1D
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [41]:
# Calculate our model_1 results
# Same metric dictionary as the baseline, so the two can be compared key by key
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred=model_1_preds)
model_1_results

{'accuracy': 78.60892388451444,
 'precision': 0.7911704681403405,
 'recall': 0.7860892388451444,
 'f1': 0.7830068921982195}

### Visualize learned embeddings

In [45]:
# Get the vocabulary from the text vectorization layer
# Show its size and first ten entries; index 0 is '' (padding) and index 1 is '[UNK]'
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [42]:
# Re-print the summary to read off the embedding layer's name ("embedding") for get_layer below
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0

In [44]:
# Get the weight matrix of embedding layer
# get_weights() returns a list; element 0 is the embedding matrix, indexed by token id along its first axis
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights

array([[-0.00256008, -0.00530154, -0.00143902, ...,  0.0234034 ,
         0.00812573,  0.03349238],
       [ 0.0335928 , -0.04052846, -0.05375193, ...,  0.04315491,
        -0.04300399,  0.03132113],
       [ 0.04754218, -0.00278127, -0.05466916, ..., -0.02530761,
        -0.02235422,  0.00771035],
       ...,
       [ 0.04117909, -0.00868917,  0.02117223, ..., -0.03969456,
        -0.02188742, -0.04694946],
       [ 0.01847334, -0.03556723, -0.0453844 , ..., -0.04439948,
         0.07945544, -0.0660545 ],
       [ 0.11018836, -0.03468871, -0.11180779, ..., -0.02986466,
         0.06462635, -0.08781251]], dtype=float32)

In [46]:
# Create embedding files:
#   vectors.tsv  - one line per word, its embedding values separated by tabs
#   metadata.tsv - the matching word on the same line number
# (presumably for loading into an embedding visualizer such as the TensorFlow Embedding Projector)
import io

# `with` guarantees both files are flushed and closed even if a write raises part-way through
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(words_in_vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = embed_weights[index]  # row `index` of the weight matrix belongs to vocabulary entry `index`
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")


### Model 2: **LSTM model (RNN)**

### Model 3: **GRU model (RNN)**

### Model 4: **Bidirectional-LSTM model (RNN)**

### Model 5: **1D Convolutional Neural Network (CNN)**

### Model 6: **TensorFlow Hub Pretrained Feature Extraction**
(Using transfer learning for NLP)

### Model 7: **Model 6 with 10% of training data**