In [1]:
# Check GPU
!nvidia-smi -L

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [2]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [3]:
## Get text dataset
## Kaggle's introduction to NLP dataset
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

#Unzip data
unzip_data('nlp_getting_started.zip')

--2021-11-19 22:12:24--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.1.128, 108.177.121.128, 142.250.103.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.1.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2021-11-19 22:12:24 (97.3 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [4]:
import pandas as pd
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [5]:
train_df.shape, test_df.shape

((7613, 5), (3263, 4))

In [6]:
train_df['text'][1]

'Forest fire near La Ronge Sask. Canada'

In [7]:
# shuffle training data
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [8]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [9]:
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [10]:
# number of samples
len(train_df), len(test_df)

(7613, 3263)

In [11]:
# visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[['text', 'target']][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f'Target: {target}', "(real diaster)" if target >0 else "(not real diaster)")
  print(f'Text:\n{text}\n')
  print('---\n')

Target: 0 (not real diaster)
Text:
I just collapsed in my bed ugh I'm exhausted

---

Target: 1 (real diaster)
Text:
US Navy Sidelines 3 Newest Subs - http://t.co/guvTIzyCHE: DefenseNews.comUS Navy Sidelines 3 Newest SubsD... http://t.co/SY2WhXT0K5 #navy

---

Target: 0 (not real diaster)
Text:
Nuclear reactor railguns would be a great way to deliver t1000s.

---

Target: 0 (not real diaster)
Text:
Russian #ushanka #winter #military fur hat (xl61-62) with soviet badge LINK:
http://t.co/74YFQxvAK0 http://t.co/KXrEHVt6hL

---

Target: 1 (real diaster)
Text:
#World #News Qld police wrap Billy Gordon investigation: QUEENSLAND Police have wrapped up their investigation...  http://t.co/msgnNDxOeK

---



In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size = 0.1,
                                                                            random_state = 42)

In [14]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [15]:
# first ten sentences
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

In [16]:
# Convert text into numbers
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [17]:
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize = "lower_and_strip_punctuation",
                                    split = 'whitespace',
                                    ngrams = None,
                                    output_mode = "int",
                                    output_sequence_length=None,
                                    pad_to_max_tokens = False)

In [18]:
# Find the average number of tokens in the training tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [19]:
# Setup text vectorization variables
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode = 'int',
                                    output_sequence_length=max_length)

In [20]:
# Fit the text vectorizer to the training set
text_vectorizer.adapt(train_sentences)

In [21]:
# Create a sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [22]:
# random sentece from training set
random_sentence = random.choice(train_sentences)
print(f'Original text:\n {random_sentence}\
      \n\n Vectorized version:')
text_vectorizer([random_sentence])

Original text:
 @BabySweet420 I'm mad 420 in your name &amp; you don't blaze.      

 Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  1,  32, 942,   1,   4,  33, 735,  35,  12,  63, 749,   0,   0,
          0,   0]])>

In [23]:
# number of unique word in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f'Number of words in vocab: {len(words_in_vocab)}')
print(f'5 Most common words: {top_5_words}')
print(f'5 least common words: {bottom_5_words}')

Number of words in vocab: 10000
5 Most common words: ['', '[UNK]', 'the', 'a', 'in']
5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [24]:
# Create embedding layer
from tensorflow.keras import layers
embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim=128,
                             input_length = max_length
                             )


In [25]:
# Get a random sentence from the taining set
random_sentence = random.choice(train_sentences)
print(f'Original text: \n {random_sentence}\
      \n \n Embedded version:')

# Embed a random sentence
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text: 
 Micom 2015 Summer Contrast Candy Color Bowknot Cross Body Tote Shoulder Bags for Womengirls with Micom Zip Po http://t.co/sQMTKKJiMJ      
 
 Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01903695, -0.01161213,  0.00608771, ..., -0.03312021,
          0.04975914, -0.03074349],
        [ 0.002591  ,  0.02098775,  0.0319727 , ..., -0.01221076,
          0.01531209,  0.04174173],
        [ 0.04276116,  0.0265025 ,  0.04668114, ...,  0.00769026,
          0.03456128,  0.04704257],
        ...,
        [-0.04089712, -0.04399662, -0.04070786, ...,  0.02465205,
         -0.0343545 , -0.0115487 ],
        [-0.0220811 , -0.02680372,  0.01468242, ...,  0.02761325,
         -0.04141921, -0.01722528],
        [ 0.03227765,  0.00489802,  0.03134326, ..., -0.0315999 ,
         -0.01090459, -0.0398368 ]]], dtype=float32)>

In [26]:
# single token's embedding
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.01903695, -0.01161213,  0.00608771,  0.01744797,  0.00083534,
        -0.00267769, -0.00678248,  0.02209315,  0.03798189,  0.03599271,
        -0.00293721,  0.02305632, -0.00978351, -0.02220939, -0.00500274,
        -0.04456649, -0.03200245,  0.00646915,  0.04486578,  0.01987005,
        -0.04207226,  0.01966241, -0.0037348 ,  0.00283623,  0.02139255,
        -0.0380748 ,  0.02955062,  0.04603736,  0.04650393, -0.03132568,
        -0.039582  , -0.00255871,  0.01754444, -0.02823217,  0.04902979,
         0.00180085,  0.02127345,  0.04564751, -0.01971078,  0.01144988,
        -0.03670366,  0.01736697,  0.00052469,  0.00453528, -0.01762826,
        -0.02019324, -0.04032677,  0.03185601, -0.03832394, -0.01147114,
        -0.04899549,  0.01984357,  0.01831912,  0.01657324,  0.03009688,
         0.01606965, -0.02097012, -0.01314334, -0.00752567, -0.02179955,
         0.00181774, -0.02823379, -0.02853388, -0.03562421, -0.01010022,
  

In [27]:
# Model 0 (Baseline model)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()),
                    ("clf", MultinomialNB()),
])

# Fit pipeline to the training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [28]:
# Evaluate model
baseline_score = model_0.score(val_sentences, val_labels)
print(f'Baseline model has accuracy of: {baseline_score*100:2f}%')

Baseline model has accuracy of: 79.265092%


In [29]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)

In [30]:
baseline_preds[:10]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0])

In [31]:
# create an evaluation function for modelling experiments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):

  """
  Calculates model accuracy, precision, recall and f1-score for binary classification model.

  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred)*100

  # Calculate model precision, recall and f1-score using wieghted average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
  model_results = {"accuracy": model_accuracy,
                   "precision": model_precision,
                   "recall": model_recall,
                   "f1": model_f1}
  return model_results

In [32]:
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred = baseline_preds)

baseline_results

{'accuracy': 79.26509186351706,
 'f1': 0.7862189758049549,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706}

In [33]:
# Model_1: A simple dense model
# Create a tensorboard callback
from helper_functions import create_tensorboard_callback

# Ctreate a directory to save a TensorBoard logs
SAVE_DIR = "model_logs"

In [34]:
# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding 
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

In [35]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [36]:
# Compile model
model_1.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [37]:
# Fit the model
model_1_history = model_1.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR, 
                                                                     experiment_name="simple_dense_model")])

Saving TensorBoard log files to: model_logs/simple_dense_model/20211119-221226
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
model_1.evaluate(val_sentences, val_labels)



[0.4800243079662323, 0.7755905389785767]

In [39]:
model_1_pred_probs = model_1.predict(val_sentences)

In [40]:
model_1_pred_probs[0]

array([0.3962436], dtype=float32)

In [41]:
# Convert model predictions probabilities to labels
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [42]:
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred = model_1_preds)

model_1_results

{'accuracy': 77.55905511811024,
 'f1': 0.7725109164504043,
 'precision': 0.7797991425654591,
 'recall': 0.7755905511811023}

In [43]:
baseline_results

{'accuracy': 79.26509186351706,
 'f1': 0.7862189758049549,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706}

In [44]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

In [45]:
# Get the vocabulary from the text vectorization
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [46]:
# Model 1 summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [48]:
# Get the weight matrix of embedding layer 
# (these are the numerical patterns between the text in the training dataset the model has learned)
embed_weights = model_1.get_layer("embedding").get_weights()[0]
print(embed_weights.shape) # same size as vocab size and embedding_dim (each word is a embedding_dim size vector)

(10000, 128)


In [49]:
print(embed_weights[0].shape)

(128,)


In [50]:
# Create embedding files
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [51]:
#try:
# from google.colab import files
#  files.download('vectors.tsv')
#  files.download('metadata.tsv')
#except Exception:
#  pass

In [52]:
# Create an LSTM model
from tensorflow.keras import layers

input = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
#print(x.shape)
#x = layers.LSTM(64, return_sequences=True)(x) # for stacking lSTMs
#print(x.shape)
x = layers.LSTM(64)(x)
#print(x.shape)
#x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_2 = tf.keras.Model(inputs, outputs, name='model_2_LSTM')

In [53]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [54]:
# Compile the model
model_2.compile(loss = 'binary_crossentropy',
                optimizer = 'adam',
                metrics = ['accuracy'])

In [55]:
# Fit the model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data=(val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR,
                                                                       'model_2_LSTM')])

Saving TensorBoard log files to: model_logs/model_2_LSTM/20211119-221304
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [56]:
# Make predictions with the LSTM model
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]

array([[4.8045516e-03],
       [7.2051704e-01],
       [9.9981827e-01],
       [1.5429348e-02],
       [7.6025724e-04],
       [9.9881345e-01],
       [9.8124582e-01],
       [9.9985307e-01],
       [9.9975646e-01],
       [6.0078174e-01]], dtype=float32)

In [57]:
# Convert prediction probabilities to labels
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [58]:
# Calculate model 2 results
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred = model_2_preds)

model_2_results

{'accuracy': 77.55905511811024,
 'f1': 0.7743062301518678,
 'precision': 0.7759863909628747,
 'recall': 0.7755905511811023}

In [59]:
baseline_results

{'accuracy': 79.26509186351706,
 'f1': 0.7862189758049549,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706}

In [60]:
## model 3: GRU (gated recurrent unit)

from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64)(x)
#print(x.shape)
#x = layers.LSTM(64, return_sequences=True)(x)
#print(x.shape)
#x = layers.GRU(64)(x)
#print(x.shape)
#x = layers.Dense(64, activation='relu')(x)
#x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs, name='model_3_GRU')

In [61]:
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [62]:
model_3.compile(loss='binary_crossentropy',
                optimizer = 'adam',
                metrics=['accuracy'])

In [63]:
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR,
                                                                       'model_3_GRU')])

Saving TensorBoard log files to: model_logs/model_3_GRU/20211119-221336
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]

array([[1.20712310e-01],
       [7.90693760e-01],
       [9.99902427e-01],
       [1.18103415e-01],
       [1.18792341e-04],
       [9.99479771e-01],
       [8.57475162e-01],
       [9.99948740e-01],
       [9.99902606e-01],
       [8.17297697e-01]], dtype=float32)

In [65]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_pred_probs[:10]

array([[1.20712310e-01],
       [7.90693760e-01],
       [9.99902427e-01],
       [1.18103415e-01],
       [1.18792341e-04],
       [9.99479771e-01],
       [8.57475162e-01],
       [9.99948740e-01],
       [9.99902606e-01],
       [8.17297697e-01]], dtype=float32)

In [66]:
model_3_results = calculate_results(y_true=val_labels,
                                    y_pred=model_3_preds)


model_3_results

{'accuracy': 77.03412073490814,
 'f1': 0.7696090403305004,
 'precision': 0.7700604729059295,
 'recall': 0.7703412073490814}

In [67]:
## Model 4: Bidirectional RNN
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
#x = layers.Bidirectional(layers.GRU(64))(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs, name = 'model_4_bidirectional')

In [68]:
model_4.summary()

Model: "model_4_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,3

In [69]:
# Compile the model
model_4.compile(loss='binary_crossentropy',
                optimizer = 'adam',
                metrics = ['accuracy'])

In [70]:
# Fit the model
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR,
                                                                       'model_4_bidirectional')])

Saving TensorBoard log files to: model_logs/model_4_bidirectional/20211119-221420
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

array([[2.0465255e-04],
       [8.0620629e-01],
       [9.9998951e-01],
       [4.3168306e-02],
       [1.4240984e-05],
       [9.7939450e-01],
       [1.2916058e-02],
       [9.9999142e-01],
       [9.9998724e-01],
       [9.9699759e-01]], dtype=float32)

In [72]:
# Convert pred probs to pred labels
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 0., 1., 1., 1.], dtype=float32)>

In [73]:
model_4_results = calculate_results(y_true=val_labels,
                                    y_pred = model_4_preds)

model_4_results

{'accuracy': 77.16535433070865,
 'f1': 0.7685968913498257,
 'precision': 0.7755054154623078,
 'recall': 0.7716535433070866}

In [74]:
baseline_results

{'accuracy': 79.26509186351706,
 'f1': 0.7862189758049549,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706}

In [75]:
# Test out embedding layer, Conv1D layer and max pooling
embedding_test = embedding(text_vectorizer(["this is a test sentence"]))
conv_1d = layers.Conv1D(filters=32,
                        kernel_size=5,
                        activation='relu',
                        padding="valid")

conv_1d_output = conv_1d(embedding_test)
max_pool = layers.GlobalMaxPooling1D()
max_pool_output = max_pool(conv_1d_output)

embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))

In [76]:
embedding_test

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.0567638 ,  0.00620732,  0.0586348 , ...,  0.02329377,
          0.05133513,  0.02865776],
        [-0.02867325, -0.022655  , -0.01895995, ...,  0.02158034,
          0.04950592,  0.05161981],
        [ 0.03267182,  0.04003241, -0.0118691 , ...,  0.0037484 ,
          0.04170733,  0.00421341],
        ...,
        [ 0.03485788, -0.03916227,  0.04025098, ..., -0.01422921,
          0.00917294,  0.00254906],
        [ 0.03485788, -0.03916227,  0.04025098, ..., -0.01422921,
          0.00917294,  0.00254906],
        [ 0.03485788, -0.03916227,  0.04025098, ..., -0.01422921,
          0.00917294,  0.00254906]]], dtype=float32)>

In [77]:
conv_1d_output

<tf.Tensor: shape=(1, 11, 32), dtype=float32, numpy=
array([[[0.03841533, 0.        , 0.        , 0.        , 0.01076279,
         0.05258829, 0.07141263, 0.        , 0.        , 0.        ,
         0.        , 0.02508487, 0.        , 0.05835299, 0.06922501,
         0.        , 0.        , 0.08665226, 0.        , 0.        ,
         0.00590851, 0.03659828, 0.        , 0.04026986, 0.0664205 ,
         0.00751897, 0.        , 0.04763882, 0.        , 0.07003564,
         0.02085503, 0.00830361],
        [0.02967061, 0.        , 0.        , 0.03037643, 0.        ,
         0.08793002, 0.01674295, 0.03069116, 0.        , 0.08105101,
         0.        , 0.02305288, 0.02485244, 0.02389897, 0.05293202,
         0.        , 0.        , 0.03603096, 0.        , 0.00938548,
         0.00324193, 0.04961966, 0.03328489, 0.06752726, 0.0119764 ,
         0.03745362, 0.        , 0.        , 0.        , 0.0533834 ,
         0.00485062, 0.        ],
        [0.05926793, 0.        , 0.        , 0.    

In [82]:
# Create a 1D convolution layer to model sequences
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(filters=64, kernel_size=5,strides=1 ,activation='relu', padding='valid')(x)
x = layers.GlobalMaxPooling1D()(x)
# x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_5 = tf.keras.Model(inputs, outputs, name = 'model_5_Conv1D')

# Complile Conv1D model
model_5.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])


In [83]:
model_5.summary()

Model: "model_5_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_3 (Conv1D)           (None, 11, 64)            41024     
                                                                 
 global_max_pooling1d_3 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 1)              

In [90]:
# fit the model
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data = (val_sentences,val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR,
                                                                       'Conv1D')])

Saving TensorBoard log files to: model_logs/Conv1D/20211119-224715
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [101]:
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]

array([[3.0590415e-02],
       [9.5465422e-01],
       [9.9979925e-01],
       [5.9175134e-02],
       [3.4239676e-07],
       [9.8799354e-01],
       [9.7471642e-01],
       [9.9992019e-01],
       [9.9999881e-01],
       [8.5584342e-01]], dtype=float32)

In [102]:
# Convert model 5 pred probs to labels
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [103]:
# evaluate model5
model_5_results = calculate_results(y_true=val_labels,
                                    y_pred = model_5_preds)

model_5_results

{'accuracy': 75.19685039370079,
 'f1': 0.7497885511234939,
 'precision': 0.7529888699847289,
 'recall': 0.7519685039370079}

In [104]:
baseline_results

{'accuracy': 79.26509186351706,
 'f1': 0.7862189758049549,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706}

In [105]:
# Model 6 : tensorflow hub pretrained model (USE feature extractor)
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embed_samples = embed([sample_sentence,
                       "When you call the universal sentence encoder on a sentence, it turn it into numbers"])

print(embed_samples[0][:50])

tf.Tensor(
[-0.01157025  0.02485911  0.02878051 -0.012715    0.03971541  0.08827761
  0.02680988  0.05589839 -0.01068731 -0.00597293  0.00639321 -0.01819516
  0.00030816  0.09105889  0.05874646 -0.03180629  0.01512474 -0.05162925
  0.00991366 -0.06865345 -0.04209306  0.0267898   0.03011009  0.00321065
 -0.00337968 -0.04787357  0.0226672  -0.00985927 -0.04063615 -0.01292093
 -0.04666382  0.05630299 -0.03949255  0.00517682  0.02495827 -0.07014439
  0.0287151   0.0494768  -0.00633978 -0.08960193  0.02807119 -0.00808364
 -0.01360601  0.05998649 -0.10361788 -0.05195373  0.00232958 -0.02332531
 -0.03758106  0.03327729], shape=(50,), dtype=float32)


In [106]:
embed_samples

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[-0.01157025,  0.02485911,  0.02878051, ..., -0.00186124,
         0.02315822, -0.01485021],
       [ 0.03569669, -0.08161653, -0.03142678, ..., -0.05226755,
         0.03114403, -0.00885672]], dtype=float32)>

In [107]:
sample_sentence

"There's a flood in my street!"

In [109]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", 
                                         input_shape=[], # input can be variable length, output is a vector of length 512
                                         dtype=tf.string,
                                         trainable=False,
                                         name='USE')

In [128]:
# Create model using sequential API
model_6 = tf.keras.Sequential([
  sentence_encoder_layer,
  layers.Dense(64, activation='relu'),
  layers.Dense(1, activation='sigmoid')
], name='model_6_USE')

# Compile model
model_6.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [129]:
model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_10 (Dense)            (None, 64)                32832     
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [130]:
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     'tf_hub_USE')])

Saving TensorBoard log files to: model_logs/tf_hub_USE/20211120-001543
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [131]:
# make predictions with USE model
model_6_pred_probs = model_6.predict(val_sentences)
model_6_pred_probs[:10]

array([[0.19140458],
       [0.79184014],
       [0.98871124],
       [0.21091512],
       [0.8018575 ],
       [0.76326025],
       [0.9795474 ],
       [0.9829999 ],
       [0.9442698 ],
       [0.10509592]], dtype=float32)

In [132]:
# Convert prediction probabilities to labels
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 1., 1., 1., 1., 1., 0.], dtype=float32)>

In [133]:
# model_6 results
model_6_results = calculate_results(y_true = val_labels,
                                    y_pred = model_6_preds)

model_6_results

{'accuracy': 82.1522309711286,
 'f1': 0.821051835664206,
 'precision': 0.8215093851904876,
 'recall': 0.821522309711286}

In [134]:
# Model_7: TF Hub with pretrained USE on only 10 percent of the data
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [137]:
train_10_precent = train_df_shuffled[["text", "target"]].sample(frac=0.1, random_state=42)
train_sentences_10_precent = train_10_precent["text"].to_list()
train_labels_10_precent = train_10_precent['target'].to_list()

In [138]:
len(train_df_shuffled), len(train_10_precent)

(7613, 761)

In [139]:
# Check the number of targets inout subset of data
train_10_precent['target'].value_counts()

0    413
1    348
Name: target, dtype: int64

In [140]:
train_df_shuffled['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [162]:
# Clone model_6
model_7 = tf.keras.models.clone_model(model_6)

# Compile model
model_7.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [163]:
model_7.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_10 (Dense)            (None, 64)                32832     
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [152]:
model_7_history = model_7.fit(train_sentences_10_precent,
                              train_labels_10_precent,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     'tf_hub_USE_10_percent')])

Saving TensorBoard log files to: model_logs/tf_hub_USE_10_percent/20211120-004545
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [153]:
# make predictions with USE model
model_7_pred_probs = model_7.predict(val_sentences)
model_7_pred_probs[:10]

array([[0.14635497],
       [0.7494444 ],
       [0.9975699 ],
       [0.29313022],
       [0.7634861 ],
       [0.8319789 ],
       [0.98993653],
       [0.9963395 ],
       [0.97040606],
       [0.03203771]], dtype=float32)

In [154]:
# Convert prediction probabilities to labels
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
model_7_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 1., 1., 1., 1., 1., 0.], dtype=float32)>

In [155]:
# model_7 results
model_7_results = calculate_results(y_true = val_labels,
                                    y_pred = model_7_preds)

model_7_results

{'accuracy': 87.00787401574803,
 'f1': 0.8692736261433838,
 'precision': 0.8722368010352193,
 'recall': 0.8700787401574803}

In [156]:
model_6_results

{'accuracy': 82.1522309711286,
 'f1': 0.821051835664206,
 'precision': 0.8215093851904876,
 'recall': 0.821522309711286}

In [159]:
# Fixing the data leakage of the 10 percent data
train_10_precent_split = int(0.1*len(train_sentences))
train_sentences_10_precent = train_sentences[:train_10_precent_split]
train_labels_10_precent = train_labels[:train_10_precent_split]

In [160]:
train_df_shuffled['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [164]:
model_7_history = model_7.fit(train_sentences_10_precent,
                              train_labels_10_precent,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     'tf_hub_USE_10_percent_fixed')])

Saving TensorBoard log files to: model_logs/tf_hub_USE_10_percent_fixed/20211120-010906
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
