<a href="https://colab.research.google.com/github/JaperTai77/Tensorflow_DL/blob/main/tf2_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Preparation

#### Get data

In [1]:
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

f = "nlp_getting_started.zip"
import zipfile
zip_ref = zipfile.ZipFile(f, "r")
zip_ref.extractall()
zip_ref.close()

--2021-11-15 08:22:45--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.152.128, 209.85.200.128, 74.125.129.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.152.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2021-11-15 08:22:45 (103 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [2]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Preprocessing

#### Shuffle

In [3]:
train = train.sample(frac = 1, random_state=42)
train.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

#### Split data

In [4]:
from sklearn.model_selection import train_test_split

train_sentence, val_sentence, train_label, val_label = train_test_split(train['text'].to_numpy(),
                                                                        train['target'].to_numpy(),
                                                                        test_size = 0.1)

In [None]:
train_sentence[:10]

array(["'There was a small earthquake in LA but don't worry Emily Rossum is fine' #difficultpeople is great",
       "I'll cry until my pity party's in flames ????",
       "There has not been 1 real tear out of #Shelli 's eyes this entire episode. #bb17",
       "@FurTrix then find cougars who look like her even better if they're in military uniform!",
       "'Sometimes God uses sorrowful tragedy to set the stage for glorious redemption.' -David Platt Run for\x89Û_ https://t.co/86V81dv00E",
       'U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... https://t.co/m8MvDSPJp7 via @Change',
       '\x89ÛÏRichmond Coaches were devastated to hear of the death of their second driver Mr Chance who was sitting\x89Û_: Jam... http://t.co/dIalTa6t69',
       "@asymbina @tithenai I'm hampered by only liking cross-body bags. I really like Ella Vickers bags: machine washable. http://t.co/YsFYEahpVg",
       '@r_lauren83199 @xojademarie124 i hope y

#### Visualize random example

In [None]:
import random
random_index = random.randint(0,len(train)-3)
for row in train[['text','target']][random_index:random_index+3].itertuples():
  _, text, target = row
  print(f'Target: {target}', 'positive' if target == 0 else 'negative')
  print(f'Text: \n {text}')
  print('------')

Target: 0 positive
Text: 
 Welcome @djryanwolf @djcoreygrand @djknyce @djoneplustwo @OfficialCoreDJs #Family #Cleveland #StandUp @IAMTONYNEAL http://t.co/P6GqmCTgLj
------
Target: 0 positive
Text: 
 @spookyfob @feelslikefob I am okay thank you yes your kindness is fatal though it's like Patrick stump level kindness.
------
Target: 1 negative
Text: 
 Hundreds feared drowned after another Mediterranean asylum seeker boat sinking http://t.co/zsYkzj2bzG
------


#### Tokenization (text vectorization)

In [5]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization as token

text_vector = token(max_tokens = None, # Maximum size of the vocabulary
                    standardize="lower_and_strip_punctuation", # how to process text
                    split = 'whitespace',

                    ngrams=None, # no grouping, treat every token on its own
                    output_mode="int", # how to map tokens to numbers
                    # define sequence length, so all input sequence are in the same length
                    output_sequence_length=None, 
                    # pad_to_max_tokens=True (fill zeros to fit the length)
                    # Not valid if using max_tokens=None
                    )

In [6]:
# Set up vectorization
max_vocab_length = 1000
max_sent_length = 15

text_vector = token(max_tokens=max_vocab_length,
                    output_mode="int",
                    output_sequence_length=max_sent_length)

# Transform
text_vector.adapt(train_sentence)

# example
text_vector(['There is a car nearby.'])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 75,   9,   3, 133, 642,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

Example

In [7]:
def random_tokenize(text = train_sentence):
  random_sentence = random.choice(text)
  print('Original text:\n %s \n\n Vectorized version:\n %s' 
        % (random_sentence,text_vector([random_sentence])))

In [None]:
random_tokenize()

Original text:
 Debris confirmed from MH370; relatives hope for discovery of crash site: Malaysian officials confirm a breakth... http://t.co/MGYVGlENKS 

 Vectorized version:
 tf.Tensor([[250 291  20 177   1 218  10   1   6  83 579   1 538   1   3]], shape=(1, 15), dtype=int64)


Get unique word

In [None]:
words = text_vector.get_vocabulary()
wordstop5 = words[:5]
wordslow5 = words[-5:]
print(f"Number of words in vocab: {len(words)}")
print(f"Top 5 most common words: {wordstop5}") 
print(f"Bottom 5 least common words: {wordslow5}")

Number of words in vocab: 1000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['risk', 'reports', 'pradesh', 'patience', 'pamela']


#### Embedding

In [7]:
import random
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length, #input shape
                                      output_dim = 128,
                                      input_length = max_sent_length # length of each input
                                      )

embedding(text_vector([random.choice(train_sentence)]))

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04116246, -0.0365643 ,  0.01177046, ...,  0.00153387,
          0.00119026,  0.04255948],
        [-0.0441795 , -0.00272492, -0.00834078, ..., -0.01525576,
         -0.03016746, -0.02198253],
        [-0.0441795 , -0.00272492, -0.00834078, ..., -0.01525576,
         -0.03016746, -0.02198253],
        ...,
        [ 0.03900604, -0.04213583,  0.01094236, ..., -0.00350801,
          0.04636893, -0.01434063],
        [-0.0441795 , -0.00272492, -0.00834078, ..., -0.01525576,
         -0.03016746, -0.02198253],
        [ 0.04630421,  0.03839452,  0.0183639 , ..., -0.01053084,
         -0.03241856, -0.01439898]]], dtype=float32)>

## Models

### Model 0 (Naive Bayes)

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model0 = Pipeline([
                  ('tfidf', TfidfVectorizer()),
                  ('clf', MultinomialNB())
])

model0.fit(train_sentence,train_label)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [83]:
model0_score = model0.score(val_sentence,val_label)
print('Model0 score: %s' %model0_score)

Model0 score: 0.8097112860892388


In [None]:
from sklearn.metrics import classification_report

pred0 = model0.predict(val_sentence)
print(classification_report(val_label, pred0))

              precision    recall  f1-score   support

           0       0.76      0.92      0.83       433
           1       0.86      0.61      0.72       329

    accuracy                           0.79       762
   macro avg       0.81      0.77      0.77       762
weighted avg       0.80      0.79      0.78       762



### Model 1 (Dense Model)

In [84]:
import tensorflow as tf

input = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
x = text_vector(input)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) # condense each tokened feature vector to one vector
# otherwise will return a prob for each word than for the whole sentence
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
model1 = tf.keras.Model(input,output, name = 'model1')
model1.summary()

Model: "model1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           128000    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 128,129
Trainable params: 128,129
Non-trainabl

In [85]:
model1.compile(loss = 'binary_crossentropy',
               optimizer=tf.keras.optimizers.Adam(),
               metrics = ['accuracy'])
history1 = model1.fit(train_sentence, train_label, epochs = 5,
                     validation_data = (val_sentence,val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [86]:
model1.evaluate(val_sentence,val_label)



[0.4195030927658081, 0.8070865869522095]

In [87]:
import numpy as np
prob1 = model1.predict(val_sentence)
print(np.round(prob1[0:5],3))
pred1 = tf.squeeze(tf.round(prob1))

[[0.999]
 [0.928]
 [0.843]
 [0.999]
 [0.385]]


In [88]:
from sklearn.metrics import classification_report

print(classification_report(val_label, pred1))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       441
           1       0.79      0.74      0.76       321

    accuracy                           0.81       762
   macro avg       0.80      0.80      0.80       762
weighted avg       0.81      0.81      0.81       762



#### Visualizing learned embeddings¶

In [None]:
words = text_vector.get_vocabulary()
print(len(words))
embed_weight = model1.get_layer('embedding_1').get_weights()[0]
embed_weight.shape

1000


(1000, 128)

128 vectors

http://projector.tensorflow.org (use chrome)

In [None]:
import io

# Create output writers
out_v = io.open("embedding_vectors.tsv", "w", encoding="utf-8")
out_m = io.open("embedding_metadata.tsv", "w", encoding="utf-8")

# Write embedding vectors and words to file
for num, word in enumerate(words):
  if num == 0: 
      continue # skip padding token
  vec = embed_weight[num]
  out_m.write(word + "\n") # write words to file
  out_v.write("\t".join([str(x) for x in vec]) + "\n") # write corresponding word vector to file
out_v.close()
out_m.close()

# # Download files locally to upload to Embedding Projector
# try:
#   from google.colab import files
# except ImportError:
#   pass
# else:
#   files.download("embedding_vectors.tsv")
#   files.download("embedding_metadata.tsv")

### Model 2 (RNN-LSTM)

In [None]:
input = tf.keras.layers.Input(shape = (1,),dtype = tf.string)
x = text_vector(input)
print(x.shape)
x = embedding(x)
print(x.shape)
x = tf.keras.layers.LSTM(64,return_sequences= True)(x)
# when stacking RNN cells, need to return sequence otherwise error
print(x.shape) # (batch, timestamps, feature)
x = tf.keras.layers.LSTM(64)(x)
print(x.shape)
x = tf.keras.layers.Dense(64, activation = 'relu')(x)
print(x.shape)
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
model2 = tf.keras.Model(input,output,name = 'model2')

(None, 15)
(None, 15, 128)
(None, 15, 64)
(None, 64)
(None, 64)


In [None]:
model2.compile(loss = tf.keras.losses.BinaryCrossentropy(),
               optimizer = tf.keras.optimizers.Adam(),
               metrics = ['accuracy'])
history2 = model2.fit(train_sentence,train_label,epochs = 5,
                     validation_data = (val_sentence,val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
prob2 = model2.predict(val_sentence)
pred2 = tf.squeeze(tf.round(prob2))

from sklearn.metrics import classification_report

print(classification_report(val_label, pred2))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82       433
           1       0.80      0.67      0.73       329

    accuracy                           0.79       762
   macro avg       0.79      0.77      0.78       762
weighted avg       0.79      0.79      0.78       762



### Model 3 (GRU)

In [None]:
import tensorflow as tf

input = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
x = text_vector(input)
x = embedding(x)
#x = tf.keras.layers.GRU(64,return_sequences = True)(x)
#x = tf.keras.layers.LSTM(42,return_sequences = True)(x)
x = tf.keras.layers.GRU(64)(x)
x = tf.keras.layers.Dense(64, activation = 'relu')(x)
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
model3 = tf.keras.Model(input,output)
model3.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           128000    
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                             

In [None]:
model3.compile(loss = 'binary_crossentropy',
              optimizer = tf.keras.optimizers.Adam(),
              metrics = ['accuracy'])

history3 = model3.fit(train_sentence, train_label,epochs = 5,
                     validation_data = (val_sentence, val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
prob3 = model3.predict(val_sentence)
pred3 = tf.squeeze(tf.round(prob3))

from sklearn.metrics import classification_report

print(classification_report(val_label, pred3))

              precision    recall  f1-score   support

           0       0.77      0.86      0.81       433
           1       0.78      0.66      0.72       329

    accuracy                           0.77       762
   macro avg       0.78      0.76      0.76       762
weighted avg       0.78      0.77      0.77       762



### Model 4 (Bidirectional)

In [10]:
input = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
x = text_vector(input)
x = embedding(x)
print(x.shape)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences = True))(x)
print(x.shape)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64))(x)
print(x.shape)
output = tf.keras.layers.Dense(1,activation = 'sigmoid')(x)
model4 = tf.keras.Model(input,output)
model4.summary()

(None, 15, 128)
(None, 15, 128)
(None, 128)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           128000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 15, 128)          98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                 

In [12]:
model4.compile(loss = 'binary_crossentropy',
               optimizer = tf.keras.optimizers.Adam(),
               metrics = 'accuracy')
history4 = model4.fit(train_sentence, train_label,epochs = 5,
                     validation_data = (val_sentence,val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
prob4 = model4.predict(val_sentence)
pred4 = tf.squeeze(tf.round(prob4))

from sklearn.metrics import classification_report

print(classification_report(val_label, pred4))

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       432
           1       0.85      0.57      0.68       330

    accuracy                           0.77       762
   macro avg       0.80      0.75      0.75       762
weighted avg       0.79      0.77      0.76       762



### Model 5 (Conv1D)

In [9]:
embedding_test = embedding(text_vector(['This is a sample sentence']))
conv1d = tf.keras.layers.Conv1D(filters=32,
                                kernel_size = 5, # 5 words at a time
                                strides = 1, # hop through one at a time
                                activation = 'relu',
                                padding = 'same') # don't change output size
conv1d_out = conv1d(embedding_test)
max_pool = tf.keras.layers.GlobalMaxPool1D() 
# get the most important parameter
max_pool_out = max_pool(conv1d_out)

print(embedding_test.shape, conv1d_out.shape, max_pool_out.shape)

(1, 15, 128) (1, 15, 32) (1, 32)


In [13]:
import tensorflow as tf
input = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
x = text_vector(input)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters = 64,kernel_size = 5, strides = 1, 
                           activation = 'relu', padding = 'valid')(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
# x = tf.keras.layers.Dense(64, activation = 'relu')
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
model5 = tf.keras.Model(input,output)

model5.compile(loss = 'binary_crossentropy',
              optimizer = 'Adam',
              metrics = ['accuracy'])
model5.summary()  

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           128000    
                                                                 
 conv1d_2 (Conv1D)           (None, 11, 64)            41024     
                                                                 
 global_average_pooling1d_1   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 1)                 65  

In [14]:
history5 = model5.fit(train_sentence,train_label, epochs = 5, validation_data = (val_sentence,val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
prob5 = model5.predict(val_sentence)
pred5 = tf.squeeze(tf.round(prob5))

from sklearn.metrics import classification_report

print(classification_report(val_label, pred5))

              precision    recall  f1-score   support

           0       0.76      0.89      0.82       435
           1       0.81      0.62      0.70       327

    accuracy                           0.77       762
   macro avg       0.78      0.75      0.76       762
weighted avg       0.78      0.77      0.77       762



### Model 6 (Transfer learning)

In [8]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [9]:
import tensorflow as tf
encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                               input_shape = [], # model does not need input shape
                               dtype = tf.string,
                               trainable = False,
                               name = 'USE')

model6 = tf.keras.Sequential([
                              encoder_layer,
                              tf.keras.layers.Dense(64, activation = 'relu'),
                              tf.keras.layers.Dense(1,activation = 'sigmoid')
                              ], name = 'model6')

model6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model6.summary()

Model: "model6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 64)                32832     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [10]:
history6 = model6.fit(train_sentence, train_label,
                     epochs = 5, validation_data = (val_sentence,val_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
prob6 = model6.predict(val_sentence)
pred6 = tf.squeeze(tf.round(prob6))

from sklearn.metrics import classification_report

print(classification_report(val_label, pred6))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       441
           1       0.82      0.74      0.78       321

    accuracy                           0.82       762
   macro avg       0.82      0.81      0.81       762
weighted avg       0.82      0.82      0.82       762



### Model 7 (10% data)

In [22]:
train10 = train[['text', 'target']].sample(frac = 0.1)
train10_sentence = train10['text'].to_list()
train10_label = train10['target'].to_list()

In [23]:
train10['target'].value_counts()

0    420
1    341
Name: target, dtype: int64

In [25]:
model7 = tf.keras.models.clone_model(model6)

model7.compile(loss = tf.keras.losses.BinaryCrossentropy(),
               optimizer = tf.keras.optimizers.Adam(),
               metrics = ['accuracy'])
model7.summary()

Model: "model6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_3 (Dense)             (None, 64)                32832     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [27]:
model7.fit(train10_sentence, train10_label, epochs = 5, validation_data = (train_sentence,train_label))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7efdc012e050>

In [28]:
model7.evaluate(val_sentence,val_label)



[0.4847722053527832, 0.7716535329818726]

In [29]:
prob7 = model7.predict(val_sentence)
pred7 = tf.squeeze(tf.round(prob7))

from sklearn.metrics import classification_report

print(classification_report(val_label, pred7))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80       435
           1       0.74      0.72      0.73       327

    accuracy                           0.77       762
   macro avg       0.77      0.77      0.77       762
weighted avg       0.77      0.77      0.77       762



There will be data leakage problem, since some validation data are in train data.\
Validation data took from

In [None]:
# Correct split
_, train10_sentence, _, train10_label = train_test_split(np.array(train_sentences),
                                                                                    train_labels,test_size=0.1)

### Save best model

In [12]:
model6.save('NLP_model.h5')

In [14]:
# Import transfer learning model
import tensorflow_hub as hub
loaded_model = tf.keras.models.load_model('NLP_model.h5',
                                          custom_objects = {'KerasLayer':hub.KerasLayer}
                                          # for encoder_layer which we used in model6
                                          )

In [15]:
model6.evaluate(val_sentence,val_label) == loaded_model.evaluate(val_sentence,val_label)



True

In [16]:
# Save model format
model6.save('NLP_model')



INFO:tensorflow:Assets written to: NLP_model/assets


INFO:tensorflow:Assets written to: NLP_model/assets


In [17]:
loaded_model = tf.keras.models.load_model("NLP_model")
model6.evaluate(val_sentence,val_label) == loaded_model.evaluate(val_sentence,val_label)



True

## Inspection

In [20]:
prob6 = model6.predict(val_sentence)
pred6 = tf.squeeze(tf.round(prob6))

In [31]:
df = pd.DataFrame({'text':val_sentence, 'true_label': val_label,'pred_label': pred6})
df['probability'] = prob6

In [32]:
df['prob'] = df.apply(lambda x: x['probability'] if x['pred_label']== 1 else 1-x['probability'],axis = 1)
df = df.drop(columns=['probability'])
df['accuracy'] = df.apply(lambda x: 1 if x['true_label'] == x['pred_label'] else 0, axis = 1)
df.head()

Unnamed: 0,text,true_label,pred_label,prob,accuracy
0,Families to sue over Legionnaires: More than 4...,1,1.0,0.973461,1
1,Governor allows parole for California school b...,1,1.0,0.910551,1
2,Police investigating after an e-bike collided ...,1,1.0,0.96718,1
3,...//..// whao.. Pic of 16yr old PKK suicide b...,1,1.0,0.985293,1
4,the stars are burning i here your voice in my ...,0,0.0,0.913602,1


In [40]:
df_wrong = df[df['accuracy'] != 1].sort_values(by = 'prob',ascending = False)

In [42]:
for row in df_wrong[:20].itertuples():
  _,text,target,pred,prob,acc = row
  print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}")
  print(f"Text:\n{text}\n")
  print("----\n")

Target: 0, Pred: 1, Prob: 0.9769226312637329
Text:
Teen Disaster Preparedness Event in Van Nuys August 11 @ 5:30pm http://t.co/fXUX987vZx via @VanNuysCouncil

----

Target: 0, Pred: 1, Prob: 0.974400520324707
Text:
Mourning notices for stabbing arson victims stir Û÷politics of griefÛª in Israel: Posters for Shira Banki and A... http://t.co/3GZ5zQQTHe

----

Target: 1, Pred: 0, Prob: 0.9649578295648098
Text:
@HaydnExists so glad i saved them all at once then didnÛªt want you stealing my thunder :P

----

Target: 0, Pred: 1, Prob: 0.9646433591842651
Text:
Two Jewish Terrorists Charged In Historic-Church Arson | The Ugly Truth http://t.co/iEksNFSbY7 http://t.co/VWCf3slkrW

----

Target: 1, Pred: 0, Prob: 0.9449977837502956
Text:
Just made anthonys bed considering i destroy it everytime i fall asleep. Smh ????

----

Target: 1, Pred: 0, Prob: 0.9406877979636192
Text:
SANDSTORM!!! WOO HOO!!

----

Target: 1, Pred: 0, Prob: 0.9370287135243416
Text:
I went to pick up my lunch today and the

In [80]:
def model_prediction(sent, model = model6):
  prob = model.predict([sent])
  pred = tf.squeeze(tf.round(prob)).numpy()
  print(f"Pred: {pred}", "(real disaster)" if pred > 0 else "(not real disaster)", f"Prob: {prob[0][0]}")
  print(f"Text:\n{sent}")

In [81]:
text = input('Enter a sentence')
model_prediction(sent=text)

Enter a sentence"Reports that the smoke in Beirut sky contains nitric acid, which is toxic. Please share and refrain from stepping outside unless urgent. #Lebanon"
Pred: 1.0 (real disaster) Prob: 0.9800093770027161
Text:
"Reports that the smoke in Beirut sky contains nitric acid, which is toxic. Please share and refrain from stepping outside unless urgent. #Lebanon"


In [None]:
import time
def pred_timer(model, samples):
  start_time = time.perf_counter() # get start time
  model.predict(samples) # make predictions
  end_time = time.perf_counter() # get finish time
  total_time = end_time-start_time # calculate how long predictions took to make
  time_per_pred = total_time/len(val_sentence) # find prediction time per sample
  return total_time, time_per_pred

model_6_total_pred_time, model_6_time_per_pred = pred_timer(model6, val_sentence)
baseline_total_pred_time, baseline_time_per_pred = pred_timer(model0, val_sentence)
model_1_total_pred_time, model_1_time_per_pred = pred_timer(model1, val_sentence)

import matplotlib.pyplot as plt
plt.figure(figsize=(10, 7))
plt.scatter(model_1_time_per_pred, model_1_results["f1"], label="simple_network")
plt.scatter(baseline_time_per_pred, baseline_results["f1"], label="baseline")
plt.scatter(model_6_time_per_pred, model_6_results["f1"], label="tf_hub_sentence_encoder")
plt.legend()
plt.title("F1-score versus time per prediction")
plt.xlabel("Time per prediction")
plt.ylabel("F1-Score");
