Codes from : https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

In [1]:
#Checking GPU
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-8fcd8e7e-e69a-e605-14ea-eaf124476892)


In [2]:
#Helper function
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2022-10-27 19:26:10--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2022-10-27 19:26:10 (117 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [3]:
#Importing series of helper functions
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

Download a text dataset

In [4]:
#Downloading the Data
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

#Unzip Data
unzip_data("nlp_getting_started.zip")

--2022-10-27 19:29:18--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.68.128, 142.250.4.128, 74.125.24.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.68.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2022-10-27 19:29:18 (86.9 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



Text Dataset Visualizing

In [5]:
#Turn .csv files into pandas DataFrame

#Note : see the target

import pandas as pd
train_df = pd.read_csv("train.csv")
test_df=pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
#Shuffling

train_df_shuffled=train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [7]:
#Test data haven`t got target column

test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [11]:
#Examples of classes
train_df.target.value_counts()
#Binary classification - 0 is 60%, 1 is 40%

0    4342
1    3271
Name: target, dtype: int64

In [12]:
#Total samples
print(f"Total training samples:{len(train_df)}")
print(f"Total test samples:{len(test_df)}")
print(f"Total samples:{len(train_df) + len(test_df)}")

Total training samples:7613
Total test samples:3263
Total samples:10876


In [13]:
#Visualize random training examples

import random 
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target:{target}", "(real disaster" if target>0 else "(not real disaster)")
  print("---\n")

Target:1 (real disaster
---

Target:0 (not real disaster)
---

Target:0 (not real disaster)
---

Target:0 (not real disaster)
---

Target:0 (not real disaster)
---



Split data into training and validation sets

In [15]:
from sklearn.model_selection import train_test_split

#We use train_test_split for split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [16]:
#Length checking
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [17]:
#Lets view first 10 training sentences and labels
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

Converting text into numbers

TEXT Vectorization (tokenization)

In [18]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

#Default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens= None, #vocabulary word count
                                    standardize = "lower_and_strip_punctuation", #how to processing the text
                                    split="whitespace", #how to splitting the tokens
                                    ngrams=None, #n-word creating groups
                                    output_mode="int", #Map tokens to numbers
                                    output_sequence_length=None) #How long the output of the tokens be

In [22]:
#Average number of tokens (words) in training Tweets

round(sum([len(i.split()) for i in train_sentences])/ len(train_sentences))

15

In [23]:
#setup text vectorization with custom variables
max_vocab_length=10000 #max number of words from our vocabulary
max_length = 15 #max length sequences will be ( how many words from a Tweet)
text_vectorizer= TextVectorization(max_tokens=max_vocab_length,
                                   output_mode="int", #integer mapping
                                   output_sequence_length = max_length) 

In [24]:
#Fitting the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [26]:
#Creating samples sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [29]:
#Choose a random sentence from the training dataset and tokenize it

random_sentence = random.choice(train_sentences) #Choosing sentence
print(f"Original text:\n{random_sentence}\
     \n\nVectorized version:")
text_vectorizer([random_sentence]) #tokenize

Original text:
Any disaster impairs mental health especially in vulnerable individuals... http://t.co/ZisuwLqRHf     

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 202,   75,    1, 3640,  651, 2471,    4, 2616, 3745,    1,    0,
           0,    0,    0,    0]])>

In [31]:
#Unique words form vocabulary

words_in_vocab=text_vectorizer.get_vocabulary()
top_5_words=words_in_vocab[:5]  #most common 5 tokens 
#UNK - unknown words
bottom_5_words=words_in_vocab[:-5] #least common 5 tokens

print(f"Number of words in vocab:{len(words_in_vocab)}")
print(f"Top 5 most common words:{top_5_words}")
print(f"Bottom 5 least common words:{bottom_5_words}")

Number of words in vocab:10000
Top 5 most common words:['', '[UNK]', 'the', 'a', 'in']


Creating an Embedding using an Embedding Layer

In [32]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, #input shape
                             output_dim=128, #size embedding vector
                             embeddings_initializer="uniform", #default, initialize randomly
                             input_length=max_length, #length of each input
                             name="embedding_1")

embedding

<keras.layers.core.embedding.Embedding at 0x7f81d435af50>

In [34]:
#Random sentence from training set
random_sentence=random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
       \n\nEmbedded version:")

#Embedding the random sentence
#Turn it into numerical representation
sample_embed=embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
Do you feel like you are sinking in low self-image? Take the quiz: http://t.co/bJoJVM0pjX http://t.co/wHOc7LHb5F       

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 2.4647940e-02,  2.2776496e-02, -2.5382414e-03, ...,
          2.8259959e-02,  9.6987858e-03,  1.5659537e-02],
        [-1.2337789e-03,  3.2362938e-03, -2.7702499e-02, ...,
         -1.8581770e-02, -4.5666125e-02,  4.4703130e-02],
        [ 1.3266172e-02, -3.6481820e-02, -2.1324802e-02, ...,
         -3.0233478e-02, -1.1183046e-02,  1.3467669e-03],
        ...,
        [ 4.6574362e-03, -1.4376927e-02, -3.9916981e-02, ...,
          1.2578163e-02,  1.5724450e-05,  4.2785335e-02],
        [ 3.9779518e-02, -3.7826024e-02, -3.6462832e-02, ...,
          2.3625270e-03,  3.3326294e-02,  2.8036680e-02],
        [ 3.9779518e-02, -3.7826024e-02, -3.6462832e-02, ...,
          2.3625270e-03,  3.3326294e-02,  2.8036680e-02]]], dtype=float32)>

In [35]:
#single token`s embedding -  check out 
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.02464794,  0.0227765 , -0.00253824,  0.01031735,  0.04166684,
        0.01954413,  0.0016857 , -0.04235309,  0.00571234, -0.02199888,
        0.00568052, -0.02422286,  0.00787456, -0.04194081,  0.00136618,
        0.01588665, -0.0333156 , -0.0078637 , -0.04849179,  0.02950374,
       -0.01419324,  0.00093408,  0.04470012,  0.04725683, -0.04478859,
        0.04547257,  0.04511995,  0.02154528, -0.02531589,  0.03728738,
        0.04362348,  0.004204  , -0.03202935, -0.00951911,  0.01650684,
        0.00267321, -0.0045835 , -0.02246951, -0.01653922, -0.02357593,
       -0.04066008,  0.01702801,  0.04111478, -0.00181121, -0.04332322,
       -0.02445837, -0.02744229,  0.0035441 , -0.04850289, -0.01735504,
        0.02399952, -0.03893767,  0.00621217,  0.00898981,  0.03337463,
        0.04154594,  0.02768986, -0.00296283, -0.02946706,  0.03341026,
        0.02854392,  0.00102108, -0.01283691,  0.0065983 ,  0.02869121,
       -0.028466

Model 0: Naive Bayes (baseline)

In [39]:
#Getting a baseline

#Importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#Create tokenization and modelling pipeline

model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [41]:
#baseline model achieves
baseline_score=model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of:{baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of:79.27%


In [42]:
#Make_prediction
baseline_preds=model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

Creating an evaluation function for our model experiments

- Accuracy

- Precision

- Recall

- F1-score

In [44]:
#Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  #Calculating model accuracy
  model_accuracy=accuracy_score(y_true, y_pred)*100
  #Calculate model precision, recall  & f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results={"accuracy": model_accuracy,
                 "precision": model_precision,
                 "recall": model_recall,
                 "f1": model_f1}
  return model_results

In [46]:
#Get baseline results (Accuracy, precision, recall, f1)

baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}