# # Section 10

## NLP Fundamentals in TensorFlow

In [1]:
import tensorflow as tf

# Report the installed TensorFlow release, then how many GPUs it can see.
print("TensorFlow Version: ", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

TensorFlow Version:  2.9.0
Num GPUs Available:  1


In [6]:
from utils import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

### Load and visualize data

In [7]:
import pandas as pd

# Read the train and test CSVs, then preview the first rows of the training frame.
train_csv_path = "../datasets/nlp-getting-started/train.csv"
test_csv_path = "../datasets/nlp-getting-started/test.csv"
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
# Shuffle every row (frac=1) with a fixed seed, then tally the rows per target class.
# NOTE: train_df is rebound here, so re-running this cell reshuffles the
# already-shuffled frame and yields a different row order than a fresh run.
train_df = train_df.sample(frac=1, random_state=42)
train_df["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the seed keeps the split repeatable.
tweet_texts = train_df["text"].to_numpy()
tweet_targets = train_df["target"].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)
len(train_sentences), len(val_sentences)

(6851, 762)

In [11]:
# Peek at the first five training sentences.
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [16]:
# Mean number of whitespace-separated tokens per training sentence, rounded to an int.
total_tokens = sum(len(sentence.split()) for sentence in train_sentences)
round(total_tokens / len(train_sentences))

15

In [18]:
from tensorflow.keras.layers import TextVectorization

# Vectorizer hyperparameters
max_vocab_length = 10_000  # upper bound on the number of words kept in the vocabulary
max_length = 15  # tokens per output sequence (i.e. how many words of a Tweet the model sees)

# Map raw strings to fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [20]:
# Fit the vectorizer on the training sentences only (validation text is not passed in).
text_vectorizer.adapt(train_sentences)

In [21]:
# Vectorize one hand-written sentence; it is wrapped in a list so the layer
# receives a batch containing a single string.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [22]:
# Get the unique words in vocab:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_word = words_in_vocab[:5]
bottom_5_word = words_in_vocab[-5:]
print(len(words_in_vocab),)
print(top_5_word)
print(bottom_5_word)

10000
['', '[UNK]', 'the', 'a', 'in']
['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']
