# Natural language processing (NLP) 
ability of a computer program to understand human language as it is spoken and written

In [7]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

In [6]:
# get text dataset
dir_path = 'nlp_getting_started'
os.listdir(dir_path)

['sample_submission.csv', 'test.csv', 'train.csv']

In [14]:
train_df = pd.read_csv(os.path.join(dir_path,'train.csv'))
test_df = pd.read_csv(os.path.join(dir_path,'test.csv'))
print(train_df.shape)
train_df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [35]:
#split data to validation sets
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df['text'].to_numpy(), 
                                                                            train_df['target'].to_numpy(),
                                                                            test_size=.1,
                                                                            random_state=42)
train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape                                                                       

((6851,), (762,), (6851,), (762,))

In [37]:
print(train_sentences[:5], train_labels[:5])

["'McFadden Reportedly to Test Hamstring Thursday' via @TeamStream http://t.co/jWq4KvJH2j"
 'w--=-=-=-[ NEMA warns Nigerians to prepare for drought http://t.co/5uoOPhSqU3'
 "When I was cooking earlier I got electrocuted some crucial ?????? now I'm psychic lol"
 "I'm On Fire.  http://t.co/WATsmxYTVa"
 "More than 40 families affected by the fatal outbreak of Legionnaires' disease in Edinburgh are to sue two comp... http://t.co/vsoXioOy78"] [0 1 0 0 1]


### Tokenization vs Embedding
- Tokenization : i use python -> 0 1 2 -> onehot [1,0,0],[0,1,0],[0,0,1]
- Embedding : i use python -> [0.49, 0.005, 0.015] represent of relationships (can set limit size)

<p align=center><img src="https://miro.medium.com/max/1400/1*sAJdxEsDjsPMioHyzlN3_A.png" width=600px/></p>

In [47]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None, # how many vocabulary in all data
                                    standardize='lower_and_strip_punctuation', # just like number [0..255] -> [0..1]
                                    split = 'whitespace',
                                    ngrams=None, # how many word in group (None = just single word not to group)
                                    output_mode='int', # how output look like (int = any specific int number)
                                    output_sequence_length=50 ) # how long vector of sentence (None = as you can)
                                    # pad_to_max_tokens = True  ,pad 0 of sentence as the same shape as longest sentence( Not valid if using max_tokens=None)

In [63]:
# find the average number of tokens (words) in the traning
round(np.sum( [len(sentences.split()) for sentences in train_sentences]) / len(train_sentences))

15

In [66]:
#setup text vectorization variabel
max_vocab_length = 10000
max_length = 15 # first 15 word for each sentences

text_vectorizer = TextVectorization(max_tokens=max_vocab_length, 
                                    output_mode='int', 
                                    output_sequence_length=max_length)

In [67]:
#fit the text vectorizer to the training sets
text_vectorizer.adapt(train_sentences)

In [86]:
# see what it look like?
sample_sentence = 'There a some word here, i dont know'
print(text_vectorizer([sample_sentence]))

tf.Tensor(
[[  75    3   77 1455  127    8   64   99    0    0    0    0    0    0
     0]], shape=(1, 15), dtype=int64)


In [94]:
# get vocabulary
word_in_vocab = text_vectorizer.get_vocabulary()
word_in_vocab[:10] , len(word_in_vocab)

(['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'], 10000)

> order by most common word

UNK = unknow token (word that out of max_tokens)

### Creating an Embedding 
- input_dim = size of out vocaburary
- output_dim = size of vector / [0.45, 0.34 , ... ,n]
- input_lenght = lenght of sentences 

In [100]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim= max_vocab_length, 
                             output_dim= 128 , 
                             input_length = max_length)

In [147]:
# turn index from vectorizer to embed
print(sample_sentence)
sample_embed = embedding(text_vectorizer([sample_sentence]))
print('shape :',sample_embed.shape)
print(sample_embed[0,:5,:5].numpy())

There a some word here, i dont know
shape : (1, 15, 128)
[[ 0.0185656   0.04277043  0.02763495 -0.04788155 -0.00165869]
 [ 0.01865281  0.03292288  0.01582307 -0.04710279  0.01199052]
 [-0.0451471  -0.03773044 -0.04625946 -0.01397262 -0.00819008]
 [-0.01661801 -0.01971419  0.03964323 -0.04099659  0.00684495]
 [ 0.04787305  0.04158909 -0.04524297  0.01562179  0.03026881]]


## Modeling with Varince Model

### Naive Bayes
predict the tag of a text. They calculate the probability of each tag for a given text and then output the tag with the highest one. using with **TF-IDF** formula to convert our word to numbers.

In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

tfidf = TfidfVectorizer()

model_1 = MultinomialNB()
model_1.fit(tfidf.fit_transform(train_sentences), train_labels)

MultinomialNB()

In [167]:
baseline_score = model_1.score(tfidf.transform(val_sentences), val_labels)
print(f'baseline accuracy = {baseline_score*100:.2f}%')

baseline accuracy = 77.82%


In [170]:
y_preds = model_1.predict(tfidf.transform(val_sentences))
y_preds[:50:5]

array([0, 0, 1, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [176]:
# create evaluation function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_score(y_true, y_preds):
    accuracy = accuracy_score(y_true, y_preds)
    precision, recall, f1_score = precision_recall_fscore_support(y_true, y_preds, average='weighted')[:-1]
    evaluation_dict = {'accuracy':accuracy,'precision':precision,'recall':recall,'f1_score':f1_score}
    return evaluation_dict

In [177]:
baseline_result = evaluate_score(val_labels, y_preds)
baseline_result

{'accuracy': 0.7782152230971129,
 'precision': 0.792992256322435,
 'recall': 0.7782152230971129,
 'f1_score': 0.7703527809038113}

### Simple Dense Model

In [234]:
# GlobalAveragePooling1D
input_shape = (1,3,4)
x = tf.random.normal(input_shape)
y = tf.keras.layers.GlobalMaxPooling1D()(x)
print(f'''
{x}:{x.shape}
{y}:{y.shape}
''')


[[[ 0.42048833  0.501536    1.2311906  -0.32959372]
  [-1.2776942  -1.1778567   0.43151617 -0.27620575]
  [-0.41111636 -0.7654225  -1.2105678   0.8450841 ]]]:(1, 3, 4)
[[0.42048833 0.501536   1.2311906  0.8450841 ]]:(1, 4)



In [210]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string) # 1 sentence at a time
x = text_vectorizer(inputs) # text to number
x = embedding(x) # number to embed
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding
outputs = layers.Dense(1, activation='sigmoid')(x) # binary output
model_2 = tf.keras.Model(inputs, outputs, name='model_2_dense')

In [211]:
model_2.summary()

Model: "model_2_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_6 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d_2   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [212]:
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), 
                metrics=['accuracy'])

hist_2 = model_2.fit(train_sentences,
                     train_labels, 
                     validation_data = (val_sentences, val_labels), 
                     epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [218]:
model_2_proba = model_2.predict(val_sentences).reshape(-1,)
model_2_proba[:5]

array([0.29349154, 0.38088483, 0.40612563, 0.30467096, 0.47155547],
      dtype=float32)

In [224]:
y_preds_2 = np.round(model_2_proba)
y_preds_2[:40]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0.], dtype=float32)

In [225]:
model2_result = evaluate_score(val_labels, y_preds_2)
model2_result

{'accuracy': 0.7611548556430446,
 'precision': 0.8025271006161089,
 'recall': 0.7611548556430446,
 'f1_score': 0.7443266304859304}

Let's visualize what embedding layer learn

In [246]:
embed_weights =  model_2.layers[2].get_weights()[0]
embed_weights.shape

(10000, 128)

> 10000 vocab in 128 dimention

In [251]:
import io

out_v = io.open('vectors.tsv','w', encoding='utf-8')
out_m = io.open('metadata.tsv','w', encoding='utf-8')

for index, word in enumerate(word_in_vocab):
    if index == 0 :
        continue 
    vec = embed_weights[index] # 128 dimention vector
    out_v.write('\t'.join([str(x) for x in vec]) + '\n')
    out_m.write(word + '\n')

out_v.close()
out_m.close()

> then open in [Embedding Projector](https://projector.tensorflow.org/)

### Recurrent Neural Network (RNN)
- use the representation of previous input to aid the later input
- LSTMs and GRU

input -> tokenize -> embed -> Rnn/dense -> output

In [265]:
#LSTMs
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(units=64, activation='tanh',return_sequences=True)(x) #when stack RNN cell together need (return_sequences = True)
x = layers.LSTM(units=64, activation='tanh')(x) # because input of RNN must be 3D
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs, name='LSTM_model')

model_3.summary()

Model: "LSTM_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_16 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_6 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_6 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_7 (LSTM)               (None, 64)                33024     
                                                                 
 dense_13 (Dense)            (None, 64)                4160      
                                                        

In [266]:
model_3.compile(loss=tf.keras.losses.BinaryCrossentropy(), 
                optimizer=tf.keras.optimizers.Adam(), 
                metrics=['accuracy'])
hist_3 = model_3.fit(train_sentences,
                     train_labels, 
                     validation_data = (val_sentences, val_labels), 
                     epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [267]:
model_3_proba = model_3.predict(val_sentences).reshape(-1,)
y_preds_3 = np.round(model_3_proba)
y_preds_3[:10]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 1.], dtype=float32)

In [268]:
model3_result = evaluate_score(val_labels, y_preds_3)
model3_result

{'accuracy': 0.7493438320209974,
 'precision': 0.7484425185106894,
 'recall': 0.7493438320209974,
 'f1_score': 0.7483157555089452}

In [277]:
#GRU
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(units=64, activation='tanh')(x) 
outputs = layers.Dense(1, activation='sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs, name='GRU_model')

model_4.summary()

Model: "GRU_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_6 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru_6 (GRU)                 (None, 64)                37248     
                                                                 
 dense_20 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_______________________________________________

In [278]:
model_4.compile(loss=tf.keras.losses.BinaryCrossentropy(), 
                optimizer=tf.keras.optimizers.Adam(), 
                metrics=['accuracy'])
hist_4 = model_4.fit(train_sentences,
                     train_labels, 
                     validation_data = (val_sentences, val_labels), 
                     epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [279]:
model_4_proba = model_4.predict(val_sentences).reshape(-1,)
y_preds_4 = np.round(model_4_proba)
y_preds_4[:10]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 1.], dtype=float32)

In [280]:
model4_result = evaluate_score(val_labels, y_preds_4)
model4_result

{'accuracy': 0.7506561679790026,
 'precision': 0.7497637884834458,
 'recall': 0.7506561679790026,
 'f1_score': 0.7495746978116031}