### NLP PIPELINE

* Data Acquisition : Collecting data for the NLP task (use a public dataset, web scraping)
* Text Extraction & Cleanup : Removing extra info and cleaning the data (e.g. spelling correction)
* Sentence Segmentation : Dividing the text into sentences (segments)
* Word Tokenizer : Dividing the sentence into words
* Stemming : Stripping suffixes such as -ing and -s to reduce a word to its base form (loves -> love)(eating -> eat)
* Lemmatization : Converting a word to its dictionary base form using grammar knowledge, e.g. (ate -> eat)(was -> be); stemming, by contrast, does not know grammar  
* Feature Engineering : Converting words to numbers (extracting features), e.g. TF-IDF Vectorizer, One-Hot Encoding, Word Embedding
* Model Building : Building ml model to classify
* Model Accuracy Evaluation

### Word2Vec

In [1]:
import gensim
import pandas as pd

In [None]:
# Load the reviews dataset into a DataFrame and preview the first rows.
# NOTE(review): pd.read_json() is called with no input source, so this cell
# cannot load anything as written — the dataset path appears to have been
# dropped; restore it (and pass lines=True if the file is JSON Lines).
df = pd.read_json()
df.head()

In [None]:
# Turn every review into a list of lowercase tokens using gensim's
# simple_preprocess; assumes df has a `reviewText` column — confirm against the dataset.
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)


In [None]:
# Create the Word2Vec model; the vocabulary is built and the model trained in the cells below.
model = gensim.models.Word2Vec(
    window=10,   # context window: up to 10 words before and 10 words after the target word
    min_count=2, # ignore words that occur fewer than 2 times in the corpus (a frequency threshold, not a word length)
    workers=4    # use 4 worker threads to train the model
)

In [None]:
# Scan the tokenized reviews to build the model's vocabulary.
model.build_vocab(review_text,progress_per=1000)  # progress_per=1000 controls how often scan progress is reported (via logging)

In [None]:
                                # total_examples: number of reviews (model.corpus_count)
model.train(review_text,total_examples=model.corpus_count, epochs=model.epochs)  # train for the model's configured number of epochs

In [None]:
# Save the trained model to a file so it can be reloaded later without retraining.
model.save("wordTovectorModel.model")


In [None]:
# Show the vocabulary words whose vectors are most similar to "bad".
model.wv.most_similar("bad")

In [None]:
# Similarity score between the learned vectors of "good" and "nice".
model.wv.similarity(w1="good",w2="nice")

# BERT

In [3]:
import tensorflow_hub as hub
import tensorflow_text as text

In [18]:
# Load the BERT text-preprocessing model and wrap it as a Keras layer.
# NOTE(review): "preprocessor" looks like a local directory or placeholder
# handle — confirm it points at the preprocessing model that matches the encoder.
preprocess = hub.load("preprocessor")
bert_preprocess_model = hub.KerasLayer(preprocess)

In [22]:
# Run two sample sentences through the preprocessing layer and list the
# keys of the resulting dict of input tensors.
text_test = ["nice movie indeed","I love python programming"]
text_preprocessed = bert_preprocess_model(text_test)
text_preprocessed.keys()

dict_keys(['input_word_ids', 'input_mask', 'input_type_ids'])

In [23]:
text_preprocessed['input_mask']
# The mask is 1 for real tokens and 0 for padding.
# Two special tokens are added: CLS at the front and SEP at the end.
# CLS nice movie indeed SEP           # 5 tokens
# CLS I love python programming SEP   # 6 tokens if every word maps to a single token
# NOTE(review): the recorded output below has 8 ones in the second row, not 6 —
# presumably some words were split into several sub-word tokens, or the output
# was captured with a different sentence; confirm.
# Each sequence is padded to length 128, so the remaining positions are 0.

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [24]:
text_preprocessed['input_word_ids']
# Every token is replaced by its id in the vocabulary; padding positions are 0.
# CLS -> 101 and SEP -> 102

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[  101,  3505,  2523,  5750,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [26]:
# Load the BERT encoder and wrap it as a Keras layer.
# NOTE(review): "encoder" looks like a local directory or placeholder handle — confirm.
bert_encoder_model = hub.KerasLayer(hub.load("encoder"))

In [28]:
# Feed the preprocessed inputs through the encoder and list the outputs it provides.
bert_results = bert_encoder_model(text_preprocessed)
bert_results.keys()

dict_keys(['pooled_output', 'encoder_outputs', 'sequence_output', 'default'])

In [30]:
bert_results['pooled_output']
# Gives one embedding vector of size 768 for each entire sentence (2 sentences -> shape (2, 768))

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.900717  , -0.3476328 , -0.1821752 , ...,  0.06183419,
        -0.6366759 ,  0.87090707],
       [-0.7669953 , -0.22327538,  0.32057914, ...,  0.25097713,
        -0.51727307,  0.71171033]], dtype=float32)>

In [37]:
bert_results['sequence_output']
# Gives a 768-dimensional embedding for every token position (shape (2, 128, 768)).
# Each sentence is padded to 128 positions:
# nice movie indeed 0 0 0 0 0 ..0  <- 128 positions
# The padding ids are 0, but their embeddings are non-zero because the embeddings are contextualised.

<tf.Tensor: shape=(2, 128, 768), dtype=float32, numpy=
array([[[-0.1951019 ,  0.2909121 , -0.18963815, ...,  0.02199245,
          0.28896257,  0.24759115],
        [ 0.29499754,  0.12620637,  0.17301074, ..., -0.03387026,
         -0.12297036, -0.17914426],
        [-0.4138534 ,  0.2814847 , -0.20537418, ...,  0.03129006,
         -0.00604016,  0.10353359],
        ...,
        [ 0.13616714, -0.06290523,  0.19885191, ...,  0.17306995,
          0.04837074, -0.05333774],
        [ 0.09686777, -0.1243443 ,  0.15699324, ...,  0.17266199,
          0.04439491, -0.05265225],
        [-0.19986477, -0.30589673, -0.1120979 , ...,  0.33831328,
          0.0027611 ,  0.2169442 ]],

       [[-0.17158662,  0.22742605,  0.12970155, ..., -0.20270583,
          0.28589737,  0.15177065],
        [ 0.2933505 ,  0.20694976,  0.35201114, ..., -0.0066933 ,
          0.5498753 , -0.11189746],
        [ 0.769707  ,  0.6234179 ,  0.8985071 , ..., -0.126248  ,
          0.5042638 , -0.36060262],
        ...,

In [35]:
bert_results['encoder_outputs']
# len(bert_results['encoder_outputs']) --> 12
# A list holding the output of each of the 12 encoder layers.
# The last layer's output is the same as sequence_output.

[<tf.Tensor: shape=(2, 128, 768), dtype=float32, numpy=
 array([[[ 0.1980502 ,  0.0426916 , -0.07409655, ..., -0.01410131,
           0.05073649,  0.07481836],
         [-0.2658456 ,  0.707153  ,  1.1073617 , ..., -0.0173998 ,
           0.6247519 , -0.32139653],
         [-0.8986099 ,  1.4299189 , -0.7438352 , ..., -0.2257171 ,
           0.22594899,  0.11610851],
         ...,
         [ 0.07487455, -0.11181764,  0.49887004, ...,  0.24904239,
          -0.4995504 ,  0.3929476 ],
         [-0.02745285, -0.16453704,  0.3553393 , ...,  0.2789439 ,
          -0.39104328,  0.2649659 ],
         [ 0.09480309, -0.03973748,  0.4065872 , ...,  0.65463954,
          -0.71882534,  0.17486356]],
 
        [[ 0.21599416, -0.03440646, -0.0547406 , ..., -0.06578975,
           0.04412308,  0.01117172],
         [ 0.22251882, -0.28924823,  0.16644225, ...,  0.37934244,
           0.351004  , -0.3230216 ],
         [ 1.1116548 ,  0.06930713,  0.54364604, ...,  0.651603  ,
          -0.24834467, -0.83