## Tokenization

In [1]:
# Example corpus for the tokenization demo: three short sentences,
# plus the first one bound separately for the single-sentence example.
sentences = [
    "The cat jumped over the fence",
    "The dog leaped across the table",
    "The mouse scurried under the chair",
]
sentence = sentences[0]

In [2]:
# Import needed libraries
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
# Build a Keras tokenizer and let it learn the vocabulary (unique words)
# of the single example sentence.
single_corpus = [sentence]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(single_corpus)

# Encode the sentence as integer ids; a repeated word gets the same id each time.
encoded_single = tokenizer.texts_to_sequences(single_corpus)
print(encoded_single)

[[1, 2, 3, 4, 1, 5]]


In [19]:
# View Tokenizer's vocabulary: a dict mapping each (lower-cased) word to its integer id
tokenizer.word_index

{'cat': 2, 'fence': 5, 'jumped': 3, 'over': 4, 'the': 1}

In [21]:
# Tokenize multiple sentences: start from a fresh Tokenizer so the vocabulary
# is learned from all of `sentences`, then encode each sentence as a list of ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
encoded = tokenizer.texts_to_sequences(sentences)

In [22]:
# Integer-encoded form of the three example sentences defined in the first
# cell: one list of word ids per sentence, in the same order as `sentences`.
encoded

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 1, 9], [1, 10, 11, 12, 1, 13]]

In [23]:
# View tokenizer's vocabulary after fitting on all three sentences
# (dict mapping each lower-cased word to its integer id)
tokenizer.word_index

{'across': 8,
 'cat': 2,
 'chair': 13,
 'dog': 6,
 'fence': 5,
 'jumped': 3,
 'leaped': 7,
 'mouse': 10,
 'over': 4,
 'scurried': 11,
 'table': 9,
 'the': 1,
 'under': 12}

## Word Stemming

In [24]:
# Import needed libraries
import nltk

In [29]:
# Reduce several variants of the same word to their stems with NLTK's Porter stemmer
stemmer = nltk.stem.PorterStemmer()
stem_these = ["programe", "programmer", "programming", "programmed"]

for word in stem_these:
    stem = stemmer.stem(word)
    print(stem)

program
programm
program
program


In [28]:
# Stem a single word directly; the result is returned as a string
stemmer.stem("programmed")

'program'

## Transformers Library

In [31]:
# install transformers library
!pip install -q transformers

[K     |████████████████████████████████| 2.8 MB 31.2 MB/s 
[K     |████████████████████████████████| 895 kB 28.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 18.8 MB/s 
[K     |████████████████████████████████| 636 kB 51.6 MB/s 
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[?25h

In [32]:
# Import needed libraries
from transformers import pipeline

In [None]:
# Download the needed pipelines.
# Each checkpoint is named explicitly instead of relying on the library's
# per-task default, so the notebook keeps loading the same models if those
# defaults change in a later transformers release.
# NOTE(review): these ids are believed to be the task defaults that produced
# the outputs recorded below — confirm against the installed version.
sentiment = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
unmasker = pipeline("fill-mask", model="distilroberta-base")
text_generator = pipeline("text-generation", model="gpt2")
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

### Sentiment Analysis

In [36]:
# Classify two short reviews; each call's result (a list holding one
# label/score dict) is printed on its own line.
for review in ("The food smelled bad", "The wine tasted good"):
    print(sentiment(review))

[{'label': 'NEGATIVE', 'score': 0.9997983574867249}]
[{'label': 'POSITIVE', 'score': 0.9998596906661987}]


### Text Summarization

In [39]:
# Text summarization: condense a short paragraph about language generation
article = '''In recent years, there has been an increasing interest in open-ended language generation
            thanks to the rise of large transformer-based language models trained on millions of webpages, 
            such as OpenAI"s famous GPT2 model. The results on conditioned open-ended language generation are impressive, 
            e.g. GPT2 on unicorns, XLNet, Controlled language with CTRL. Besides the improved transformer architecture and
            massive unsupervised training data, better decoding methods have also played an important role.'''

# Length bounds handed to the summarizer as keyword arguments
length_limits = {"min_length": 20, "max_length": 30}
summarized = summarizer(article, **length_limits)
print(summarized)

[{'summary_text': ' In recent years, there has been an increasing interest in open-ended language generation thanks to the rise of large transformer-based language models'}]


### Sentence Masking

In [42]:
# Placeholder string that marks the blank to be filled in the fill-mask input
unmasker.tokenizer.mask_token

'<mask>'

In [43]:
# Build the masked prompt; the f-string keeps a space on each side of the
# mask token so it does not fuse with the neighbouring words.
f"This person has been {unmasker.tokenizer.mask_token} eating my food everyday"

'This person has been {unmasker.tokenizer.mask_token} eating my food everyday'

In [41]:
# Fill in the masked word of a sentence; the result lists candidate
# completions, each with its score, token id and the completed sequence
unmasker(f"This person has been {unmasker.tokenizer.mask_token} eating my food everyday")

[{'score': 0.26010945439338684,
  'sequence': 'This person has been literally eating my food everyday',
  'token': 5909,
  'token_str': ' literally'},
 {'score': 0.07829903811216354,
  'sequence': 'This person has been enjoying eating my food everyday',
  'token': 6218,
  'token_str': ' enjoying'},
 {'score': 0.06683751195669174,
  'sequence': 'This person has been happily eating my food everyday',
  'token': 16534,
  'token_str': ' happily'},
 {'score': 0.05062520131468773,
  'sequence': 'This person has been busy eating my food everyday',
  'token': 3610,
  'token_str': ' busy'},
 {'score': 0.031358812004327774,
  'sequence': 'This person has been secretly eating my food everyday',
  'token': 16340,
  'token_str': ' secretly'}]

### Text Generation

In [50]:
# Text generation: continue a prompt
# A smaller max_length tends to give text that makes more sense
text_generator("As far as I am concerned, I will", max_length=20)

Using pad_token, but it is not set yet.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'As far as I am concerned, I will be putting it forward publicly for all the people who have'}]

## Named Entity Recognition

In [51]:
from spacy import displacy
import en_core_web_sm
# Load the `en_core_web_sm` spaCy pipeline
nlp = en_core_web_sm.load()
# Sample text to run entity recognition on.
# Note: this rebinds `sentence`, which the tokenization section used for a different example.
sentence = '''Facebook, Inc., is an American multinational conglomerate based in Menlo Park, California. 
            It was founded by Mark Zuckerberg, along with his fellow roommates and students at Harvard College,'''

In [58]:
# Perform named entity recognition by running the text through the spaCy pipeline
ner = nlp(sentence)
# Render the result in the notebook using displaCy's entity ("ent") style
displacy.render(ner, style="ent", jupyter=True)

## Word Embeddings

In [59]:
import tensorflow_hub as hub

# TF-Hub handle of the text-embedding module used throughout this section
NNLM_HANDLE = "https://tfhub.dev/google/nnlm-en-dim128/2"

# Download the module and wrap it as a Keras layer
embedding_layer = hub.KerasLayer(NNLM_HANDLE)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128/2'.
INFO:absl:Downloaded https://tfhub.dev/google/nnlm-en-dim128/2, Total size: 483.55MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128/2'.


In [61]:
# View embeddings of word
word = "man"
# The layer is called with a list of strings (here a batch of one)
emb_word = embedding_layer([word])
print(emb_word)

tf.Tensor(
[[ 0.02848229  0.04686535 -0.04147879 -0.13354081  0.01703222 -0.05207312
   0.17542455  0.11578733  0.04462362 -0.05776377  0.03012958 -0.01415328
   0.11427145 -0.01487454 -0.01368568 -0.0524796  -0.07193845 -0.072322
   0.11030449 -0.12568177 -0.095378   -0.01750288 -0.07935891 -0.16632013
   0.12703875 -0.12413993  0.06190034 -0.01668076  0.0084015  -0.14488238
   0.17221402 -0.00406017 -0.05207924  0.05142674  0.00294312 -0.06071912
  -0.02484846  0.10623363 -0.04819633  0.08099245  0.02422958  0.03211612
  -0.10465357 -0.01124988 -0.00124846 -0.03559102  0.00424354 -0.01294302
   0.05406577 -0.02603427 -0.01111236 -0.03154766  0.06899226 -0.03130316
   0.11330416 -0.05653518 -0.02899573  0.00729821 -0.06816862 -0.04936227
  -0.07012153  0.10568658 -0.12702192 -0.02522285  0.03975509  0.15204
  -0.10292836  0.02909353  0.05877079 -0.04593015  0.11155143  0.09550941
   0.22503743  0.02992023  0.00568607  0.03421725 -0.02884139  0.09138966
  -0.10314381  0.11350129  0.001

In [62]:
# View embeddings of sentence
# (rebinds `sentence`, which earlier cells used for other example texts)
sentence = "probably not the best I could come up with; I've done better before"
# A whole sentence is passed the same way as a single word: as one string in a list
emb_sentence = embedding_layer([sentence])
print(emb_sentence)

tf.Tensor(
[[ 4.04519975e-01 -8.58920813e-02  1.02527134e-01  3.40802670e-01
   2.62630075e-01 -1.27190039e-01  2.82863621e-02 -8.14228877e-02
  -8.18963051e-02  9.04136151e-02 -1.61296591e-01 -1.12930462e-01
  -1.84006859e-02 -2.41471574e-01 -7.11164623e-02  8.34719092e-02
   2.65555326e-02  3.86329219e-02  4.32475731e-02  2.83730209e-01
   3.48841809e-02  6.27125502e-02 -1.17918644e-02  7.55253881e-02
  -2.60421559e-02  1.56729799e-02  7.83098862e-02  4.18586396e-02
  -9.92093831e-02 -4.58515249e-02  9.09664184e-02  5.98778203e-02
   1.52102917e-01 -3.95471491e-02  4.83815074e-02 -4.54969006e-03
  -4.97848913e-02 -9.36332420e-02 -2.92435661e-02  2.25087211e-01
  -4.46661860e-02 -2.95981579e-02 -2.99884323e-02 -2.13128000e-01
   1.07423812e-01  1.06692448e-01 -8.57891440e-02  9.92878675e-02
   3.55719365e-02  4.00395654e-02  1.96300149e-01  1.76400065e-01
  -2.14213461e-01  3.72939697e-03 -1.86139762e-01  6.93075806e-02
  -2.29576960e-01 -8.15263316e-02  7.99960494e-02 -5.58803901e-02

In [64]:
# Import library for cosine similarity
# scikit-learn
from sklearn.metrics.pairwise import cosine_similarity

In [67]:
# Obtain word embeddings for two upbeat words and one downbeat word
happy_1, happy_2, sad_1 = "great", "awesome", "unfortunate"

# Embed each word in its own single-item batch, in the order listed
happy_1_emb, happy_2_emb, sad_1_emb = (
    embedding_layer([term]) for term in (happy_1, happy_2, sad_1)
)

In [68]:
# View cosine similarity of happy_1 and happy_2 ("great" vs "awesome")
cosine_similarity(happy_1_emb, happy_2_emb)

array([[0.7799858]], dtype=float32)

In [69]:
# View cosine similarity of happy_2 and sad_1 ("awesome" vs "unfortunate")
cosine_similarity(happy_2_emb, sad_1_emb)

array([[0.39718285]], dtype=float32)

In [71]:
# Obtain embeddings for two upbeat sentences and one downbeat sentence
happy_sen_1, happy_sen_2, sad_sen_1 = (
    "it was great meeting with you today",
    "really happy to see you again",
    "i wish we didnt see each other",
)

# Embed each sentence in its own single-item batch, in the order listed
happy_sen_1_emb, happy_sen_2_emb, sad_sen_1_emb = (
    embedding_layer([text]) for text in (happy_sen_1, happy_sen_2, sad_sen_1)
)

In [73]:
# View cosine similarity of happy_sen_1_emb and happy_sen_2_emb (the two positive sentences)
cosine_similarity(happy_sen_1_emb, happy_sen_2_emb)

array([[0.62235177]], dtype=float32)

In [74]:
# View cosine similarity of happy_sen_1_emb and sad_sen_1_emb (a positive vs the negative sentence)
cosine_similarity(happy_sen_1_emb, sad_sen_1_emb)

array([[0.33715943]], dtype=float32)