In [2]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras import Sequential, Input, layers
import tensorflow_datasets as tfds
from gensim.models import Word2Vec
import gensim.downloader

# (3) Custom embedding with layers.Embedding

Learn an embedding that is specifically tailored to your task by training a `layers.Embedding` layer together with the rest of the model.

In [3]:
# Our RNN input will look like this tensor
# X.shape = (n_sentences, max_sentence_length, embedding_dim)

# Let's create some mock data
def get_mock_up_data():
    """Build a three-sentence toy sentiment dataset, tokenized and padded.

    Prints the number of distinct words found by the tokenizer.

    Returns:
        tuple: ``(X_pad, y, vocab_size)`` where ``X_pad`` is the float32 array
        of token ids, padded at the end of each row up to the length of the
        longest sentence, ``y`` is the array of labels (one per sentence) and
        ``vocab_size`` is the number of entries in the tokenizer's word index.
    """
    corpus = [
        'Deep learning is super easy',
        'Deep learning was super bad and too long',
        'This is the best lecture of the camp!',
    ]
    labels = np.array([1., 0., 0.])

    # Learn the word -> integer id mapping from the corpus
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index)
    print(f'There are {vocab_size} different words in your corpus')

    # Replace every word by its id, then pad at the end ('post') so that
    # all rows share the same length
    token_ids = tokenizer.texts_to_sequences(corpus)
    padded = pad_sequences(token_ids, dtype='float32', padding='post')

    return padded, labels, vocab_size

# Build the mock dataset and inspect the padded token-id matrix
X_pad, y, vocab_size = get_mock_up_data()
print("X_pad.shape", X_pad.shape)
X_pad

There are 16 different words in your corpus
X_pad.shape (3, 8)


array([[ 1.,  2.,  3.,  4.,  6.,  0.,  0.,  0.],
       [ 1.,  2.,  7.,  4.,  8.,  9., 10., 11.],
       [12.,  3.,  5., 13., 14., 15.,  5., 16.]], dtype=float32)

In [4]:
# Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBED_DIM, mask_zero=True)

# Dimension of the embedding space, i.e. length of the vector learned for each word
embedding_size = 100

model = Sequential([
    Input(shape=X_pad.shape[1:]),
    layers.Embedding(
        input_dim=vocab_size + 1,  # 16 words + 1 slot for the padding id 0
        output_dim=embedding_size,
        mask_zero=True,  # built-in masking of the padding id 0
    ),
    layers.LSTM(20),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

In [5]:
# One row of `embedding_size` weights per token id (the vocabulary plus the padding id 0)
print(f'Expected number of parameters in Embedding: {(vocab_size+1) * embedding_size}')

Expected number of parameters in Embedding: 1700


In [6]:
# Binary classification set-up: the sigmoid output is paired with binary cross-entropy
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Short, silent (verbose=0) training run on the mock data
model.fit(
    X_pad,
    y,
    batch_size=16,
    epochs=5,
    verbose=0,
)

<keras.src.callbacks.history.History at 0x2223a74f860>

# (4.2) Word2vec: Implementation with Gensim

In [7]:
# Fetch the IMDB reviews: train and test splits, each as (text, label) pairs
train_data, test_data = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    batch_size=-1,
    as_supervised=True,
)

# Convert both splits to NumPy
train_sentences, train_labels = tfds.as_numpy(train_data)
test_sentences, test_labels = tfds.as_numpy(test_data)

# Peek at the first two reviews
train_sentences[:2]

# Each review is still one raw byte string: we have to split it into a list of words ourselves







array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot 

In [8]:
# Decode every raw review (bytes -> str) and split it into a list of words
# with the Keras helper `text_to_word_sequence`
X_train = [text_to_word_sequence(review.decode("utf-8")) for review in train_sentences]
X_test = [text_to_word_sequence(review.decode("utf-8")) for review in test_sentences]

X_train[:2]

[['this',
  'was',
  'an',
  'absolutely',
  'terrible',
  'movie',
  "don't",
  'be',
  'lured',
  'in',
  'by',
  'christopher',
  'walken',
  'or',
  'michael',
  'ironside',
  'both',
  'are',
  'great',
  'actors',
  'but',
  'this',
  'must',
  'simply',
  'be',
  'their',
  'worst',
  'role',
  'in',
  'history',
  'even',
  'their',
  'great',
  'acting',
  'could',
  'not',
  'redeem',
  'this',
  "movie's",
  'ridiculous',
  'storyline',
  'this',
  'movie',
  'is',
  'an',
  'early',
  'nineties',
  'us',
  'propaganda',
  'piece',
  'the',
  'most',
  'pathetic',
  'scenes',
  'were',
  'those',
  'when',
  'the',
  'columbian',
  'rebels',
  'were',
  'making',
  'their',
  'cases',
  'for',
  'revolutions',
  'maria',
  'conchita',
  'alonso',
  'appeared',
  'phony',
  'and',
  'her',
  'pseudo',
  'love',
  'affair',
  'with',
  'walken',
  'was',
  'nothing',
  'but',
  'a',
  'pathetic',
  'emotional',
  'plug',
  'in',
  'a',
  'movie',
  'that',
  'was',
  'devoid',

In [9]:
# Train a Word2Vec embedding on all the word lists of the train set;
# `vector_size=10` makes every word vector 10-dimensional
word2vec = Word2Vec(sentences=X_train, vector_size=10)

# Look up the learned vector of a single word
word2vec.wv['hello']

array([ 0.12063502,  0.03280268,  0.60021216, -0.41862488,  0.06843155,
        0.3019776 , -0.02993482, -0.5249786 , -1.3902066 , -1.1858629 ],
      dtype=float32)

In [10]:
# The 10 words whose vectors are closest to that of `movie`, with their similarity scores
word2vec.wv.most_similar('movie', topn=10)

[('film', 0.9694231748580933),
 ('one', 0.9251149892807007),
 ('thing', 0.9249079823493958),
 ('sequel', 0.9088228344917297),
 ('still', 0.9034302830696106),
 ('it', 0.8977356553077698),
 ('comment', 0.8869630694389343),
 ('word', 0.8842864632606506),
 ('fun', 0.8837607502937317),
 ('effort', 0.8834584355354309)]

In [11]:
# The `vector_size` keyword sets the dimension of the embedding space

# Only the first 1000 reviews are used here to keep the training short
word2vec = Word2Vec(sentences=X_train[:1000], vector_size=50)

# Each word vector now has `vector_size` components
len(word2vec.wv['computer'])

50

In [12]:
# Word2Vec only learns a representation for words that appear at least `min_count` times.
# This avoids learning representations from just a handful of examples.

word2vec = Word2Vec(sentences=X_train[:1000], vector_size=50, min_count=5)

try:
    len(word2vec.wv['columbian'])
except KeyError:
    # Looking up a word that was filtered out of the vocabulary raises KeyError;
    # any other exception is a real problem and must not be swallowed here
    print("Word seen less than 5 times, and is thus excluded from corpus")

Word seen less than 5 times, and is thus excluded from corpus


In [14]:
# As mentioned earlier, Word2Vec trains an internal neural network.
# The goal of this network is to predict a word in a corpus based on the words around it.
# The surrounding words taken into account form the window.
# `window` sets how far from word W (counted in words) a neighbour may be and still be used to predict W.

word2vec = Word2Vec(sentences=X_train[:10000], vector_size=100, window=5, min_count=1)

## Pre-trained word embeddings (transfer learning)

In [15]:
# Rather than training on your own (possibly very small) training set,
# you can download an embedding that has already been trained

# Names of the models available through the gensim downloader
print(list(gensim.downloader.info()['models']))

# NOTE(review): judging by its name, this is a GloVe model with vectors of size 50
model_wiki = gensim.downloader.load('glove-wiki-gigaword-50')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [16]:
# Nearest neighbours of `movie` in the pretrained embedding
model_wiki.most_similar('movie', topn=10)

[('movies', 0.9322481155395508),
 ('film', 0.9310100078582764),
 ('films', 0.8937394618988037),
 ('comedy', 0.8902585506439209),
 ('hollywood', 0.8718216419219971),
 ('drama', 0.8341657519340515),
 ('sequel', 0.8222616314888),
 ('animated', 0.8216581344604492),
 ('remake', 0.812495768070221),
 ('show', 0.8105834126472473)]

## Arithmetic on words

In [17]:
# Retrain on the first 10000 reviews with a narrow window and min_count=10
word2vec = Word2Vec(
    sentences=X_train[:10000],
    vector_size=30,
    window=2,
    min_count=10,
)

# Fetch the vectors of the three words involved in the analogy
v_queen, v_king, v_man = (word2vec.wv[word] for word in ('queen', 'king', 'man'))

# queen - king + man: the arithmetic is carried out on the word vectors
v_result = v_queen - v_king + v_man

# Words whose vectors lie closest to the resulting vector
word2vec.wv.similar_by_vector(v_result)

[('woman', 0.9082302451133728),
 ('girl', 0.9080888032913208),
 ('man', 0.8935427665710449),
 ('guy', 0.855695903301239),
 ('cop', 0.8050817251205444),
 ('boy', 0.7836461663246155),
 ('doctor', 0.7555070519447327),
 ('town', 0.7552664875984192),
 ('lady', 0.7375656366348267),
 ('victim', 0.6935808062553406)]

# (5) CNNs for NLP

In [21]:
# 1D convolutions are the answer.
# One-dimensional convolutions (layers.Conv1D(...)) do exactly what you want:
# they are convolutions that "slide" along the word axis, word by word.

# RNN
rnn = Sequential([
    Input(shape=X_pad.shape[1:]),
    layers.Embedding(input_dim=5000, output_dim=30, mask_zero=True),
    layers.LSTM(20),
    layers.Dense(1, activation="sigmoid"),
])

# Conv1D
# No `mask_zero` here: Conv1D does not support masking, so a mask produced by
# the Embedding layer would not be taken into account by the convolution anyway.
cnn = Sequential([
    Input(shape=X_pad.shape[1:]),
    layers.Embedding(input_dim=5000, output_dim=30),
    layers.Conv1D(20, kernel_size=3),
    layers.Flatten(),
    layers.Dense(1, activation="sigmoid"),
])

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
rnn.summary()
cnn.summary()



None


None
