<a href="https://colab.research.google.com/github/LCaravaggio/NLP/blob/main/03_redes/CBOW_y_SkipGram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CBOW

In [1]:
from nltk.corpus import gutenberg
import nltk
import numpy as np
from string import punctuation
import re

In [2]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
stop_words = nltk.corpus.stopwords.words('english')

In [4]:
bible = gutenberg.sents("bible-kjv.txt")
remove_terms = punctuation + '0123456789'

In [5]:
bible

[['[', 'The', 'King', 'James', 'Bible', ']'], ['The', 'Old', 'Testament', 'of', 'the', 'King', 'James', 'Bible'], ...]

In [6]:
wpt = nltk.WordPunctTokenizer()

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc,re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [7]:
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

In [8]:
norm_bible[:5]

['king james bible',
 'old testament king james bible',
 'first book moses called genesis',
 'beginning god created heaven earth',
 'earth without form void darkness upon face deep']

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence


tokenizer = Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

In [10]:
# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text_to_word_sequence(doc)] for doc in norm_bible]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 12425
Vocabulary Sample: [('shall', 1), ('unto', 2), ('lord', 3), ('thou', 4), ('thy', 5), ('god', 6), ('ye', 7), ('said', 8), ('thee', 9), ('upon', 10)]


In [11]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            start = max(0, index - window_size)
            end = min(sentence_length, index + window_size + 1)
            context_words = [words[i] for i in range(start, end) if i != index]
            label_word = word

            if len(context_words) == 0:
                continue

            x = pad_sequences([context_words], maxlen=context_length, padding='post', truncating='post')
            y = to_categorical([label_word], num_classes=vocab_size)
            yield x, y

In [12]:
import tensorflow as tf

# build CBOW architecture
cbow = tf.keras.Sequential()
cbow.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size))
cbow.add(tf.keras.layers.Lambda(lambda x: tf.keras.backend.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         1242500   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 12425)             1254925   
Total params: 2,497,425
Trainable params: 2,497,425
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
#Este entrenamiento tarda unas cuantas horas. Más incluso que la cuota libre de Colab. Corre local y no en clase

for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)

        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 1 	Loss: 4286634.107102927

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 2 	Loss: 5572901.875400365

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 3 	Loss: 5602049.088066552

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 4 	Loss: 5306262.972947944

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs
Epoch: 5 	Loss: 4765571.131131787



In [14]:
import pandas as pd
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(12424, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
unto,-2.223604,2.86614,-2.132128,-2.797038,2.601964,2.681766,-2.872133,-2.58319,-2.300686,-2.342377,...,-2.618478,-2.057792,-3.091557,-2.620062,-3.111323,-2.372203,-2.468651,2.786049,-3.365383,-2.642241
lord,-1.97176,2.02588,-1.871813,-2.63096,1.844649,2.134659,-2.308367,-2.249089,-1.853627,-2.492036,...,-2.294261,-1.557231,-2.767344,-1.919199,-2.471757,-2.223515,-2.739373,2.338367,-2.891318,-2.369243
thou,-1.575344,1.881439,-2.076868,-2.321306,1.814253,2.600374,-2.433875,-2.268668,-1.838522,-1.919915,...,-2.383743,-2.052448,-2.069975,-2.088795,-1.974004,-2.089131,-1.774704,2.263765,-2.727866,-1.878055
thy,-1.868516,2.095453,-2.16252,-2.344458,2.252535,2.563588,-2.368776,-2.587917,-1.901774,-2.274439,...,-2.156907,-2.147224,-2.367692,-2.516188,-2.642004,-2.152353,-2.485777,2.384677,-3.720293,-2.28633
god,-1.902769,2.590968,-2.166515,-2.294959,2.159779,2.387039,-2.059349,-2.666387,-2.395756,-2.15142,...,-2.238323,-2.098771,-2.594616,-2.02505,-2.38227,-2.340272,-2.612186,2.441282,-2.854055,-2.107414


In [15]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                   for search_term in ['god', 'jesus', 'noah', 'egypt', 'john', 'gospel', 'moses','famine']}

similar_words

(12424, 12424)


{'god': ['ye', 'also', 'made', 'unto', 'one'],
 'jesus': ['spirit', 'time', 'many', 'cast', 'heaven'],
 'noah': ['flood', 'kind', 'uncleanness', 'likeness', 'birds'],
 'egypt': ['smote', 'cut', 'servants', 'princes', 'anger'],
 'john': ['peter', 'knew', 'whether', 'entered', 'new'],
 'gospel': ['hope', 'grace', 'hearts', 'entered', 'sound'],
 'moses': ['kept', 'power', 'prophets', 'whole', 'received'],
 'famine': ['fallen', 'slay', 'consumed', 'edom', 'rulers']}

# Skip-Gram

In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(norm_bible)

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

vocab_size = len(word2id) + 1


wids = [[word2id[w] for w in text_to_word_sequence(doc)] for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:5])

Vocabulary Size: 12425
Vocabulary Sample: [('shall', 1), ('unto', 2), ('lord', 3), ('thou', 4), ('thy', 5)]


In [20]:
from tensorflow.keras.preprocessing.sequence import skipgrams

# generate skip-grams
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0],
          id2word[pairs[i][1]], pairs[i][1],
          labels[i]))

(bible (5766), king (13)) -> 1
(bible (5766), drove (2932)) -> 0
(bible (5766), james (1154)) -> 1
(james (1154), king (13)) -> 1
(james (1154), sorcery (11693)) -> 0
(king (13), weaken (10773)) -> 0
(james (1154), bible (5766)) -> 1
(king (13), james (1154)) -> 1
(king (13), shecaniah (7503)) -> 0
(james (1154), ithream (7275)) -> 0


In [30]:
# build skip-gram architecture
embed_size = 100
word_model = tf.keras.Sequential()
word_model.add(tf.keras.layers.Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(tf.keras.layers.Reshape((embed_size, )))

context_model = tf.keras.Sequential()
context_model.add(tf.keras.layers.Embedding(vocab_size, embed_size,
                  embeddings_initializer="glorot_uniform",
                  input_length=1))
context_model.add(tf.keras.layers.Reshape((embed_size,)))

merged_output = tf.keras.layers.add([word_model.output, context_model.output])

model_combined = tf.keras.Sequential()
model_combined.add(tf.keras.layers.Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))

final_model = tf.keras.models.Model([word_model.input, context_model.input], model_combined(merged_output))
final_model.compile(loss="mean_squared_error", optimizer="rmsprop")

final_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_7_input (InputLayer)  [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_8_input (InputLayer)  [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 100)       1242500     embedding_7_input[0][0]          
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1, 100)       1242500     embedding_8_input[0][0]          
______________________________________________________________________________________________

In [33]:
for epoch in range(1, 3):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += final_model.train_on_batch(X,Y)

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Processed 10000 (skip_first, skip_second, relevance) pairs
Processed 20000 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 3634.3396557169035
Processed 0 (skip_first, skip_second, relevance) pairs
Processed 10000 (skip_first, skip_second, relevance) pairs
Processed 20000 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 3199.733184985118


In [34]:
from sklearn.metrics.pairwise import euclidean_distances
word_embed_layer = word_model.layers[0]
weights = word_embed_layer.get_weights()[0][1:]

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                   for search_term in ['god', 'jesus','egypt', 'john', 'famine']}

similar_words

(12424, 12424)


{'god': ['man', 'way', 'lord', 'upon', 'stand'],
 'jesus': ['marvel', 'stir', 'offend', 'searcheth', 'imputed'],
 'egypt': ['drew', 'mount', 'gilead', 'battle', 'wickedness'],
 'john': ['pleasures', 'council', 'denied', 'angels', 'devil'],
 'famine': ['strange', 'tents', 'guard', 'pay', 'army']}