In [1]:
import numpy as np
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Embedding
from keras.callbacks import LambdaCallback
import string

def clean_document(doc):
	"""Turn raw text into a list of lower-cased, purely alphabetic words.

	Steps, in order: '--' is replaced by a space, the text is split on
	whitespace, every character in ``string.punctuation`` is removed from
	each token, tokens that are not entirely alphabetic (including tokens
	left empty) are dropped, and the survivors are lower-cased.

	Args:
		doc: the raw document as a single string.

	Returns:
		A list of cleaned word strings, in document order.
	"""
	strip_punctuation = str.maketrans('', '', string.punctuation)
	# a double hyphen joins two words, so treat it as a separator
	pieces = doc.replace('--', ' ').split()
	stripped = (piece.translate(strip_punctuation) for piece in pieces)
	# the alphabetic check runs on the stripped token, before lower-casing
	return [token.lower() for token in stripped if token.isalpha()]

# Load the whole corpus into memory. The file is opened in text mode with
# the platform's default encoding and closed as soon as the read finishes.
with open('text.txt') as f:
	text = f.read()

# Flat list of cleaned, lower-cased words in document order.
words = clean_document(text)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# One-hot encode all words.
# `words` is a list of single-word strings, so every row of the resulting
# matrix is the one-hot vector of one word of the corpus, in corpus order.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)
# Keras word indices start at 1; index 0 is reserved, hence the +1.
vocab_size = len(tokenizer.word_index) + 1
one_hot_matrix = tokenizer.texts_to_matrix(words)

# Save the tokenizer. The context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
# The tokenizer is reloaded from 'tokenizer.pkl' when it is needed again.
del tokenizer

In [3]:
# Skip-Gram
# Build (center word, context word) training pairs: for every position in
# the corpus, pair its one-hot row with the one-hot row of each neighbour
# at most `window_size` positions to the left or to the right.
X = []
y = []
window_size = 2
corpus_length = len(one_hot_matrix)
for index, row in enumerate(one_hot_matrix):
    context_start = max(index - window_size, 0)
    # range() excludes its stop value, so +1 is required for the window to
    # reach `window_size` words to the right as well as to the left.
    context_stop = min(index + window_size + 1, corpus_length)
    for j in range(context_start, context_stop):
        # a word is not its own context
        if index != j:
            X.append(row)
            y.append(one_hot_matrix[j])

# release memory
# del one_hot_matrix

# convert to numpy arrays: one row per (center, context) pair
X = np.asarray(X)
y = np.asarray(y)

In [4]:
# Peek at the first three inputs and their matching targets.
print(X[:3], y[:3], sep='\n')

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
# Feed-forward network: one-hot word in, softmax over the vocabulary out.
# The two 50-unit layers use no activation, i.e. they are linear.
model = Sequential()
model.add(Dense(50, input_shape=(vocab_size,)))
model.add(Dense(50))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot rows, so categorical cross-entropy is the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                75700     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_3 (Dense)              (None, 1513)              77163     
Total params: 155,413
Trainable params: 155,413
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Train on the (center, context) pairs: 5 passes over the data in
# mini-batches of 100. The returned History object is the cell's output.
model.fit(x=X, y=y, batch_size=100, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ffcc00f978>

In [8]:
# Fetch the weights once instead of rebuilding the list for every access.
# Order: kernel and bias of the first Dense layer, then of the second.
weights = model.get_weights()
print(weights[0].shape)
print(weights[1].shape)
print(weights[2].shape)
print(weights[3].shape)

# The first layer's kernel has one 50-dimensional row per vocabulary index;
# those rows are the learned word embeddings. Writing inside a context
# manager guarantees the file is flushed and closed.
with open('embeddings.pkl', 'wb') as embeddings_file:
    dump(weights[0], embeddings_file)

(1513, 50)
(50,)
(50, 50)
(50,)


In [9]:
import pickle
# Load the tokenizer that an earlier cell wrote to 'tokenizer.pkl'.
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook or another trusted source.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [10]:
# Predict the most likely context word index for the word 'is'.
# The model expects a one-hot vocabulary vector. Rows of X are training
# pairs in corpus order, so slicing X by a vocabulary index would select
# an unrelated example. Encode the query the way the training inputs were
# encoded instead.
query = tokenizer.texts_to_matrix(['is'])
model.predict_classes(query)[0]

3