In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.layers import Embedding, Lambda, Dense
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.backend import mean
from tensorflow.python.keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
import numpy as np
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Looping over the detected devices (rather than indexing [0]) keeps
# this cell runnable on CPU-only machines, where the list is empty, and applies
# the same setting to every GPU when more than one is present.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [2]:
def strip_lines(line):
    """Turn one raw text line into a list of tokens.

    Parentheses, double quotes, single quotes, commas and newline characters
    are deleted outright (not replaced by spaces); what remains is split on
    runs of whitespace.

    Args:
        line: the raw string to clean.

    Returns:
        A list of token strings; empty if nothing but whitespace is left.
    """
    # One deletion table covers every character the cleaning step removes.
    unwanted = str.maketrans('', '', ')("\n,\'')
    return line.translate(unwanted).split()

In [3]:
# Read the corpus: every line of the file becomes one tokenised sentence.
with open('pary.txt', 'r') as file:
    sentences = [strip_lines(line) for line in file]

# Build the vocabulary in sorted order. Iterating a plain set of strings yields
# a different order on every kernel start (string hashing is randomised), which
# would assign different ids to the same word from run to run and make saved
# weights unusable with a freshly rebuilt vocabulary.
words = sorted({word for sentence in sentences for word in sentence})

# Ids start at 1; index 0 is never assigned to a word.
word2idx = {word: i + 1 for i, word in enumerate(words)}
idx2word = {i: word for word, i in word2idx.items()}

# Every sentence re-expressed as a list of integer word ids.
sequences = [[word2idx[word] for word in sentence] for sentence in sentences]

In [4]:
V = len(word2idx) + 1  # number of classes: every word id plus the extra index 0
win_size = 2           # how many neighbours are taken on each side of the target
X = []
Y = []

for seq in sequences:
    # Surround the sentence with win_size zeros on both sides so that every
    # position has a full window; out-of-range neighbours therefore show up as 0.
    padded = [0] * win_size + list(seq) + [0] * win_size
    for centre, target_word in enumerate(seq):
        # In `padded` the target sits at centre + win_size: the left neighbours
        # are the win_size items before it, the right neighbours the win_size after.
        left = padded[centre:centre + win_size]
        right = padded[centre + win_size + 1:centre + 2 * win_size + 1]
        X.append(left + right)
        Y.append(target_word)

# Context windows as an integer array; targets as one-hot rows of width V.
X = np.array(X)
Y = to_categorical(Y, num_classes=V)

In [5]:
# Both splits use the same settings: 30 % held out, fixed seed, shuffled.
# The first call keeps 70 % for training; the second divides the remaining 30 %
# again 70/30, so validation receives the larger share and test the smaller one.
split_options = dict(test_size=0.3, random_state=42, shuffle=True)

X_train, X_rest, Y_train, Y_rest = train_test_split(X, Y, **split_options)
X_val, X_test, Y_val, Y_test = train_test_split(X_rest, Y_rest, **split_options)

In [6]:
# Width of each word vector; also the size of the averaged context vector.
embedding_dim = 50

# The network: embed the 2 * win_size context ids, average the embeddings over
# the window axis, then predict the target word with a softmax over V classes.
model = Sequential()
model.add(Embedding(input_dim=V,
                    output_dim=embedding_dim,
                    input_length=2 * win_size,
                    embeddings_initializer='glorot_uniform'))
model.add(Lambda(lambda x: mean(x, axis=1), output_shape=(embedding_dim, )))
model.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Write the model to disk only when validation accuracy improves.
checkpoint = ModelCheckpoint(filepath='checkpointy_tft.h5',
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True)

In [7]:
# Train for 10 epochs, validating after each one; the checkpoint callback saves
# the model whenever validation accuracy improves. The returned History object
# is kept in `history` so the per-epoch metrics stay available to later cells
# instead of only appearing as a bare object repr in the cell output.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=2048,
    callbacks=[checkpoint],
)

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.05930, saving model to checkpointy_tft.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.05930 to 0.06455, saving model to checkpointy_tft.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.06455 to 0.06534, saving model to checkpointy_tft.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.06534 to 0.06549, saving model to checkpointy_tft.h5
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.06549 to 0.06564, saving model to checkpointy_tft.h5
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.06564 to 0.06585, saving model to checkpointy_tft.h5
Epoch 7/10

Epoch 00007: val_accuracy improved from 0.06585 to 0.06591, saving model to checkpointy_tft.h5
Epoch 8/10

Epoch 00008: val_accuracy improved from 0.06591 to 0.06602, saving model to checkpointy_tft.h5
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.06602
Epoch 10/10

Epoch 00010: val_accuracy improved from 0.06602 to 0.06603, saving 

<tensorflow.python.keras.callbacks.History at 0x2689ce60160>

In [8]:
# Score the in-memory model on the test split. With the metrics configured at
# compile time, evaluate() yields the loss followed by the accuracy.
test_metrics = model.evaluate(X_test, Y_test)
loss, accuracy = test_metrics

