In [115]:
from tensorflow.python.keras.layers import Embedding, Dot, Input, Flatten
from tensorflow.python.keras.models import Model
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.callbacks import ModelCheckpoint
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.io as pio

In [2]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [3]:
def strip_lines(line):
    line = line.replace(')', '')
    line = line.replace('(', '')
    line = line.replace('\"', '')
    line = line.replace('\n', '')
    line = line.replace(',', '')
    line = line.replace('\'', '')
    line = line.split()
    return line

In [4]:
sentences = []
with open('pary.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        sentences.append(strip_lines(line))

In [5]:
words = set(word for sentence in sentences for word in sentence)
word2idx = {word: i + 1 for i, word in enumerate(words)}
idx2word = {i: word for word, i in word2idx.items()}

In [27]:
def generate_training_data(sentences, word2idx, window_size, num_negative_samples):
    X, Y, vec = [], [], []
    for sentence in sentences:
        sentence = [word2idx[word] for word in sentence]
        for i, target_word in enumerate(sentence):
            for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
                if i != j:
                    X.append(target_word)
                    Y.append(sentence[j])
                    vec.append(1)
                    negative_samples = np.random.choice(list(word2idx.values()), size=num_negative_samples)
                    for neg_word in negative_samples:
                        X.append(target_word)
                        Y.append(neg_word)
                        vec.append(0)
    return np.array(X), np.array(Y), vec

In [28]:
X, Y, vec = generate_training_data(sentences, word2idx, window_size=3, num_negative_samples=1)
V = len(words)

In [31]:
X_train, X_rest, Y_train, Y_rest, vec_train, vec_rest = train_test_split(X, Y, vec, test_size=0.3, random_state=42)
X_test, X_val, Y_test, Y_val, vec_test, vec_val = train_test_split(X_rest, Y_rest, vec_rest, test_size=0.3, random_state=42)

input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(V, 50, input_length=1, name='embedding')
target_embedding = embedding(input_target)
context_embedding = embedding(input_context)

dot_product = Dot(axes=-1)([target_embedding, context_embedding])
output = Flatten()(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint = ModelCheckpoint(filepath='checkpointy_tft_negative.h5',
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True
                             )

In [33]:
model.fit([X_train, Y_train], np.array(vec_train),
          validation_data=([X_val, Y_val], np.array(vec_val)),
         epochs=50, batch_size=4096, callbacks=[checkpoint])

Epoch 1/50

Epoch 00001: val_accuracy improved from 0.76139 to 0.76231, saving model to checkpointy_tft_negative.h5
Epoch 2/50

Epoch 00002: val_accuracy improved from 0.76231 to 0.76360, saving model to checkpointy_tft_negative.h5
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.76360
Epoch 4/50

Epoch 00004: val_accuracy improved from 0.76360 to 0.76454, saving model to checkpointy_tft_negative.h5
Epoch 5/50

Epoch 00005: val_accuracy improved from 0.76454 to 0.76617, saving model to checkpointy_tft_negative.h5
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.76617
Epoch 7/50

Epoch 00007: val_accuracy improved from 0.76617 to 0.76786, saving model to checkpointy_tft_negative.h5
Epoch 8/50

Epoch 00008: val_accuracy improved from 0.76786 to 0.76846, saving model to checkpointy_tft_negative.h5
Epoch 9/50

Epoch 00009: val_accuracy improved from 0.76846 to 0.76898, saving model to checkpointy_tft_negative.h5
Epoch 10/50

Epoch 00010: val_accuracy improved from 0

<tensorflow.python.keras.callbacks.History at 0x204baaf2ac0>

In [34]:
loss, accuracy = model.evaluate([X_test, Y_test], np.array(vec_test))



In [144]:
pio.renderers.default = 'iframe'

In [147]:
weights = embedding.get_weights()[0]
labels = np.arange(1, 449)

pca = PCA(n_components = 1)
data = pca.fit_transform(weights)
data = np.array(data[:, 0])

trace = go.Scatter(
    x=labels,
    y=data,
    mode='markers',
    marker=dict(
        color='rgba(0, 0, 200, .8)',
        size=10
    ),
    hoverinfo='text',
    text=[f'Word: {w}<br>Embedding: {e}'
          for w, e in zip(idx2word.values(), data)]
)

layout = go.Layout(
    title='PCA',
    xaxis=dict(title='Word'),
    yaxis=dict(title='Embedding'),
    hovermode='x',
)

fig = go.Figure(data=[trace], layout=layout)

fig.show()