In [None]:
# imports
import csv
import gc
import glob
import io
import os

import pandas as pd
# import_ipynb stays ahead of create_skipgrams, as in the original order
# (presumably create_skipgrams is a notebook that needs its import hook).
import import_ipynb
import spacy

import create_skipgrams
# Load the spaCy pipeline "de_core_news_lg" with the tok2vec, ner, parser and
# attribute_ruler components excluded, so only the remaining components run.
# NOTE(review): `nlp` is not referenced in the cells visible here — confirm it
# is still needed before paying the load cost.
nlp = spacy.load(
    "de_core_news_lg", exclude=["tok2vec", "ner", "parser", "attribute_ruler"]
)
from tqdm import tqdm
import datetime
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import make_sampling_table
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten

In [None]:
# model
# 4 layers

class Word2Vec(Model):
    """Keras model that scores (target, context) inputs via embedding dot products.

    The model keeps two independent embedding tables: one is looked up with
    the target input, the other with the context input. The target table is
    the layer named ``"w2v_embedding"``, which is what gets exported after
    training.

    Args:
        vocab_size: Number of rows in both embedding tables.
        embedding_dim: Width of every embedding vector.
        negative_samples: The context embedding is declared with
            ``input_length=negative_samples + 1``.
    """

    def __init__(self, vocab_size, embedding_dim, negative_samples):
        super().__init__()

        context_length = negative_samples + 1

        # Layer 1: embedding lookup for the target word.
        self.target_embedding = Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=1,
            name="w2v_embedding",
        )
        # Layer 2: embedding lookup for the context words.
        self.context_embedding = Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=context_length,
        )
        # Layer 3: dot product between context and target embeddings
        # (axis 3 of the first input against axis 2 of the second).
        self.dots = Dot(axes=(3, 2))
        # Layer 4: flatten the dot products into one vector per example.
        self.flatten = Flatten()

    def call(self, pair):
        """Return the flattened dot products for a ``(target, context)`` pair."""
        target_ids, context_ids = pair
        target_vectors = self.target_embedding(target_ids)
        context_vectors = self.context_embedding(context_ids)
        scores = self.dots([context_vectors, target_vectors])
        return self.flatten(scores)

In [1]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` has to support ``len()`` and slicing. The final slice is
    shorter when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    chunk_starts = range(0, total, n)
    for start in chunk_starts:
        # Slicing clamps the upper bound, so the last chunk is simply shorter.
        yield iterable[start : start + n]

In [None]:
# function to create word embeddings, with data (as number vectors), vocab with each word associated with its number,
# skipgram size, negative samples and dim as parameters for the embedding (as lists, to create multiple embeddings from the same data),
# labeling as the name prefix for saving, and a seed
def _collect_skipgram_samples(data, sg_size, ns, vocab_size, seed, chunk_size=10000):
    """Build the skip-gram training lists for ``data``, one chunk at a time."""
    target, sample, label = [], [], []
    for chunk in batch(data, chunk_size):
        t, s, l = create_skipgrams.create_skipgrams_with_negative_samples(
            chunk, sg_size, ns, vocab_size, seed, generate_table=True
        )
        # extend() grows the lists in place instead of copying everything
        # accumulated so far for each new chunk.
        target.extend(t)
        sample.extend(s)
        label.extend(l)
        gc.collect()
    return target, sample, label


def _save_embedding(weights, vocab, vec_string, out_dir="embeddings"):
    """Write the vector and metadata TSV files for one trained embedding."""
    vector_path = os.path.join(out_dir, "vector_" + vec_string + ".tsv")
    metadata_path = os.path.join(out_dir, "metadata_" + vec_string + ".tsv")
    # Context managers close both files even if a write fails half-way.
    with io.open(vector_path, "w", encoding="utf-8") as out_v:
        with io.open(metadata_path, "w", encoding="utf-8") as out_m:
            for index, word in enumerate(tqdm(vocab)):
                # Row ``index`` of the weight matrix is written next to the
                # ``index``-th entry yielded by iterating ``vocab``.
                out_v.write("\t".join(str(x) for x in weights[index]) + "\n")
                out_m.write(word + "\n")


def create_word_embeddings(
    data, vocab, skipgram_size, negative_samples, dim, labeling, seed=1234, epochs=25
):
    """Train and save one word2vec embedding per parameter combination.

    For every combination of skip-gram size and negative-sample count the
    training examples are generated once, then one ``Word2Vec`` model is
    trained per embedding dimension. Each trained target embedding is written
    to ``embeddings/vector_<name>.tsv`` with a matching
    ``embeddings/metadata_<name>.tsv``; TensorBoard logs go under
    ``logs/fit/``.

    Args:
        data: Sized, sliceable collection of sentences, handed in chunks to
            ``create_skipgrams.create_skipgrams_with_negative_samples``
            (presumably sequences of word ids — confirm against that helper).
        vocab: Iterable of words; ``len(vocab)`` is used as the vocabulary
            size and the i-th word is saved with row i of the embedding.
        skipgram_size: List of skip-gram sizes to try.
        negative_samples: List of negative-sample counts to try.
        dim: List of embedding dimensions to try.
        labeling: Prefix used in the names of the saved files.
        seed: Seed forwarded to the skip-gram generator.
        epochs: Number of training epochs per model.

    Returns:
        List with the object returned by ``fit`` for every trained model, in
        training order.

    Raises:
        AssertionError: If ``skipgram_size``, ``negative_samples`` or ``dim``
            is not a list.
        ValueError: If ``data`` is empty.
    """
    # checking inputs are lists
    assert (
        isinstance(skipgram_size, list)
        and isinstance(negative_samples, list)
        and isinstance(dim, list)
    ), "skipgram_size, negative_samples, dim are supposed to be lists"
    if len(data) == 0:
        raise ValueError("data must contain at least one sentence")

    AUTOTUNE = tf.data.AUTOTUNE
    BATCH_SIZE = 1024
    BUFFER_SIZE = 10000
    vocab_size = len(vocab)
    history = []

    # Create the output directory up front so a missing folder cannot cost a
    # finished training run.
    os.makedirs("embeddings", exist_ok=True)

    # creating skipgrams
    for sg_size in skipgram_size:
        for ns in negative_samples:
            target, sample, label = _collect_skipgram_samples(
                data, sg_size, ns, vocab_size, seed
            )
            print(
                "generated data for skipgram: "
                + str(sg_size)
                + " and negative samples: "
                + str(ns)
            )

            # creating dataset, shuffling and prefetching
            # NOTE(review): cache() sits after shuffle(), so per the tf.data
            # docs the batches of the first pass are replayed in the same
            # order on later epochs — move cache() ahead of shuffle() if
            # per-epoch reshuffling is wanted.
            dataset = tf.data.Dataset.from_tensor_slices(((target, sample), label))
            dataset = dataset.shuffle(BUFFER_SIZE).batch(
                BATCH_SIZE, drop_remainder=True
            )
            dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

            for em_size in dim:

                # embedding-model
                # NOTE(review): Word2Vec adds 1 to its negative_samples
                # argument again, so the context input_length becomes ns + 2 —
                # confirm this matches the width produced by create_skipgrams.
                word2vec = Word2Vec(vocab_size, em_size, ns + 1)
                log_dir = (
                    "logs/fit/"
                    + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
                    + "dim{dim}dnegsamp{ns}skipgram{sks}".format(
                        dim=em_size, ns=ns, sks=sg_size
                    )
                )
                tensorboard_callback = tf.keras.callbacks.TensorBoard(
                    log_dir=log_dir, histogram_freq=1
                )
                print(
                    "started compilation with dim{dim}, negative_samples: {ns}, skipgram_size: {sks}".format(
                        dim=em_size, ns=ns, sks=sg_size
                    )
                )

                # create embedding
                word2vec.compile(
                    optimizer="adam",
                    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                    metrics=["accuracy"],
                )
                h = word2vec.fit(
                    dataset, epochs=epochs, callbacks=[tensorboard_callback]
                )
                history.append(h)

                # get resulting vectors
                weights = word2vec.get_layer("w2v_embedding").get_weights()[0]

                # saving embedding
                vec_string = (
                    labeling + str(em_size) + "d-" + str(sg_size) + "-" + str(ns)
                )
                _save_embedding(weights, vocab, vec_string)
    # returning history
    return history