In [None]:
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import make_sampling_table, skipgrams
from tensorflow.random import log_uniform_candidate_sampler as sampler

In [None]:
# Creates skip-grams with 1 positive and a configurable number of negative samples using TensorFlow;
# the result is returned as three parallel lists (targets, contexts, labels).
def create_skipgrams_with_negative_samples(
    list_sentence, n_size, neg_sam_size, vocab_size, seed, generate_table=True
):
    """Build skip-gram training examples with negative sampling.

    For every sentence, all positive (target, context) pairs within the
    window are generated with Keras ``skipgrams``; each positive pair is then
    extended with ``neg_sam_size`` candidates drawn by TensorFlow's
    log-uniform candidate sampler.

    Args:
        list_sentence: Iterable of sentences; each one is passed as the
            ``sequence`` argument of ``skipgrams`` (presumably a list of
            integer word indices -- confirm against the caller).
        n_size: Window size handed to ``skipgrams`` as ``window_size``.
        neg_sam_size: Number of negative candidates drawn per positive pair.
        vocab_size: Vocabulary size; used for ``skipgrams``, for the sampling
            table and as ``range_max`` of the candidate sampler.
        seed: Seed forwarded to both ``skipgrams`` and the candidate sampler.
        generate_table: If True, a sampling table is built with
            ``make_sampling_table(size=vocab_size)`` and used to subsample
            words; if False, no sampling table is used.

    Returns:
        A tuple ``(target_l, sample_l, label_l)`` of three parallel lists with
        one entry per positive pair:
          * ``target_l``: the target word of the pair,
          * ``sample_l``: int64 tensor of shape ``(1 + neg_sam_size, 1)`` with
            the positive context word first, followed by the sampled candidates,
          * ``label_l``: int64 tensor of shape ``(1 + neg_sam_size,)`` holding
            ``1`` for the positive entry and ``0`` for every sampled one.
    """
    target_l, sample_l, label_l = [], [], []

    # The sampling table is meant for a frequency-ranked vocabulary: index 0 is
    # the non-word, index 1 the most frequent word, and so on. If the vocabulary
    # is not sorted that way, pass generate_table=False.
    sampling_table = make_sampling_table(size=vocab_size) if generate_table else None

    # The label vector is identical for every pair (one positive followed by
    # neg_sam_size negatives), so it is built once instead of once per pair.
    # Tensors are immutable, so sharing the same object across entries is safe.
    label = tf.constant([1] + [0] * neg_sam_size, dtype="int64")

    for sentence in tqdm(list_sentence):
        # negative_samples=0 makes skipgrams emit positive pairs only, so the
        # returned labels (all 1) carry no information and are discarded.
        # With sampling_table=None every word in the window is kept.
        pos_skipgrams, _ = skipgrams(
            sequence=sentence,
            vocabulary_size=vocab_size,
            window_size=n_size,
            shuffle=True,
            sampling_table=sampling_table,
            seed=seed,
            negative_samples=0,
        )

        for target, context_word in pos_skipgrams:
            target_l.append(target)
            label_l.append(label)

            # Shape (1, 1): the form the sampler expects for true_classes
            # when num_true=1.
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1
            )

            # Draw neg_sam_size distinct candidates (unique=True).
            # NOTE(review): nothing here filters the positive context word out
            # of the drawn candidates -- confirm whether a "negative" can
            # coincide with the positive one and whether that matters.
            negative_sampling_candidates, _, _ = sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=neg_sam_size,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling",
            )
            # (neg_sam_size,) -> (neg_sam_size, 1) so it can be stacked below
            # the positive context along axis 0.
            negative_sampling_candidates = tf.expand_dims(
                negative_sampling_candidates, 1
            )

            # Positive context first, then the sampled candidates; this order
            # matches the layout of `label`.
            context = tf.concat([context_class, negative_sampling_candidates], 0)
            sample_l.append(context)

    return target_l, sample_l, label_l