In [857]:
# Imports
import os
from os.path import join as join_path
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
import pickle
import pandas as pd
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize

import tensorflow as tf
tf.random.set_seed(rng_seed)
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.sequence import skipgrams, make_sampling_table
from tensorflow.keras.utils import plot_model, to_categorical, Progbar
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
AUTOTUNE = tf.data.experimental.AUTOTUNE

import multiprocessing
num_cores = multiprocessing.cpu_count()

from models import build_word2vec_model

In [2]:
# Define constants
# ----------------
data_dir = 'data'
checkpoints_dir = 'checkpoints'
fil9_data_tokenizer_config_path = join_path(data_dir, 'fil9-tokenizer.json')
fil9_data_sentences_path = join_path(data_dir, 'fil9-sents.p')
fil9_data_sequences_path = join_path(data_dir, 'fil9-seqs.p')
os.makedirs(checkpoints_dir, exist_ok=True)

# Training constants
sampling_window_size = 5
num_negative_samples = 15
min_word_count = 5
embedding_dim = 300
epochs = 20
# ----------------

# Load data

In [3]:
# Load fil9 sentences from data preprocessing
with open(fil9_data_sentences_path, 'rb') as file:
    fil9_sents = pickle.load(file)
num_sents = len(fil9_sents)

# Load fil9 sequences from data preprocessing
with open(fil9_data_sequences_path, 'rb') as file:
    fil9_seqs = pickle.load(file)
    
print('Reading vocabulary...')

# Read tokenizer from file
with open(fil9_data_tokenizer_config_path, 'r') as file:
    tokenizer = tokenizer_from_json(file.read())
print('Done!')

Reading vocabulary...
Done!


In [597]:
num_sents

6245051

In [607]:
tokenizer.index_word[0]

KeyError: 0

In [598]:
len(tokenizer.word_index)

840420

## Create data generator

In [647]:
class SkipgramNegativeSamplingDataGenerator():
    '''
    TODO: Docs
    '''
    
    def __init__(
        self,
        texts: list,
        tokenizer: Tokenizer,
        shuffle_size: int,
        batch_size: int,
        epochs: int,
        sampling_window_size: int,
        num_negative_samples: int,
        sampling_factor: float = 1e-5
    ):
        '''
        TODO: Docs
        '''
        self.texts = texts
        self.tokenizer = tokenizer
        self.shuffle_size = shuffle_size
        self.batch_size = batch_size
        self.epochs = epochs
        
        self.vocab_size = len(tokenizer.word_index)
        self.sampling_table = make_sampling_table(self.vocab_size, sampling_factor=sampling_factor)
        self.sampling_window_size = sampling_window_size
        self.num_negative_samples = num_negative_samples
    
    def text_to_sequence(self, text_tensor: tf.Tensor):
        '''
        TODO: Docs
        '''
        # Extract text from tensor
        text_str = text_tensor.numpy().decode('utf8')
        
        # Convert text to sequence
        [sequence] = self.tokenizer.texts_to_sequences([text_str])
        
        # Create skipgram pairs
        skipgram_pairs, skipgram_labels = skipgrams(
            sequence,
            self.vocab_size,
            sampling_table=self.sampling_table,
            window_size=self.sampling_window_size,
            negative_samples=self.num_negative_samples
        )
        skipgram_pairs = np.array(skipgram_pairs)
        skipgram_labels = np.array(skipgram_labels).reshape(-1, 1)
        
        return skipgram_pairs, skipgram_labels
    
    def text_to_sequence_wrapper(self, text_tensor: tf.Tensor):
        '''
        TODO: Docs
        '''
        # Convert text to sequence
        (skipgram_pairs, skipgram_labels) = tf.py_function(
            func=self.text_to_sequence,
            inp=[text_tensor],
            Tout=[tf.int32, tf.int32]
        )
        print(skipgram_pairs, skipgram_labels)
        
        return skipgram_pairs, skipgram_labels #tf.data.Dataset.from_tensor_slices(sequence)
    
    def sequences_to_skipgram_ns_pairs(self, sequences_tensor: tf.Tensor):
        '''
        TODO: Docs
        '''
        return sequences_tensor
        #return tf.data.Dataset.from_tensor_slices(sequences_tensor)
    
    def create(self):
        '''
        TODO: Docs
        '''
        # Create tf.data dataset
        dataset = tf.data.Dataset.from_tensor_slices(self.texts)
        dataset = dataset.map(
            self.text_to_sequence_wrapper,
            num_parallel_calls=AUTOTUNE
        )
        dataset = dataset.shuffle(self.shuffle_size)
        #dataset.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=self.batch_size))
        dataset = dataset.batch(self.batch_size)
        #dataset = dataset.map(
        #    self.sequences_to_skipgram_ns_pairs,
        #    num_parallel_calls=AUTOTUNE
        #)
        #dataset = dataset.repeat(self.epochs)
        #dataset = dataset.prefetch(buffer_size=AUTOTUNE)
        
        return dataset

In [648]:
data_gen = SkipgramNegativeSamplingDataGenerator(
    texts=fil9_sents[:10],
    tokenizer=tokenizer,
    shuffle_size=10,
    batch_size=10,
    epochs=epochs,
    sampling_window_size=5,
    num_negative_samples=10
)
data_gen = data_gen.create()

Tensor("EagerPyFunc:0", dtype=int32, device=/job:localhost/replica:0/task:0) Tensor("EagerPyFunc:1", dtype=int32, device=/job:localhost/replica:0/task:0)


In [649]:
next_batch = data_gen.as_numpy_iterator().next()
print(next_batch)

InvalidArgumentError: Cannot batch tensors with different shapes in component 0. First element had shape [726,2] and element 1 had shape [594,2].

In [588]:
fil9_sents[:2]

['anarchism originated term abuse first used early working class radical including digger english revolution sans culotte french revolution',
 'whilst term still used pejorative way describe act used violent mean destroy organization society also taken positive label self defined anarchist']

In [589]:
for i, batch in enumerate(data_gen):
    print(i, batch)

0 tf.Tensor(
[ 8578  3164   114  2777    23    40   104   704   313  2148   118 18521
   111   896 11135 68524   141   896], shape=(18,), dtype=int32)
1 tf.Tensor(
[3503  114  159   40 9957  151 1662  264   40 2768  275 3044  499  344
   12  482 1761 1412  622  991 4619], shape=(21,), dtype=int32)
2 tf.Tensor([  163  8578  1269   415   268 49480  1602   636    97], shape=(9,), dtype=int32)
3 tf.Tensor([8578  210  871  832 1602 8088 4254  130 7334 1820  275], shape=(11,), dtype=int32)
4 tf.Tensor([8578   12 1130  522  393  365 2713 7085 9867 1203  640   17], shape=(12,), dtype=int32)
5 tf.Tensor(
[  163 11243  4619    70  6720  4626 21995 53919   383 24082   635  9867
   344], shape=(13,), dtype=int32)
6 tf.Tensor(
[   99  1365  9867   210   503 19072   538  1203  4619  2713   393   802
   149   426  6365   620  3810   110  4407  1228   622  7037], shape=(22,), dtype=int32)
7 tf.Tensor([8578 1488  991 4619   12 1105 1761 2141  965 4068  303  344], shape=(12,), dtype=int32)
8 tf.Tensor(


In [883]:
from importlib import reload
import SGNSDataLoader
reload(SGNSDataLoader)
from SGNSDataLoader import SGNSDataLoader

In [884]:
data_loader = SGNSDataLoader(
    texts=[fil9_sents[0]],
    tokenizer=tokenizer,
    batch_size=10,
    n_epochs=10,
    sampling_window_size=2,
    num_negative_samples=10,
    one_hot_skipgram_pairs=True
)
data_gen = data_loader()

In [885]:
for batch in data_gen:
    #print(batch.numpy().shape)
    print(batch)
    break

({'input_target': <tf.Tensor: shape=(10, 840421), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, 'input_context': <tf.Tensor: shape=(10, 840421), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>}, <tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>)


In [886]:
# Test training
model = build_word2vec_model(len(tokenizer.word_index), 300)

In [887]:
model.fit(data_gen, epochs=10)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [868]:
model.layers[2].weights[0].numpy().shape

(840421, 300)