In [36]:
# Imports
from os.path import join as join_path
import numpy as np
rng_seed = 368
np.random.seed(rng_seed)
import pandas as pd
from tqdm.notebook import tqdm

import tensorflow as tf
tf.random.set_seed(rng_seed)
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Reshape, dot, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt

# Custom files
from importlib import reload
import utils
reload(utils)

import tensorflow_setup
from utils import load_result, TokenizedSkipgramDataGenerator

In [2]:
tensorflow_setup.init(shutup=True)

Enabled dynamic gpu memory
Ran tensorflow_shutup


## Load and prepare dataset

In [3]:
# Load tokenized english texts (vocab size=10000)
vocab_size = 10000
sampling_window_size = 10
negative_samples = 5
cord_data_tokenized_path = join_path('data', f'cord_19_texts_{vocab_size}_tokenized_en.p')
cord_data, cord_word_to_idx, cord_idx_to_word = load_result(cord_data_tokenized_path)

In [4]:
# Split data into train/val/test
cord_data_train, cord_data_val = train_test_split(cord_data, test_size=0.02, random_state=rng_seed)
cord_data_val, cord_data_test = train_test_split(cord_data_val, test_size=0.5, random_state=rng_seed)

In [None]:
print(f'Train size: {len(cord_data_train)}')
print(f'Validation size: {len(cord_data_val)}')
print(f'Test size: {len(cord_data_test)}')

In [50]:
# Setup data generator
train_data_gen = TokenizedSkipgramDataGenerator(
    cord_data_train,
    vocab_size,
    sampling_window_size,
    negative_samples,
    corpus_batch_size=10,
    pairs_batch_size=1024
)
val_data_gen = TokenizedSkipgramDataGenerator(
    cord_data_val,
    vocab_size,
    sampling_window_size,
    negative_samples,
    corpus_batch_size=10,
    pairs_batch_size=1024
)

## Create word2vec model and train it

In [51]:
# Create word2vec model using Keras
def build_word2vec_model(vocab_size: int, vector_dim: int):
    '''Build word2vec model using Keras/Tensorflow
    
    Args:
        vocab_size: Size of the vocabulary
        vector_dim: Dimension of the embedding weight matrix (300 is typically used)
    
    returns:
        model: Word2vec model
    '''
    # Input to network
    input_target = Input((1,), name='input_target')
    input_context = Input((1,), name='input_context')

    # Embedding layer
    embedding = Embedding(vocab_size + 1, vector_dim, input_length=1, name='embedding')
    target = embedding(input_target)
    target = Reshape((vector_dim, 1), name='target_word_vector')(target)
    context = embedding(input_context)
    context = Reshape((vector_dim, 1), name='context_word_vector')(context)
    
    # Compute similarity (dot product)
    dot_product = dot([target, context], axes=1, normalize=False, name='dot_product')
    dot_product = Reshape((1,), name='dot_product_reshape')(dot_product)
    
    # Sigmoid activation (output)
    output = Dense(1, activation='sigmoid', name='sigmoid_activation')(dot_product)
    
    # Create model
    model = Model(inputs=[input_target, input_context], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [52]:
word_embedding_dim = 300
model = build_word2vec_model(vocab_size, word_embedding_dim)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_target (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_context (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 300)       3000300     input_target[0][0]               
                                                                 input_context[0][0]              
__________________________________________________________________________________________________
target_word_vector (Reshape)    (None, 300, 1)       0           embedding[0][0]            

In [53]:
# Train model
model.fit(
    train_data_gen,
    validation_data=val_data_gen,
    epochs=10,
    verbose=1
)

Train for 7852 steps, validate for 4583 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0968313110>

In [54]:
test_data_gen = TokenizedSkipgramDataGenerator(
    cord_data_test,
    vocab_size,
    sampling_window_size,
    negative_samples,
    corpus_batch_size=10,
    pairs_batch_size=1024
)

In [56]:
y_pred = model.predict(test_data_gen)
y_true = test_data_gen.skipgram_labels
print(f'Test AUC: {roc_auc_score(y_true[:len(y_pred)], y_pred)}')

Test AUC: 0.49986399278383753
