Make this notebook deterministic.

In [1]:
RANDOM_SEED = 0

# Python RNG
import random
random.seed(RANDOM_SEED)

# Numpy RNG
import numpy as np
np.random.seed(RANDOM_SEED)

# TF RNG
import tensorflow as tf
from tensorflow.python.framework import random_seed
random_seed.set_seed(RANDOM_SEED)

Import the necessary modules.

In [2]:
from xswem.model import XSWEM
!pip install datasets
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.data import Dataset



Load and shuffle the dataset. We keep 10% of the training set for validation.

In [3]:
ag_news = load_dataset('ag_news')
ag_news = ag_news.shuffle({"train":RANDOM_SEED,"test":RANDOM_SEED})
ag_news["train"] = ag_news["train"].train_test_split(test_size=0.1,seed=RANDOM_SEED)
ag_news_train, ag_news_valid = ag_news["train"]["train"], ag_news["train"]["test"]
X, y = ag_news_train["text"], ag_news_train["label"]
X_valid, y_valid = ag_news_valid["text"], ag_news_valid["label"]
ag_news_test = ag_news["test"]
X_test, y_test = ag_news_test["text"], ag_news_test["label"]

Using custom data configuration default
Reusing dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a/cache-9edf62a6acdef7c2.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a/cache-005431702e078c3e.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a/cache-8fa4e42aef0940f7.arrow and /root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a/cache-0aa5eb9aa593fda3.arrow


Build the tokenizer.

In [4]:
NUM_WORDS = 16000
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token="UNK")
tokenizer.fit_on_texts(X)
vocab_map = {i+1: tokenizer.index_word[i+1] for i in range(NUM_WORDS)}
output_map = {0: "World", 1: "Sport", 2: "Business", 3: "Tech"}

Build the dataset pipeline.

In [5]:
BATCH_SIZE = 32
NUM_LABELS = len(output_map)

train_dataset = Dataset.from_tensor_slices((X,y))
valid_dataset = Dataset.from_tensor_slices((X_valid,y_valid))
test_dataset = Dataset.from_tensor_slices((X_test,y_test))

# shuffle the train datasets
train_dataset = train_dataset.shuffle(BATCH_SIZE*2)

# tokenize the text and one hot encode the labels
# we only keep unique tokens as XSWEM is invariant to token frequency and order
tokenize = lambda text, label: (tf.py_function(lambda text: np.unique(tokenizer.texts_to_sequences([str(text.numpy())])[0]), inp=[text], Tout=tf.int32), tf.one_hot(label,NUM_LABELS))
train_dataset = train_dataset.map(tokenize,num_parallel_calls=tf.data.experimental.AUTOTUNE)
valid_dataset = valid_dataset.map(tokenize,num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.map(tokenize,num_parallel_calls=tf.data.experimental.AUTOTUNE)

# pre-fetch so that GPU spends less time waiting
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
valid_dataset = valid_dataset.prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# padded batch allows us to handle varying sentence lengths
train_dataset = train_dataset.padded_batch(BATCH_SIZE,padded_shapes=([None],[NUM_LABELS]))
valid_dataset = valid_dataset.padded_batch(BATCH_SIZE,padded_shapes=([None],[NUM_LABELS]))
test_dataset = test_dataset.padded_batch(BATCH_SIZE,padded_shapes=([None],[NUM_LABELS]))

Build XSWEM model.

In [6]:
model = XSWEM(128, "softmax", vocab_map, output_map, mask_zero=True, dropout_rate=0.5)
optimizer = tf.keras.optimizers.SGD(learning_rate=1)
model.compile(optimizer, loss="categorical_crossentropy", metrics="accuracy")

Train XSWEM model.

In [7]:
model.fit(train_dataset, validation_data=valid_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3e8ee42be0>

Test XSWEM model.

In [8]:
model.evaluate(test_dataset)



[0.2812957763671875, 0.9115789532661438]