In [1]:
import keras_nlp
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from tensorflow import keras
import tensorflow_addons as tfa
import keras_nlp
keras.utils.set_random_seed(42)

In [2]:
BATCH_SIZE = 64
EPOCHS = 3
MAX_SEQUENCE_LENGTH = 512
VOCAB_SIZE = 300000

EMBED_DIM = 128
INTERMEDIATE_DIM = 512

In [3]:
train = pd.read_csv("../../dataset/goodreads_train.csv")
test = pd.read_csv("../../dataset/goodreads_test.csv")
vocabulary = np.load('../../vocabulaires/voc_without_std_word_count_5.npy', allow_pickle=True)
rating = keras.utils.to_categorical(train['rating'], num_classes=6)

"I love Stephenson - and this was another hit - absolutely loved it. The great thing about a good Stephenson book is it makes you think about the future in new ways, and this book was no exception. \n It was really two books, and I certainly didn't see the second one coming. It starts out in modern times and then someone blows up the moon. We don't have time to find out who, as within a few years the fragments of the moon cause the worst asteriod shower earth has ever seen and wipe out all life in earth. We have time to send 1,500 people up into space - and this is their story. \n The use of robots throughout the books was fascinating to me. Stephenson has clearly looked 10-20 years into our future and correctly predicted how it will go. From robot workers in space, nano-bots, nano-robot weapons, and more - we get a vivid portrayal of how robots might be a part of our future lives. \n I thought the focus on use of whip technology in space was interesting. And of course, the whole notio

In [44]:
voc = []
with open("word_piece_vocabulary") as f:
    for line in f:
        voc.append(line[:-1])

In [54]:
one_data = train['review_text'][5:15]

In [61]:
inputs = keras.Input(shape=(1,), dtype=tf.string)
vectorize_layer = keras_nlp.tokenizers.WordPieceTokenizer(voc, 312, lowercase=True,
                                                          strip_accents=True)(inputs)
print(vectorize_layer.shape)
#vectorize_layer = keras.layers.Lambda(low_dim)(vectorize_layer)
x = keras_nlp.layers.TokenAndPositionEmbedding(len(voc), 312, 300)(vectorize_layer)
print(x.shape)
model = keras.Model(inputs=[inputs], outputs=vectorize_layer)

(None, None, 312)
(None, None, 312, 300)


In [62]:
out = model.predict(one_data)
np.shape(out)



(10, 1, 312)

In [63]:
vectorize_layer = keras.layers.TextVectorization(
            standardize='lower_and_strip_punctuation',
            split='whitespace',
            output_mode='int',
            output_sequence_length=312,
            vocabulary=voc
        )(inputs)
x = keras_nlp.layers.TokenAndPositionEmbedding(len(voc), 312, 300)(vectorize_layer)
model = keras.Model(inputs=[inputs], outputs=x)

In [64]:
out = model.predict(one_data)
np.shape(out)



(10, 312, 300)

In [4]:
data = tf.data.Dataset.from_tensor_slices(train['review_text'])

In [5]:
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(data, 300000, vocabulary_output_file='word_piece_vocabulary',lowercase=True, strip_accents=True)

In [10]:
token = keras_nlp.tokenizers.WordPieceTokenizer('word_piece_vocabulary',300,lowercase=True, strip_accents=True)

In [11]:
token

<keras_nlp.tokenizers.word_piece_tokenizer.WordPieceTokenizer at 0x1a5a48555a0>

In [4]:
# def train_word_piece(ds, vocab_size, reserved_tokens):
#     #word_piece_ds = ds.unbatch().map(lambda x, y: x)
#     vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
#         ds.tolist(),
#         vocabulary_size=vocab_size,
#         reserved_tokens=reserved_tokens,
#     )
#     return vocab

In [5]:
# reserved_tokens = ["[PAD]", "[UNK]"]
# train_sentences = [element[0] for element in train['review_text']]
# vocab = train_word_piece(train['review_text'], VOCAB_SIZE, reserved_tokens)

In [6]:
# print("Tokens: ", vocab[100:110])

In [7]:
# tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
#     vocabulary=vocab,
#     lowercase=False,
#     sequence_length=MAX_SEQUENCE_LENGTH,
# )

In [8]:
# input_sentence_ex = train_ds.take(1).get_single_element()[0][0]
# input_tokens_ex = tokenizer(input_sentence_ex)
#
# print("Sentence: ", input_sentence_ex)
# print("Tokens: ", input_tokens_ex)
# print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex))

In [9]:
# input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids")
inputs1 = keras.Input(shape=(1,), dtype=tf.string)
vectorize_layer = keras.layers.TextVectorization(
            standardize='lower_and_strip_punctuation',
            split='whitespace',
            output_mode='int',
            output_sequence_length=512,
            vocabulary=vocabulary
        )(inputs1)

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(vectorize_layer)

x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)


x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Dropout(0.1)(x)
outputs = keras.layers.Dense(6, activation="sigmoid")(x)

fnet_classifier = keras.Model(inputs1, outputs, name="fnet_classifier")

In [10]:
fnet_classifier.summary()
def scheduler(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

if not os.path.exists(f"logs/fnet_classifier"):
    os.mkdir(f"logs/fnet_classifier")
if not os.path.exists(f"checkpoint/fnet_classifier"):
    os.mkdir(f"checkpoint/fnet_classifier")

chekpoint = keras.callbacks.ModelCheckpoint(f'checkpoint/fnet_classifier/', save_weights_only=True,
monitor='val_f1_score',
mode='max',
save_best_only=True)
callbacks = []
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"logs/fnet_classifier")

fnet_classifier.compile(optimizer=keras.optimizers.Adamax(),
                           loss=keras.losses.categorical_crossentropy,
                           metrics=[keras.metrics.categorical_accuracy,
                                    tfa.metrics.F1Score(num_classes=6, average='weighted')]
                           )
fnet_classifier.fit(train['review_text'], rating, epochs=6, batch_size=500,
                                 validation_split=0.2, #class_weight=class_weight,
                                 callbacks=None)

Model: "fnet_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 512)              0         
 torization)                                                     
                                                                 
 token_and_position_embeddin  (None, 512, 128)         38465536  
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 f_net_encoder (FNetEncoder)  (None, 512, 128)         132224    
                                                                 
 f_net_encoder_1 (FNetEncode  (None, 512, 128)         132224    
 r)                                                

<keras.callbacks.History at 0x24eb86d9960>