This notebook is used to generate the trained models for the DiscordNLP bot. 

In [1]:
import numpy as np
import pandas
import time
import re
import math
import pickle
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

Load training data. Change the filename, encoding and column titles accordingly. These are currently set for the cleaned Sentiment 140 dataset. 

In [2]:
cols = ["sentiment", "text"]
train_data = pandas.read_csv(
    "data/training_data_short.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)
data_clean = train_data["text"].tolist()

This creates the volcabulary tokenizer. This is suuuuper time consuming, so it is saved out as a pickle and can be loaded back in for future runs. It may be wise to recreate the tokenizer for each dataset, but so long as the data set is English, it should be fine to reuse it.

In [3]:
try:
    with open("models/tokenizer.pickle", "rb") as f:
        tokenizer = pickle.load(f)
except:
    tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        data_clean, target_vocab_size=2**16
    )
    with open("models/tokenizer.pickle", "wb") as f:
        pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

Sentence encoding, padding, and data splitting between training and validation.

In [4]:
data_input = [tokenizer.encode(sentence) for sentence in data_clean]

# pad sentences with 0's to match the longest sentence in the data set 
max_sentence_length = max([len(sentence) for sentence in data_input])
data_input = tf.keras.preprocessing.sequence.pad_sequences(
    data_input, value=0, padding="post", maxlen=max_sentence_length
)
data_labels = train_data["sentiment"].to_numpy()
test_idx = np.random.randint(0, math.floor(len(data_clean)/2), max(math.floor(len(data_clean)/200), 100))
test_idx = np.concatenate((test_idx, test_idx+math.floor(len(data_clean)/2)))
test_inputs = data_input[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_input, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [5]:
class DCNN(tf.keras.Model):  
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size, emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D() # no training variable so we can
                                             # use the same layer for each
                                             # pooling step
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

In [6]:
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_data["sentiment"]))
DROPOUT_RATE = 0.2
BATCH_SIZE = 32
NB_EPOCHS = 5

Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [7]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [8]:
checkpoint_path = "chkpts"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)


In [9]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS,
         callbacks=[checkpoint_callback])

Epoch 1/5
Epoch 00001: saving model to chkpts
Epoch 2/5
Epoch 00002: saving model to chkpts
Epoch 3/5
Epoch 00003: saving model to chkpts
Epoch 4/5
Epoch 00004: saving model to chkpts
Epoch 5/5
Epoch 00005: saving model to chkpts


<tensorflow.python.keras.callbacks.History at 0x7f595e963128>

In [14]:
ckpt_manager.save()
print(Dcnn.get_config())

NotImplementedError: 

In [11]:
cnn_model = tf.keras.models.load_model("smol_completed_model_jupyter")

In [12]:
print(cnn_model(np.array([tokenizer.encode("You are so funny")]), training=False).numpy())

ValueError: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * Tensor("inputs:0", shape=(1, 4), dtype=int64)
    * False
  Keyword arguments: {}

Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (2 total):
    * TensorSpec(shape=(None, 47), dtype=tf.int32, name='inputs')
    * False
  Keyword arguments: {}

Option 2:
  Positional arguments (2 total):
    * TensorSpec(shape=(None, 47), dtype=tf.int32, name='inputs')
    * True
  Keyword arguments: {}

Option 3:
  Positional arguments (2 total):
    * TensorSpec(shape=(None, 47), dtype=tf.int32, name='input_1')
    * False
  Keyword arguments: {}

Option 4:
  Positional arguments (2 total):
    * TensorSpec(shape=(None, 47), dtype=tf.int32, name='input_1')
    * True
  Keyword arguments: {}