In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Embedding, SpatialDropout1D, Bidirectional
import pandas as pd
import numpy as np
import youtokentome as yttm
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
from tensorflow.keras.mixed_precision import experimental as mixed_precision

In [3]:
physical_devices = tf.config.list_physical_devices()
print(physical_devices)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]


In [4]:
tf.config.experimental.set_memory_growth(physical_devices[2], True)

In [5]:
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

# Reading Data

In [6]:
data = pd.read_csv('train.csv').drop(columns=['uuid'])

In [7]:
data = data[data.comment_text.notnull()]

In [8]:
data = data[data.comment_text.map(len) > 1]

In [9]:
train, test = train_test_split(data, test_size=0.2)

In [10]:
# with open('train.txt', 'w') as target:
#     for row in train['comment_text']:
#         target.write(str(row).strip() + '\n')

# BPE

In [11]:
bpe_model = yttm.BPE('bpe.model')

In [12]:
train['bpe'] = train.comment_text.apply(lambda row: np.array(bpe_model.encode(row, dropout_prob=0.1)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
lens = train.bpe.apply(lambda row: len(row))

In [14]:
np.ceil(np.mean(lens) + 4 * np.std(lens))

625.0

In [15]:
train = train[train.bpe.apply(lambda row: len(row)) <= 650]

In [16]:
train_data = tf.ragged.constant(train.bpe)

In [17]:
train_data = tf.data.Dataset.from_tensor_slices((train_data, train.toxicity))

In [18]:
# train_data.apply(tf.data.experimental.bucket_by_sequence_length(
#                                 element_length_func=get_len, 
#                                 bucket_boundaries=[250], 
#                                 bucket_batch_sizes=[64, 500]))

In [19]:
test['bpe'] = test.comment_text.apply(lambda row: bpe_model.encode(row, dropout_prob=0.1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
lens = test.bpe.apply(lambda row: len(row))

In [21]:
np.ceil(np.mean(lens) + 4 * np.std(lens))

603.0

In [22]:
test = test[test.bpe.apply(lambda row: len(row)) <= 650]

In [23]:
test_data = tf.ragged.constant(test.bpe)

In [24]:
test_data = tf.data.Dataset.from_tensor_slices((test_data, test.toxicity))

# Model

In [31]:
keras.backend.clear_session()

In [32]:
model = Sequential()
model.add(Embedding(input_dim=10000,
                    output_dim=256,
                    embeddings_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02, seed=None),
                    trainable=True,
                    mask_zero=True,
                    name='Embeddings'))
model.add(SpatialDropout1D(0.1, name='Emb_drop'))
model.add(Bidirectional(LSTM(256,
                             return_sequences=True,
                             recurrent_dropout=0.5), 
                        name='Bi-LSTM_1'))
# model.add(Dropout(0.25, name='Dropout_1'))
model.add(Bidirectional(LSTM(256,
                             recurrent_dropout=0.5), 
                        name='Bi-LSTM_2'))
model.add(Dropout(0.25, name='Dropout_2'))
model.add(Dense(512, name='Dense_1'))
model.add(Dropout(0.5, name='Dropout_3'))
model.add(Dense(256, name='Dense_2'))
model.add(Dense(6, activation='relu', dtype='float32', name='Output'))

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embeddings (Embedding)       (None, None, 256)         2560000   
_________________________________________________________________
Emb_drop (SpatialDropout1D)  (None, None, 256)         0         
_________________________________________________________________
Bi-LSTM_1 (Bidirectional)    (None, None, 512)         1050624   
_________________________________________________________________
Bi-LSTM_2 (Bidirectional)    (None, 512)               1574912   
_________________________________________________________________
Dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
Dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
Dropout_3 (Dropout)          (None, 512)               0

In [35]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [38]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-3)
optimizer = mixed_precision.LossScaleOptimizer(optimizer, loss_scale='dynamic')

In [39]:
@tf.function(experimental_relax_shapes=True)
def train_step(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_object(y, predictions)
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in zip(scaled_gradients, model.trainable_variables)]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # optimizer.apply_gradients(gradients)
    return loss, predictions

In [40]:
@tf.function(experimental_relax_shapes=True)
def test_step(x):
    predictions = model(x, training=False)
    loss = loss_object(y, predictions)
    return loss, predictions

In [60]:
## Note: Rerunning this cell uses the same model variables

# Keep results for plotting
train_loss_results = []
train_accuracy_results = []
test_loss_results = []
test_accuracy_results = []

EPOCHS = 1
BATCH_SIZE = 32

metrics_names = ['loss','acc'] 
# Progress Bars
# epochs_bar = tf.keras.utils.Progbar(EPOCHS)


for epoch in range(EPOCHS):
    train_bar = tf.keras.utils.Progbar(np.floor(int(tf.data.experimental.cardinality(train_data)/BATCH_SIZE))+1,
                                           stateful_metrics=metrics_names)
    test_bar = tf.keras.utils.Progbar(np.floor(int(tf.data.experimental.cardinality(test_data)/BATCH_SIZE))+1,
                                          stateful_metrics=metrics_names)
    train_loss_avg = tf.keras.metrics.Mean() # Avg loss
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy() 
    
    test_loss_avg = tf.keras.metrics.Mean()
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    
    print(f'Epoch {epoch+1}/{EPOCHS}')

    # Training loop - using batches of 32
    for batch in train_data.shuffle(buffer_size=1024).batch(BATCH_SIZE): # Dataset (features, label)
        X = batch[0].to_tensor() # RaggedTensor -> Sparse Tensor, Post Pad by the longest element
        y = batch[1]
        # Optimize the model
        # loss_value, grads = grad(model, X, y) # compute loss and grad
        # grads = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in zip(grads, model.trainable_variables)] # Clip
        # optimizer.apply_gradients(grads) # Apply grad
        loss, predictions = train_step(X, y)
        
        # Track progress
        train_loss_avg.update_state(loss)  # Add current batch loss
        
        # Compare predicted label to actual label
        # training=True is needed only if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        train_accuracy.update_state(y, predictions)
        values=[('loss', train_loss_avg.result()), ('acc', train_accuracy.result())]
        train_bar.add(1, values=values)
        
    for batch in test_data.shuffle(buffer_size=1024, reshuffle_each_iteration=True).batch(BATCH_SIZE):
        # Optimize the model
        X = batch[0].to_tensor()
        y = batch[1]
        # loss_value = loss(model, X, y, training=False)
        loss, predictions = train_step(X, y)
        
        # Track progress
        test_loss_avg.update_state(loss)  # Add current batch loss
        # Compare predicted label to actual label
        # training=True is needed only if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        test_accuracy.update_state(y, predictions)
        values=[('loss', test_loss_avg.result()), ('acc', test_accuracy.result())]
        test_bar.add(1, values=values)
    
    # End epoch
    train_loss_results.append(train_loss_avg.result())
    train_accuracy_results.append(train_accuracy.result())
    test_loss_results.append(test_loss_avg.result())
    test_accuracy_results.append(test_accuracy.result())

Epoch 1/1


In [61]:
model.save_weights('toxicity_rnn_weights.h5')

In [62]:
import json
config = model.to_json()
with open('config.json', 'w') as f:
    json.dump(config, f)