In [1]:
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
)
from tensorflow.keras.optimizers import Adam

2022-12-20 18:46:26.802095: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# List what the downloaded competition archive contains before extracting
# anything from it.
with zipfile.ZipFile('data/jigsaw-toxic-comment-classification-challenge.zip') as archive:
    member_names = archive.namelist()
print(member_names)

['sample_submission.csv.zip', 'test.csv.zip', 'test_labels.csv.zip', 'train.csv.zip']


In [3]:
# The download is a zip of zips: the outer archive holds 'train.csv.zip',
# which in turn holds 'train.csv'.
with zipfile.ZipFile('data/jigsaw-toxic-comment-classification-challenge.zip') as datazip:
    # Open the nested archive's byte stream under its own `with` so the member
    # handle is closed deterministically rather than left to the garbage
    # collector (closing `trainzip` does not close a file object passed to it).
    with datazip.open('train.csv.zip') as nestedzip:
        with zipfile.ZipFile(nestedzip) as trainzip:
            with trainzip.open('train.csv') as traincsv:
                traindf = pd.read_csv(traincsv)

In [4]:
# Preview the first three rows to check the layout: an id, the raw comment
# text, and one column per toxicity label.
traindf.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [5]:
# Use the comment id as the row index. Guarded and non-mutating so that
# re-running this cell is a no-op instead of raising KeyError: once 'id' has
# become the index it is no longer a column that set_index can find.
if traindf.index.name != 'id':
    traindf = traindf.set_index('id')

In [6]:
# Everything except the raw text is a label column.
featuredf = traindf.drop(columns='comment_text')

n_samples = len(featuredf)
n_labels = len(featuredf.columns)
positive_counts = featuredf.sum()
negative_counts = n_samples - positive_counts

# Inverse-frequency weights per label: the rarer an outcome, the larger its
# weight. Both sets share the same n_samples / n_labels scale factor.
positive_weights = (1 / positive_counts) * (n_samples / n_labels)
negative_weights = (1 / negative_counts) * (n_samples / n_labels)

print(f'positive:\n{positive_weights}\nnegative:\n{negative_weights}')

positive:
toxic             1.738928
severe_toxic     16.674086
obscene           3.147730
threat           55.638424
insult            3.376307
identity_hate    18.928944
dtype: float64
negative:
toxic            0.184334
severe_toxic     0.168349
obscene          0.175985
threat           0.167167
insult           0.175321
identity_hate    0.168147
dtype: float64


For brevity, let ``x = logits``, and ``z = labels``. Let
``a = positive_weights`` and ``b = negative_weights``, where
the weights will cause the model to "pay extra attention" (``> 1``) or
"less attention" (``< 1``) to examples from each class.

The weighted logistic loss is:

```
  a * z * -log(sigmoid(x)) + b * (1 - z) * -log(1 - sigmoid(x))
= a * z * -log(1 / (1 + exp(-x))) + b * (1 - z) * -log(exp(-x) / (1 + exp(-x)))
= a * z * log(1 + exp(-x)) + b * (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
= a * z * log(1 + exp(-x)) + b * (1 - z) * (x + log(1 + exp(-x)))
```

Rearranging to group ``log`` terms:

```
  a * z * log(1 + exp(-x)) + b * (1 - z) * (x + log(1 + exp(-x)))
= b * (1 - z) * x + (a * z + b * (1 - z)) * log(1 + exp(-x))
```

To simplify expressions, let ``A = a * z`` and ``B = b * (1 - z)``:

```
  b * (1 - z) * x + (a * z + b * (1 - z)) * log(1 + exp(-x))
= B * x + (A + B) * log(1 + exp(-x))
```

For ``x < 0``, to avoid overflow in ``exp(-x)``, we can reformulate the
above to:

```
  B * x + (A + B) * log(1 + exp(-x))
= B * x + (A + B) * log(exp(-x) * (1 + exp(x)))
= B * x + (A + B) * (-x + log(1 + exp(x)))
= -A * x + (A + B) * log(1 + exp(x))
```

So our piecewise stable definition:

```
=  B * x + (A + B) * log(1 + exp(-x))  # when x >= 0
= -A * x + (A + B) * log(1 + exp( x))  # when x < 0
```

Combining the two:

```
= -A * x + (A + B) * max(x, 0) + (A + B) * log(1 + exp(-abs(x)))
```

In [7]:
def loss_fn(
    labels: tf.Tensor,
    logits: tf.Tensor,
    positive_weights: np.ndarray = positive_weights,
    negative_weights: np.ndarray = negative_weights,
):
    """Numerically stable, class-weighted sigmoid cross entropy from `logits`.

    With ``x = logits``, ``z = labels``, ``A = positive_weights * z`` and
    ``B = negative_weights * (1 - z)``, every element is evaluated as::

        -A * x + (A + B) * max(x, 0) + (A + B) * log(1 + exp(-abs(x)))

    so ``exp`` only ever receives a non-positive argument.

    The default weights are the module-level ``positive_weights`` and
    ``negative_weights`` as they were when this cell was executed; re-run
    this cell after recomputing them.

    Args:
        labels: A :class:`tf.Tensor` with the same shape as `logits`. It is
            cast to the dtype of `logits`.
        logits: A :class:`tf.Tensor` of type :class:`tf.float32` or
            :class:`tf.float64`. Any real number.
        positive_weights: A :class:`np.ndarray` whose size matches the ``-1``
            dimension of `logits`. The weight applied to the positive
            (``label == 1``) term of each class.
        negative_weights: A :class:`np.ndarray` whose size matches the ``-1``
            dimension of `logits`. The weight applied to the negative
            (``label == 0``) term of each class.

    Returns:
        The class-weighted sigmoid cross entropy loss, averaged over the
        last (class) axis.
    """
    dtype = logits.dtype
    labels = tf.cast(labels, dtype)
    pos = tf.cast(positive_weights, dtype)
    neg = tf.cast(negative_weights, dtype)

    # max(x, 0) and -abs(x), both selected with the same mask.
    zeros = tf.zeros_like(logits)
    is_nonnegative = logits >= zeros
    relu_logits = tf.where(is_nonnegative, logits, zeros)
    neg_abs_logits = tf.where(is_nonnegative, -logits, logits)

    # (A + B): the weight that applies to each element given its label.
    weight = pos * labels + neg * (1 - labels)

    linear_term = -pos * labels * logits
    hinge_term = weight * relu_logits
    softplus_term = weight * tf.math.log1p(tf.math.exp(neg_abs_logits))

    return tf.reduce_mean(linear_term + hinge_term + softplus_term, axis=-1)

In [8]:
# Tiny hand-made batch (2 examples x 2 classes) used to sanity-check loss_fn
# against TensorFlow's built-in, unweighted sigmoid cross entropy.
logits = np.asarray([[-1.0, 1.0], [-1.5, 1.5]], dtype=np.float32)
labels = np.asarray([[1, 0], [1, 1]], dtype=np.int64)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=labels.astype(logits.dtype),
    logits=logits,
)
ce_mean = tf.reduce_mean(cross_entropy, axis=-1)
ce_mean

2022-12-20 18:46:30.398806: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-20 18:46:30.403772: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-20 18:46:30.404020: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-20 18:46:30.405147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, 

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.3132617, 0.9514133], dtype=float32)>

In [9]:
# With every weight equal to 1 the weighted loss should reproduce the
# unweighted reference value `ce_mean`.
default_weights = np.ones(2)
loss_fn(
    labels,
    logits,
    positive_weights=default_weights,
    negative_weights=default_weights,
)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.3132617, 0.9514133], dtype=float32)>

In [10]:
# Reshape the frame into the two columns the pipeline uses: the raw text and
# a per-row list holding the value of every label column.
label_rows = traindf.drop(columns='comment_text').values.tolist()
train_frame = pd.DataFrame(data={
    'text': traindf.comment_text,
    'label': label_rows,
})
trainds = Dataset.from_pandas(train_frame)

In [11]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_dataset(data):
    """Tokenize the ``'text'`` entry of `data` with the module-level `tokenizer`.

    Args:
        data: A mapping with a ``'text'`` entry: a single string, or a list
            of strings when called through a batched ``Dataset.map``.

    Returns:
        The tokenizer output. ``Dataset.map`` adds each of its keys to the
        dataset as a column.
    """
    # NOTE(review): padding='max_length' pads every example to a fixed length
    # at tokenization time. Left unchanged so the model sees the same inputs.
    return tokenizer(data['text'], padding='max_length', truncation=True)

# batched=True passes lists of texts to the tokenizer instead of calling it
# once per example. The resulting columns are the same; only the number of
# tokenizer calls changes.
trainds = trainds.map(tokenize_dataset, batched=True)

100%|██████████| 159571/159571 [00:55<00:00, 2862.90ex/s]


In [12]:
# One output per label column: every column of traindf except the comment text.
label_count = len(traindf.columns) - 1

model = TFAutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    problem_type='multi_label_classification',
    num_labels=label_count,
)

2022-12-20 18:47:29.329069: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpo

In [13]:
# Wrap the tokenized dataset as a shuffled, batched dataset for `model.fit`.
tf_trainds = model.prepare_tf_dataset(
    trainds,
    batch_size=4,
    shuffle=True,
    tokenizer=tokenizer,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Train with Adam at a small fine-tuning learning rate and the class-weighted
# loss defined earlier in this notebook.
optimizer = Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=loss_fn)

In [15]:
# Print the layer list and parameter counts of the compiled model.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  4614      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,958,086
Trainable params: 66,958,086
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Fine-tune for three epochs over tf_trainds; the returned History object is
# the cell's displayed value.
model.fit(tf_trainds, epochs=3)

Epoch 1/3
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2022-12-20 18:47:38.755871: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x4934c570 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-12-20 18:47:38.755918: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2022-12-20 18:47:38.761973: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-12-20 18:47:38.864769: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7faab803c1c0>

In [17]:
# Persist the fine-tuned model under a directory named after the run settings
# (batch size 4, 3 epochs).
# NOTE(review): only the model is saved here; the tokenizer is not. Confirm
# whether it should be saved alongside for later loading.
model.save_pretrained('toxicity-4batch-3epoch')