In [1]:
from transformers import TFBertPreTrainedModel, TFBertMainLayer, BertTokenizer
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
print(tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [3]:
from transformers import TFBertPreTrainedModel, TFBertMainLayer

from transformers.modeling_tf_utils import (
    TFQuestionAnsweringLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras_serializable,
    shape_list,
)

class TFBertForMultilabelClassification(TFBertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super(TFBertForMultilabelClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.bert = TFBertMainLayer(config, name='bert')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='classifier',
                                                activation='sigmoid')#--------------------- sigmoid激活函数

    def call(self, inputs, **kwargs):
        outputs = self.bert(inputs, **kwargs)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
        logits = self.classifier(pooled_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        return outputs  # logits, (hidden_states), (attentions)

In [4]:
def encode_data(dataSet):
    inputs = tokenizer(dataSet['comment_text'].tolist(), max_length =128, padding='max_length', truncation=True,\
                   return_tensors='tf')
    label_list = [dataSet[col].tolist() for col in dataSet.iloc[0:1,2:].columns]
    if label_list:
        result = tf.data.Dataset.from_tensor_slices(( dict((k,v) for k, v in inputs.items()),
                                                     np.array(label_list).T))
    else:
        result = tf.data.Dataset.from_tensor_slices(( dict((k,v) for k, v in inputs.items()), ))     
                                                                
    return result

In [5]:
# parameters
train_path = "data/Toxic Comment Classification Challenge/train.csv"
test_path = "data/Toxic Comment Classification Challenge/test.csv"
model_path = 'D:/My_Document/Data_science/NLP/demo/Transformers/model_dirs/bert-base-uncased'
# parameters
max_length = 128
batch_size = 2
learning_rate = 1e-5
number_of_epochs = 2
num_classes = 6 # 类别数

In [6]:
train_val_data = pd.read_csv(train_path)
print(train_val_data.shape)
test_data = pd.read_csv(test_path)
print(test_data.shape)

(159571, 8)
(153164, 2)


In [7]:
# read data
train_val_data = pd.read_csv(train_path)[:1000]
TRAIN_VAL_RATIO = 0.9
LEN = train_val_data.shape[0]
SIZE_TRAIN = int(TRAIN_VAL_RATIO*LEN)
# train data
train_data = train_val_data[:SIZE_TRAIN]
# val data
val_data = train_val_data[SIZE_TRAIN:]
# test data
test_data = pd.read_csv(test_path)[:1000]

tokenizer = BertTokenizer.from_pretrained(model_path)
# train dataset
ds_train_encoded = encode_data(train_data).shuffle(100).batch(batch_size)
# val dataset
ds_val_encoded = encode_data(val_data).batch(batch_size)
# test dataset
ds_test_encoded = encode_data(test_data).batch(batch_size)

In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# model initialization
model = TFBertForMultilabelClassification.from_pretrained(model_path, num_labels=num_classes)#------------6个标签
# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08, clipnorm=1)
# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.BinaryCrossentropy()#-----------------------------------binary_crossentropy 损失函数
metric = tf.keras.metrics.CategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# fit model
bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_val_encoded)

Some layers from the model checkpoint at D:/My_Document/Data_science/NLP/demo/Transformers/model_dirs/bert-base-uncased were not used when initializing TFBertForMultilabelClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForMultilabelClassification were not initialized from the model checkpoint at D:/My_Document/Data_science/NLP/demo/Transformers/model_dirs/bert-base-uncased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stre

Epoch 1/2
Epoch 2/2


In [21]:
# evaluate val_set
pred=model.predict(ds_val_encoded)[0]
pred.shape

(100, 6)

[({'input_ids': <tf.Tensor: shape=(2, 128), dtype=int32, numpy=
   array([[  101,  6583,  4478,  2497,  1041, 18032,  4305,  2632,  1011,
            8038, 11493,  1045,  6592,  2025,  2000,  2224, 20423, 27427,
           19304,  1999,  1996,  3720,  6583,  4478,  2497,  1041, 18032,
            4305,  2632,  1011,  8038, 11493,  2029,  2031,  3728,  2580,
            2000,  3362,  1037,  2062,  3671,  2559,  3931,  1012,  3407,
           10086,  3436,  1012,   102,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,   

In [None]:
df_auc = measure_auc(val_data.iloc[:,2:].astype(np.float32).values,pred)
print("val set mean column auc:",df_auc)
#predict test_set

In [18]:
val_data.iloc[:,2:]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
900,0,0,0,0,0,0
901,0,0,0,0,0,0
902,0,0,0,0,0,0
903,0,0,0,0,0,0
904,0,0,0,0,0,0
...,...,...,...,...,...,...
995,0,0,0,0,0,0
996,0,0,0,0,0,0
997,0,0,0,0,0,0
998,0,0,0,0,0,0
