In [1]:
# !pip install pandas scikit-learn matplotlib tensorflow transformers

In [2]:
import os
from ast import literal_eval
from typing import Dict, Optional, Tuple, Union
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import metrics
from transformers import TFBertPreTrainedModel, TFBertMainLayer, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(tf.test.is_gpu_available())
print(tf.config.list_physical_devices('GPU'))
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## English

In [4]:
train_path = "./data/MultilabelSequenceClassification/toxic-comment-classification/train.csv.zip"
test_path = "./data/MultilabelSequenceClassification/toxic-comment-classification/test.csv.zip"

# df = pd.read_csv(train_path)
# df['label'] = df[df.columns[2:]].values.tolist()
# new_df = df[['comment_text', 'label']].copy()
# new_df.rename(columns={'comment_text':'content'}, inplace=True)
# train_size=0.9
# test_data = pd.read_csv(test_path)[:1000]
# train_data = new_df.sample(frac=train_size,random_state=200)[:1000]
# val_data = new_df.drop(train_data.index).reset_index(drop=True)[:1000]

# print("FULL Dataset: {}".format(new_df.shape))
# print("TRAIN Dataset: {}".format(train_data.shape))
# print("VALIDATION Dataset: {}".format(val_data.shape))
# print("TEST Dataset: {}".format(test_data.shape))

### Chineses

In [5]:
def load_dataset(train_path, train_size=0.9):
    df = pd.read_csv(train_path)
    new_df = df[['content', 'label_ids']].copy()
    new_df.rename(columns={'label_ids':'label'}, inplace=True)
    new_df.label = new_df.label.apply(literal_eval)
    train_data = new_df.sample(frac=train_size, random_state=200)
    val_data = new_df.drop(train_data.index)

    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    
    print(f"FULL Dataset: {new_df.shape}")
    print(f"TRAIN Dataset: {train_data.shape}")
    print(f"VALIDATION Dataset: {val_data.shape}")

    return train_data, val_data

train_path = "./data/MultilabelSequenceClassification/chinese_dataset/train_dataset.zip"
train_data, val_data = load_dataset(train_path=train_path)
train_data.sample(5)

FULL Dataset: (100, 2)
TRAIN Dataset: (90, 2)
VALIDATION Dataset: (10, 2)


Unnamed: 0,content,label
60,好消息！国足对手叙利亚面临换帅足协高层已经集体辞职了,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,随着里贝里、罗本赛季结束后离开，长期为球迷奉献精彩比赛的拜仁“罗贝里”黄金组合也将结束历史使命。,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17,继王晨艺退赛后创2陆思恒又被爆出黑料！网友“又挡了谁的路”？,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
43,9月11日，第十一届全国少数民族传统体育运动会女子传统拳术三类决赛展开较量。来自山东武术院的...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
62,蔚来召回ES8，而特斯拉只有傲慢，网友：欺负我们没见过世面,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [6]:
from typing import Dict, Optional, Tuple, Union
from transformers.modeling_tf_utils import get_initializer, TFModelInputType, TFSequenceClassificationLoss
from transformers.modeling_tf_outputs import TFSequenceClassifierOutput
from transformers.tf_utils import shape_list

class TFSequenceClassificationLoss:
    """
    Loss function suitable for sequence classification.
    """

    def hf_compute_loss(self, labels, logits):
        if len(shape_list(logits)) == 1 or shape_list(logits)[1] == 1:
            loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
        else:
            loss_fn = tf.keras.losses.BinaryCrossentropy(
                from_logits=True, reduction=tf.keras.losses.Reduction.NONE
            )

        return loss_fn(labels, logits)

class TFBertForMultilabelClassification(TFBertPreTrainedModel, TFSequenceClassificationLoss):

    def __init__(self, config, *inputs, **kwargs):
        super(TFBertForMultilabelClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.bert = TFBertMainLayer(config, name='bert')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='classifier')

    def call(
        self,
        input_ids: Optional[TFModelInputType] = None,
        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
        head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        logits = self.classifier(inputs=pooled_output)
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # if not return_dict:
        #     output = (logits,) + outputs[2:]
        #     return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [7]:
def customDataset(dataset, tokenizer, max_length):
    inputs = tokenizer(dataset['content'].tolist(), max_length=max_length, padding='max_length', truncation=True,\
                   return_tensors='tf')
    if 'label' in dataset.columns:
        label_list = dataset['label'].values.tolist() 
    else:
        label_list = None
    result = tf.data.Dataset.from_tensor_slices((dict((k,v) for k, v in inputs.items()), label_list))          
    return result

In [8]:
# English
# model_path = '../models/bert-base-uncased'
# Chinese
model_path = '../models/bert-base-chinese'
# parameters
max_length = 128
batch_size = 8
learning_rate = 1e-5
num_epochs = 5
# num_classes = 6 
num_classes = 65

tokenizer = BertTokenizer.from_pretrained(model_path)

ds_train_encoded = customDataset(train_data, tokenizer, max_length).shuffle(100).batch(batch_size)
ds_val_encoded = customDataset(val_data, tokenizer, max_length).batch(batch_size)
# ds_test_encoded = customDataset(test_data, tokenizer, max_length).batch(batch_size)

In [9]:
# model initialization
model = TFBertForMultilabelClassification.from_pretrained(model_path, num_labels=num_classes)#------------6个标签

Some layers from the model checkpoint at ../models/bert-base-chinese were not used when initializing TFBertForMultilabelClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForMultilabelClassification were not initialized from the model checkpoint at ../models/bert-base-chinese and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# for i, (data, label) in enumerate(ds_val_encoded):
#     print(data, label)
#     if i == 0:
#         break
# output = model(**data,  labels=label)
# output

In [11]:
# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08, clipnorm=1)
# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)# binary_crossentropy 损失函数
metric = tf.keras.metrics.CategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# fit model
bert_history = model.fit(ds_train_encoded, epochs= num_epochs, validation_data=ds_val_encoded)
model.evaluate(ds_val_encoded)
model.save_pretrained('../models/fine_tune_multiLable_model/')

Epoch 1/5
1/9 [==>...........................] - ETA: 2:53 - loss: 0.6966 - categorical_accuracy: 0.0000e+00

ResourceExhaustedError: Graph execution error:

Detected at node 'tf_bert_for_multilabel_classification/bert/encoder/layer_._11/intermediate/Gelu/mul_1' defined at (most recent call last):
    File "d:\Program\minicoda3\envs\py3.8\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "d:\Program\minicoda3\envs\py3.8\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\traitlets\config\application.py", line 976, in launch_instance
      app.start()
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "d:\Program\minicoda3\envs\py3.8\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "d:\Program\minicoda3\envs\py3.8\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "d:\Program\minicoda3\envs\py3.8\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\IPython\core\interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\IPython\core\interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\IPython\core\interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Nan\AppData\Local\Temp\ipykernel_15704\1921213348.py", line 8, in <cell line: 8>
      bert_history = model.fit(ds_train_encoded, epochs= num_epochs, validation_data=ds_val_encoded)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\modeling_tf_utils.py", line 1398, in train_step
      y_pred = self(x, training=True)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Nan\AppData\Local\Temp\ipykernel_15704\1655751431.py", line 52, in call
      outputs = self.bert(
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\modeling_tf_utils.py", line 753, in run_call_with_unpacked_inputs
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 863, in call
      encoder_outputs = self.encoder(
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 548, in call
      for i, layer_module in enumerate(self.layer):
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 554, in call
      layer_outputs = layer_module(
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 510, in call
      intermediate_output = self.intermediate(hidden_states=attention_output)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 414, in call
      hidden_states = self.intermediate_act_fn(hidden_states)
    File "d:\Program\minicoda3\envs\py3.8\lib\site-packages\keras\activations.py", line 351, in gelu
      return tf.nn.gelu(x, approximate)
Node: 'tf_bert_for_multilabel_classification/bert/encoder/layer_._11/intermediate/Gelu/mul_1'
failed to allocate memory
	 [[{{node tf_bert_for_multilabel_classification/bert/encoder/layer_._11/intermediate/Gelu/mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_31233]

In [None]:
# evaluate val_set
# pred=model.predict(ds_val_encoded)
# pred

In [None]:
def validation(epoch):
    fin_targets=[]
    fin_outputs=[]
    for _, data in enumerate(ds_val_encoded, 0):
        inputs = data[0]
        targets = data[1]
        outputs = model(inputs)[0]
        fin_targets.extend(targets.numpy().tolist())
        fin_outputs.extend(outputs.numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(num_epochs)
targets = np.array(targets)
outputs = np.array(outputs) >= 0.5

accuracy = metrics.accuracy_score(targets, outputs)
precision_micro = metrics.precision_score(targets, outputs, average='micro')
precision_macro = metrics.precision_score(targets, outputs, average='macro')
precision_samples = metrics.precision_score(targets, outputs, average='samples')
recall_score_micro = metrics.recall_score(targets, outputs, average='micro')
recall_score_macro = metrics.recall_score(targets, outputs, average='macro')
recall_score_samples = metrics.recall_score(targets, outputs, average='samples')
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
f1_score_samples = metrics.f1_score(targets, outputs, average='samples')

print(f"Accuracy Score = {accuracy}")
print(f"precision (Micro) = {precision_micro}")
print(f"precision (Macro) = {precision_macro}")
print(f"precision (samples) = {precision_samples}")
print(f"recall_score (Micro) = {recall_score_micro}")
print(f"recall_score (Macro) = {recall_score_macro}")
print(f"recall_score (samples) = {recall_score_samples}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")
print(f"F1 Score (samples) = {f1_score_samples}")


Accuracy Score = 0.0
precision (Micro) = 0.0
precision (Macro) = 0.0
precision (samples) = 0.0
recall_score (Micro) = 0.0
recall_score (Macro) = 0.0
recall_score (samples) = 0.0
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0
F1 Score (samples) = 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [None]:
pid = os.getpid()
!kill -9 $pid

'kill' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
