In [None]:
!pip install pandas scikit-learn matplotlib tensorflow transformers

In [2]:
from transformers import TFBertPreTrainedModel, TFBertMainLayer, BertTokenizer
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import metrics
from tqdm import tqdm
import os

In [2]:
print(tf.test.is_gpu_available())
print(tf.config.list_physical_devices('GPU'))
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [6]:
train_path = "./data/MultilabelSequenceClassification/toxic-comment-classification/train.csv.zip"
test_path = "./data/MultilabelSequenceClassification/toxic-comment-classification/test.csv.zip"
df = pd.read_csv(train_path)
df['label'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'label']].copy()
new_df.head()

Unnamed: 0,comment_text,label
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [5]:
train_size=0.9
test_data = pd.read_csv(test_path)[:1000]
train_data = new_df.sample(frac=train_size,random_state=200)[:1000]
val_data = new_df.drop(train_data.index).reset_index(drop=True)[:1000]

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("VALIDATION Dataset: {}".format(val_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

FULL Dataset: (159571, 2)
TRAIN Dataset: (1000, 2)
TEST Dataset: (1000, 2)


In [5]:
from transformers.modeling_tf_utils import get_initializer

class TFBertForMultilabelClassification(TFBertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super(TFBertForMultilabelClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.bert = TFBertMainLayer(config, name='bert')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='classifier',
                                                activation='sigmoid')#--------------------- sigmoid激活函数

    def call(self, inputs, **kwargs):
        outputs = self.bert(inputs, **kwargs)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
        logits = self.classifier(pooled_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        return outputs  # logits, (hidden_states), (attentions)

In [6]:
def customDataset(dataSet):
    inputs = tokenizer(dataSet['comment_text'].tolist(), max_length=max_length, padding='max_length', truncation=True,\
                   return_tensors='tf')
    if 'label' in dataSet.columns:
        label_list = dataSet['label'].values.tolist() 
    else:
        label_list = None
    result = tf.data.Dataset.from_tensor_slices((dict((k,v) for k, v in inputs.items()), label_list))          
    return result

In [7]:
model_path = '../model_dirs/bert-base-cased'
# parameters
max_length = 256
batch_size = 12
learning_rate = 1e-5
num_epochs = 5
num_classes = 6 

tokenizer = BertTokenizer.from_pretrained(model_path)

ds_train_encoded = customDataset(train_data).shuffle(100).batch(batch_size)
ds_val_encoded = customDataset(val_data).batch(batch_size)
ds_test_encoded = customDataset(test_data).batch(batch_size)

In [None]:
# model initialization
model = TFBertForMultilabelClassification.from_pretrained(model_path, num_labels=num_classes)#------------6个标签
# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08, clipnorm=1)
# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.BinaryCrossentropy()#-----------------------------------binary_crossentropy 损失函数
metric = tf.keras.metrics.CategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# fit model
bert_history = model.fit(ds_train_encoded, epochs= num_epochs, validation_data=ds_val_encoded)
model.evaluate(ds_val_encoded)
model.save_pretrained('./model_dirs/fine_tune_multiLable_model/')

Some layers from the model checkpoint at ./model_dirs/bert-base-uncased were not used when initializing TFBertForMultilabelClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForMultilabelClassification were not initialized from the model checkpoint at ./model_dirs/bert-base-uncased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
  374/11968 [..............................] - ETA: 32:55 - loss: 0.2235 - categorical_accuracy: 0.5209

In [None]:
# evaluate val_set
pred=model.predict(ds_val_encoded)[0]
pred.shape

In [None]:
model(list(ds_val_encoded)[0][0])

In [None]:
def validation(epoch):
    fin_targets=[]
    fin_outputs=[]
    for _, data in enumerate(ds_val_encoded, 0):
        inputs = data[0]
        targets = data[1]
        outputs = model(inputs)[0]
        fin_targets.extend(targets.numpy().tolist())
        fin_outputs.extend(outputs.numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(num_epochs)
targets = np.array(targets)
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
recall_score_micro = metrics.recall_score(targets, outputs, average='micro')
recall_score_macro = metrics.recall_score(targets, outputs, average='macro')
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"recall_score (Micro) = {recall_score_micro}")
print(f"recall_score (Macro) = {recall_score_macro}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
pid = os.getpid()
!kill -9 $pid