In [15]:
!pip install pynvml
!pip install transformers
!pip install scikit-learn
!pip install pandas

Looking in indexes: https://pypi.douban.com/simple
Looking in indexes: https://pypi.douban.com/simple
Looking in indexes: https://pypi.douban.com/simple
Looking in indexes: https://pypi.douban.com/simple


In [16]:
from transformers import TFBertPreTrainedModel, TFBertMainLayer, BertTokenizer
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import metrics
from tqdm import tqdm
import os
import pynvml

In [17]:
print(tf.test.is_gpu_available())
print(tf.config.list_physical_devices('GPU'))
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [18]:
# parameters
train_path = "../data/Toxic Comment Classification Challenge/train.csv"
test_path = "../data/Toxic Comment Classification Challenge/test.csv"
df = pd.read_csv(train_path)[:1000]
df['label'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'label']].copy()
new_df.head()

Unnamed: 0,comment_text,label
0,Explanation\r\nWhy the edits made under my use...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\r\nMore\r\nI can't make any real suggestions...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [19]:
train_size=0.9
test_data = pd.read_csv(test_path)[:1000]
train_data = new_df.sample(frac=train_size,random_state=200)
val_data = new_df.drop(train_data.index).reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

FULL Dataset: (1000, 2)
TRAIN Dataset: (900, 2)
TEST Dataset: (1000, 2)


In [20]:
from transformers.modeling_tf_utils import get_initializer

class TFBertForMultilabelClassification(TFBertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super(TFBertForMultilabelClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.bert = TFBertMainLayer(config, name='bert')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='classifier',
                                                activation='sigmoid')#--------------------- sigmoid激活函数

    def call(self, inputs, **kwargs):
        outputs = self.bert(inputs, **kwargs)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
        logits = self.classifier(pooled_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        return outputs  # logits, (hidden_states), (attentions)

In [21]:
def customDataset(dataSet):
    inputs = tokenizer(dataSet['comment_text'].tolist(), max_length=max_length, padding='max_length', truncation=True,\
                   return_tensors='tf')
    if 'label' in dataSet.columns:
        label_list = dataSet['label'].values.tolist() 
    else:
        label_list = None
    result = tf.data.Dataset.from_tensor_slices((dict((k,v) for k, v in inputs.items()), label_list))          
    return result

In [22]:
%%time

model_path = '../model_dirs/bert-base-uncased'

# parameters
batch_size = 8
# pynvml.nvmlInit()
# handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
# print(f'GPU Memory size:{meminfo.total}')
# if meminfo.total> 1024**3*10:
#     batch_size = 16 
max_length = 128   
learning_rate = 1e-5
num_epochs = 2
num_classes = 6 

tokenizer = BertTokenizer.from_pretrained(model_path)

ds_train_encoded = customDataset(train_data).shuffle(100).batch(batch_size)
ds_val_encoded = customDataset(val_data).batch(batch_size)
ds_test_encoded = customDataset(test_data).batch(batch_size)

GPU Memory size:4294967296
Wall time: 3.23 s


In [23]:
# model initialization
model = TFBertForMultilabelClassification.from_pretrained(model_path, num_labels=num_classes)#------------6个标签
# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08, clipnorm=1)
# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.BinaryCrossentropy()#-----------------------------------binary_crossentropy 损失函数
metric = tf.keras.metrics.CategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# fit model
bert_history = model.fit(ds_train_encoded, epochs= num_epochs, validation_data=ds_val_encoded)
model.evaluate(ds_val_encoded)
model.save_pretrained('../model_dirs/fine_tune_multiLable_model/')

Some layers from the model checkpoint at ../model_dirs/bert-base-uncased were not used when initializing TFBertForMultilabelClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMultilabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForMultilabelClassification were not initialized from the model checkpoint at ../model_dirs/bert-base-uncased and are newly initialized: ['dropout_75', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2


ResourceExhaustedError:  OOM when allocating tensor with shape[8,128,12,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node tf_bert_for_multilabel_classification_1/bert/encoder/layer_._3/attention/self/transpose_3 (defined at C:\Users\Administrator\miniconda3\envs\tfs\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:279) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_90196]

Errors may have originated from an input operation.
Input Source operations connected to node tf_bert_for_multilabel_classification_1/bert/encoder/layer_._3/attention/self/transpose_3:
 tf_bert_for_multilabel_classification_1/bert/encoder/layer_._3/attention/self/MatMul_1 (defined at C:\Users\Administrator\miniconda3\envs\tfs\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:278)

Function call stack:
train_function


In [None]:
# evaluate val_set
pred=model.predict(ds_val_encoded)[0]
pred.shape

(100, 6)

In [None]:
model(list(ds_val_encoded)[0][0])

(<tf.Tensor: shape=(8, 6), dtype=float32, numpy=
 array([[0.1683824 , 0.01843818, 0.05212034, 0.01702631, 0.06402564,
         0.0249862 ],
        [0.85332656, 0.43762764, 0.776714  , 0.4558893 , 0.7832549 ,
         0.48621866],
        [0.22616144, 0.02268914, 0.06788898, 0.021499  , 0.09119362,
         0.03336108],
        [0.23781547, 0.02197246, 0.07834281, 0.02233656, 0.0894249 ,
         0.03426007],
        [0.8534158 , 0.44260222, 0.7781072 , 0.45336506, 0.7842355 ,
         0.48268858],
        [0.1709588 , 0.01829915, 0.05629163, 0.01744768, 0.06684747,
         0.02518667],
        [0.59546566, 0.09101999, 0.3442278 , 0.08860752, 0.3571328 ,
         0.15255591],
        [0.19769281, 0.01901881, 0.0607809 , 0.01898139, 0.07185346,
         0.02900807]], dtype=float32)>,)

In [None]:
def validation(epoch):
    fin_targets=[]
    fin_outputs=[]
    for _, data in enumerate(ds_val_encoded, 0):
        inputs = data[0]
        targets = data[1]
        outputs = model(inputs)[0]
        fin_targets.extend(targets.numpy().tolist())
        fin_outputs.extend(outputs.numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(num_epochs)
targets = np.array(targets)
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
recall_score_micro = metrics.recall_score(targets, outputs, average='micro')
recall_score_macro = metrics.recall_score(targets, outputs, average='macro')
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"recall_score (Micro) = {recall_score_micro}")
print(f"recall_score (Macro) = {recall_score_macro}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:

pid = os.getpid()
!kill -9 $pid

Accuracy Score = 0.81
recall_score (Micro) = 0.4523809523809524
recall_score (Macro) = 0.29814814814814816
F1 Score (Micro) = 0.5428571428571429
F1 Score (Macro) = 0.35813492063492064


'kill' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
