In [1]:
import logging
import os

# Kept ahead of the third-party imports, as in the original cell ordering.
logging.basicConfig(level=logging.ERROR)

import numpy as np
import pandas as pd
import pynvml
import tensorflow as tf
from sklearn import metrics
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizer

In [2]:
# Pin the process to the first GPU *before* TensorFlow enumerates devices:
# the variable has to be in place ahead of the first device query to take
# effect, so it must not be set after the checks below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# `tf.test.is_gpu_available()` is deprecated; a non-empty physical device list
# answers the same question.
gpu_devices = tf.config.list_physical_devices('GPU')
print(bool(gpu_devices))
print(gpu_devices)

True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
def split_dataset(df):
    """Split a labelled frame into train / validation / test parts.

    The split is stratified on the ``label`` column and done in two steps:
    10% of the rows are held out first, then that hold-out is cut in half,
    giving 90% train, 5% validation and 5% test.

    Args:
        df: DataFrame containing a ``label`` column.

    Returns:
        Tuple ``(train_set, val_set, test_set)`` of DataFrames.
    """
    # Step 1: carve out the 10% hold-out.
    train_set, holdout = train_test_split(
        df, test_size=0.1, stratify=df['label'], random_state=42
    )
    # Step 2: halve the hold-out into validation and test.
    val_set, test_set = train_test_split(
        holdout, test_size=0.5, stratify=holdout['label'], random_state=43
    )
    return train_set, val_set, test_set

In [4]:
data_path = "./data/THUCNewsChinese.txt"
# Read only the first 1000 rows (tab-separated, no header) instead of parsing
# the whole file and slicing afterwards.
df_raw = pd.read_csv(data_path, sep="\t", header=None, names=["text", "label"], nrows=1000)
# Map every category name to an integer class id stored in column `y`.
df_label = pd.DataFrame({"label":["财经","房产","股票","教育","科技","社会","时政","体育","游戏","娱乐"],"y":list(range(10))})
df_raw = pd.merge(df_raw, df_label, on="label", how="left")
# A left merge leaves NaN in `y` for any label missing from the mapping, which
# would silently feed invalid targets to training, so fail loudly instead.
unmapped_labels = df_raw.loc[df_raw["y"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"labels missing from the class mapping: {list(unmapped_labels)}")
# Stratified train / validation / test split.
train_data, val_data, test_data = split_dataset(df_raw)
print("FULL Dataset: {}".format(df_raw.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))
train_data.head()

FULL Dataset: (1000, 3)
TRAIN Dataset: (900, 3)
TEST Dataset: (50, 3)


Unnamed: 0,text,label,y
173,波兰斯基要求撤销诱奸案 拍纪录片披露真相(图),娱乐,9
270,女子不服判决带棺材到法院闹事,社会,5
824,松下 DMC-GF3电脑展现场限售仅4880元,科技,4
621,江西08年自考女生首次突破一半,教育,3
74,德媒披露鲁能引援关键人物 是他力荐德甲亚洲强人,体育,7


In [5]:
def customDataset(dataSet):
    """Tokenize a DataFrame of texts into a ``tf.data.Dataset``.

    Relies on the notebook-level ``tokenizer`` and ``max_length`` being defined
    before the function is called. Every text is padded or truncated to
    ``max_length`` tokens.

    Args:
        dataSet: DataFrame with a ``text`` column and, optionally, an integer
            class-id column ``y``.

    Returns:
        A dataset of ``(features, label)`` pairs when ``y`` is present, or a
        dataset of ``features`` only for unlabeled data. ``features`` is a dict
        of the tokenizer's output tensors.
    """
    encodings = tokenizer(
        dataSet['text'].tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    features = dict(encodings.items())

    # Unlabeled input: a `None` label component cannot be turned into a tensor,
    # so slice the features on their own instead of pairing them with None.
    if 'y' not in dataSet.columns:
        return tf.data.Dataset.from_tensor_slices(features)

    labels = dataSet['y'].values.tolist()
    return tf.data.Dataset.from_tensor_slices((features, labels))

In [6]:
%%time

# Query the total memory of GPU 0 through NVML. Initialization is paired with a
# shutdown so the NVML handle is released even if the query fails.
pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
finally:
    pynvml.nvmlShutdown()
print(f'GPU Memory size:{meminfo.total}')

model_path = '../model_dirs/bert-base-chinese'

# Hyper-parameters. The defaults are reduced on GPUs with less than 10 GiB of
# total memory.
small_gpu_threshold_bytes = 10 * 1024**3
max_length = 128
batch_size = 16
if meminfo.total < small_gpu_threshold_bytes:
    batch_size = 6
    max_length = 32
learning_rate = 2e-5
number_of_epochs = 2
num_classes = 10  # one class per category in the label mapping

tokenizer = BertTokenizer.from_pretrained(model_path)

# Training data is shuffled (buffer of 1000 elements) and batched;
# validation and test data are only batched.
ds_train_encoded = customDataset(train_data).shuffle(1000).batch(batch_size)
ds_val_encoded = customDataset(val_data).batch(batch_size)
ds_test_encoded = customDataset(test_data).batch(batch_size)

GPU Memory size:4294967296
Wall time: 215 ms


In [7]:
# Load the pretrained BERT checkpoint with a classification head sized for
# `num_classes` labels.
model = TFBertForSequenceClassification.from_pretrained(model_path, num_labels=num_classes)
# Adam optimizer with the configured learning rate and gradient-norm clipping at 1.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,epsilon=1e-08, clipnorm=1)
# Labels are integer class ids rather than one-hot vectors, so use the sparse
# categorical cross-entropy loss (on logits) and the sparse categorical accuracy.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# Fine-tune on the training split, validating on the validation split after each epoch.
bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_val_encoded)
# Evaluate the fine-tuned model on the held-out test split.
print("# evaluate test_set:",model.evaluate(ds_test_encoded))
# Save the fine-tuned model to disk.
model.save_pretrained('../model_dirs/fine_tune_MultiClass_model/')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ../model_dirs/bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
  2/450 [..............................] - ETA: 1:50 - loss: 2.4736 - accuracy: 0.3750   

In [None]:
# Evaluate the compiled model on the validation split; the cell displays the
# value returned by `model.evaluate`.
model.evaluate(ds_val_encoded)

In [None]:
def y_label(logits):
    """Yield the predicted class index for each row of a batch of logits.

    Args:
        logits: 2-D array-like of per-class scores, one row per sample
            (anything ``np.asarray`` accepts).

    Yields:
        The index of the highest score in each row, in row order.
    """
    # Softmax is monotonic, so the argmax of the raw logits is the same class
    # as the argmax of the probabilities. Doing it once for the whole batch
    # avoids a separate softmax/argmax call per row.
    yield from np.argmax(np.asarray(logits), axis=-1)

def validation(epoch, dataset=None):
    """Run the model over a batched dataset and collect predictions and targets.

    Uses the notebook-level ``model`` and ``y_label``.

    Args:
        epoch: Unused; kept so existing calls keep working.
        dataset: Batched dataset of ``(features, labels)`` pairs. Defaults to
            the notebook-level validation set ``ds_val_encoded``.

    Returns:
        Tuple ``(fin_outputs, fin_targets)``: predicted class ids and the
        matching true labels, both as flat lists in dataset order.
    """
    if dataset is None:
        dataset = ds_val_encoded

    fin_targets = []
    fin_outputs = []
    for inputs, targets in dataset:
        # Inference only: make the non-training mode explicit.
        outputs = model(inputs, training=False)
        fin_targets.extend(targets.numpy())
        fin_outputs.extend(y_label(outputs.logits))
    return fin_outputs, fin_targets

In [None]:
# Collect predictions and true labels from the validation run.
outputs, targets = validation(number_of_epochs)

# Score the predictions: accuracy plus micro/macro averaged recall and F1.
accuracy = metrics.accuracy_score(targets, outputs)
recall_score_micro, recall_score_macro = (
    metrics.recall_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)

# Print one "<caption> = <value>" line per metric.
report_rows = (
    ("Accuracy Score", accuracy),
    ("recall_score (Micro)", recall_score_micro),
    ("recall_score (Macro)", recall_score_macro),
    ("F1 Score (Micro)", f1_score_micro),
    ("F1 Score (Macro)", f1_score_macro),
)
for caption, value in report_rows:
    print(f"{caption} = {value}")

In [None]:

# Forcefully terminate this notebook's own kernel process: look up the current
# process id and send it signal 9 through the shell (IPython substitutes the
# Python variable `pid` for `$pid`).
# NOTE(review): presumably done to release GPU memory held by the kernel — confirm.
# NOTE(review): needs a `kill` command on the shell PATH; verify on the target OS.
pid = os.getpid()
!kill -9 $pid