In [5]:
# !pip install pynvml
# !pip install transformers
# !pip install scikit-learn
# !pip install pandas

Looking in indexes: https://pypi.douban.com/simple
Looking in indexes: https://pypi.douban.com/simple
Looking in indexes: https://pypi.douban.com/simple
Looking in indexes: https://pypi.douban.com/simple
Collecting pandas
  Using cached https://pypi.doubanio.com/packages/51/51/48f3fc47c4e2144da2806dfb6629c4dd1fa3d5a143f9652b141e979a8ca9/pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
Collecting pytz>=2017.3
  Downloading https://pypi.doubanio.com/packages/70/94/784178ca5dd892a98f113cdd923372024dc04b8d40abe77ca76b5fb90ca6/pytz-2021.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 12.8 MB/s 
Installing collected packages: pytz, pandas
Successfully installed pandas-1.2.4 pytz-2021.1


In [7]:
import logging
import os
logging.basicConfig(level=logging.ERROR)
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pynvml

In [8]:
# Restrict the process to GPU 0. This must happen BEFORE the first GPU query:
# the CUDA runtime reads CUDA_VISIBLE_DEVICES when it initialises, so setting
# it after the devices have been enumerated has no effect.
# NOTE(review): setting it before `import tensorflow` is the most conservative
# placement — confirm whether the import cell should own this line.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# tf.test.is_gpu_available() is deprecated; derive the boolean from the
# physical device list instead.
gpus = tf.config.list_physical_devices('GPU')
print(bool(gpus))
print(gpus)

True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [9]:
def split_dataset(df, label_col='label', holdout_size=0.1, random_state=42):
    """Split ``df`` into stratified train / validation / test sets.

    The frame is first split into a training part and a hold-out part; the
    hold-out part is then halved into validation and test. With the defaults
    this is a 90 % / 5 % / 5 % split, stratified on ``label_col`` at both steps.

    Args:
        df: DataFrame containing a ``label_col`` column.
        label_col: Name of the column used for stratification.
        holdout_size: ``test_size`` passed to the first ``train_test_split``
            call, i.e. the share of ``df`` reserved for validation + test.
        random_state: Seed for the first split. The second split uses
            ``random_state + 1`` (42 / 43 by default); ``None`` disables
            seeding for both splits.

    Returns:
        Tuple ``(train_set, val_set, test_set)`` of DataFrames.
    """
    train_set, holdout = train_test_split(
        df,
        stratify=df[label_col],
        test_size=holdout_size,
        random_state=random_state,
    )

    # Use a different seed for the second split so it is not tied to the first.
    second_seed = None if random_state is None else random_state + 1
    val_set, test_set = train_test_split(
        holdout,
        stratify=holdout[label_col],
        test_size=0.5,
        random_state=second_seed,
    )

    return train_set, val_set, test_set

In [18]:
data_path = "../data/THUCNewsChinese.txt"
# Read the tab-separated corpus: one headline and its category name per line.
df_raw = pd.read_csv(data_path, sep="\t", header=None, names=["text", "label"])
# Map each category name to an integer class id (column "y").
df_label = pd.DataFrame({"label":["财经","房产","股票","教育","科技","社会","时政","体育","游戏","娱乐"],"y":list(range(10))})
df_raw = pd.merge(df_raw, df_label, on="label", how="left")

# A left merge leaves y = NaN for any category missing from df_label; those
# NaNs would otherwise flow silently into the training labels. Fail fast.
unmapped_labels = df_raw.loc[df_raw["y"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError("labels without a class id: {}".format(list(unmapped_labels)))

# Stratified train / validation / test split.
train_data, val_data, test_data = split_dataset(df_raw)
print("FULL Dataset: {}".format(df_raw.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("Validation Dataset: {}".format(val_data.shape))
print("TEST Dataset: {}".format(test_data.shape))
train_data.head()

FULL Dataset: (200000, 3)
TRAIN Dataset: (180000, 3)
Validation Dataset: (10000, 3)
TEST Dataset: (10000, 3)


Unnamed: 0,text,label,y
57544,男子长时间上网昏倒后被转入重症监护室,社会,5
28829,《生化尖兵》恶搞广告片：生化经理人,游戏,8
128336,《武林OL》“决战光明顶”今日上线,游戏,8
147143,美证交会调查华尔街ETF内幕交易,股票,2
54266,美国信用评级下降 美债反受青睐,股票,2


In [10]:
def customDataset(dataSet):
    """Tokenize the ``text`` column of ``dataSet`` and wrap it in a tf.data.Dataset.

    Reads the module-level ``tokenizer`` and ``max_length``; both must be
    defined before this function is called.

    Args:
        dataSet: DataFrame with a ``text`` column and, optionally, a ``y``
            column of labels.

    Returns:
        ``tf.data.Dataset`` built from the pair ``(features, labels)``, where
        ``features`` is a dict of the tokenizer's output tensors and ``labels``
        is the list of ``y`` values, or ``None`` when there is no ``y`` column.
        NOTE(review): whether ``from_tensor_slices`` accepts a ``None``
        component depends on the TensorFlow version — confirm before relying
        on the unlabeled path.
    """
    # Every text is padded / truncated to exactly max_length tokens.
    encoded = tokenizer(
        dataSet['text'].tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    features = {name: tensor for name, tensor in encoded.items()}
    labels = dataSet['y'].values.tolist() if 'y' in dataSet.columns else None
    return tf.data.Dataset.from_tensor_slices((features, labels))

In [20]:
# Directory holding the pre-trained checkpoint; used for both the tokenizer
# here and the model in the training cell.
model_path = '../model_dirs/bert-base-chinese'
# parameters
batch_size = 8
# Disabled: query GPU 0's total memory via pynvml and raise the batch size to
# 16 when the card has more than 10 GiB.
# pynvml.nvmlInit()
# handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
# print(f'GPU Memory size:{meminfo.total}')
# if meminfo.total> 1024**3*10:
#     batch_size = 16 

# Token length every text is padded / truncated to (read by customDataset).
max_length = 128
learning_rate = 2e-5
number_of_epochs = 5
num_classes = 10 # one class per category in the label -> id table

# customDataset reads `tokenizer` and `max_length` as globals, so both must be
# defined before the calls below.
tokenizer = BertTokenizer.from_pretrained(model_path)
# train dataset: shuffled with a 10000-element buffer, then batched
ds_train_encoded = customDataset(train_data).shuffle(10000).batch(batch_size)
# val dataset: batched only, original order kept
ds_val_encoded = customDataset(val_data).batch(batch_size)
# test dataset: batched only, original order kept
ds_test_encoded = customDataset(test_data).batch(batch_size)

In [7]:
# Load the pre-trained encoder with a classification head of num_classes outputs.
model = TFBertForSequenceClassification.from_pretrained(model_path, num_labels=num_classes)
# Adam with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1)
# Labels are integer class ids rather than one-hot vectors, so use the sparse
# categorical cross-entropy and accuracy; the model outputs raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# Fine-tune, validating after every epoch.
bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_val_encoded)
# Evaluate on the held-out test set; prints [loss, accuracy].
print("# evaluate test_set:",model.evaluate(ds_test_encoded))
# Save the fine-tuned model next to the base checkpoint. The previous literal
# './model_dirs/...' was relative to the notebook directory and no longer
# matched model_path ('../model_dirs/...'), so derive the target from model_path.
fine_tuned_dir = os.path.join(
    os.path.dirname(os.path.normpath(model_path)),
    'fine_tune_MultiClass_model',
)
model.save_pretrained(fine_tuned_dir)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ./model_dirs/bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
# evaluate test_set: [0.2690059244632721, 0.9406999945640564]


In [8]:
# Loss and accuracy of the trained model on the validation split; the result
# is displayed as [loss, accuracy], matching the metrics passed to compile().
model.evaluate(ds_val_encoded)



[0.30297088623046875, 0.9333000183105469]

In [None]:
def y_label(logits):
    """Yield the predicted class index for each row of ``logits``.

    Args:
        logits: Tensor of per-class scores with one row per example
            (assumed rank 2, ``(batch, num_classes)`` — the shape produced by
            the classifier in this notebook).

    Yields:
        The arg-max class index of each row, as a NumPy integer scalar.
    """
    # Softmax is monotonic, so arg-max over the raw logits selects the same
    # class. One batched arg-max and one .numpy() call replace the per-row
    # softmax / arg-max / host transfer.
    yield from tf.argmax(logits, axis=-1).numpy()

def validation(epoch, dataset=None):
    """Run the model over a batched dataset and collect predictions and labels.

    Reads the module-level ``model`` and, when ``dataset`` is not given,
    the module-level ``ds_val_encoded``.

    Args:
        epoch: Unused; kept so existing calls keep working.
        dataset: Iterable of ``(inputs, targets)`` batches. Defaults to
            ``ds_val_encoded`` so the function can also be pointed at the
            test split.

    Returns:
        Tuple ``(fin_outputs, fin_targets)``: predicted class indices and
        ground-truth labels, as flat lists in dataset order.
    """
    if dataset is None:
        dataset = ds_val_encoded

    fin_targets = []
    fin_outputs = []
    for inputs, targets in dataset:
        # Inference only: make explicit that dropout must be disabled.
        outputs = model(inputs, training=False)
        fin_targets.extend(targets.numpy())
        fin_outputs.extend(y_label(outputs.logits))
    return fin_outputs, fin_targets

In [None]:
# Predicted class ids and ground-truth labels for the validation split.
outputs, targets = validation(number_of_epochs)

# The sklearn metric functions take (y_true, y_pred), in that order.
accuracy = metrics.accuracy_score(targets, outputs)
recall_score_micro, recall_score_macro = (
    metrics.recall_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)

# One metric per line.
print(
    f"Accuracy Score = {accuracy}",
    f"recall_score (Micro) = {recall_score_micro}",
    f"recall_score (Macro) = {recall_score_macro}",
    f"F1 Score (Micro) = {f1_score_micro}",
    f"F1 Score (Macro) = {f1_score_macro}",
    sep="\n",
)

Accuracy Score = 0.9331
recall_score (Micro) = 0.9331
recall_score (Macro) = 0.9330999999999999
F1 Score (Micro) = 0.9331
F1 Score (Macro) = 0.9332044368380819


In [None]:
# Send SIGKILL to this kernel's own process. Nothing after this cell runs and
# all in-memory state (model, datasets) is lost.
# NOTE(review): presumably done to release the GPU memory held by TensorFlow —
# confirm; a normal kernel restart/shutdown would be less abrupt.
pid = os.getpid()
!kill -9 $pid