In [2]:
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
def split_dataset(df):
    """Split a labelled frame into train / validation / test partitions.

    Both cuts are stratified on the 'label' column and use fixed seeds, so
    the partitioning is repeatable for the same input.

    Args:
        df: frame exposing a 'label' column to stratify on.

    Returns:
        Tuple ``(train_set, val_set, test_set)``: the first cut holds out
        a 0.1 fraction of ``df``; the second cut divides that hold-out in
        half (0.5) into validation and test.
    """
    # First cut: keep the bulk for training, hold out a 0.1 fraction.
    train_part, holdout = train_test_split(
        df,
        test_size=0.1,
        random_state=42,
        stratify=df['label'],
    )

    # Second cut: halve the hold-out into validation and test.
    val_part, test_part = train_test_split(
        holdout,
        test_size=0.5,
        random_state=43,
        stratify=holdout['label'],
    )

    return train_part, val_part, test_part

In [6]:
# Location of the tab-separated corpus: one "<text>\t<label>" record per line, no header row.
data_path = "./data/THUCNewsChinese.txt"

# Load the raw corpus into a two-column frame.
df_raw = pd.read_csv(
    data_path,
    header=None,
    names=["text", "label"],
    sep="\t",
)

# Lookup table assigning each category name an integer id 0..9 in column "y".
df_label = pd.DataFrame({
    "label": ["财经","房产","股票","教育","科技","社会","时政","体育","游戏","娱乐"],
    "y": list(range(10)),
})

# Attach the integer id to every row; the left join keeps all rows of df_raw.
df_raw = df_raw.merge(df_label, how="left", on="label")

# Stratified train / validation / test partitions.
train_data, val_data, test_data = split_dataset(df_raw)

In [7]:
# Peek at the first validation row; iterrows() yields (index, row) pairs.
s = next(val_data.iterrows())
print(s)
# Keep that row's raw text as the running example for the tokenizer cells.
review = s[1]['text']
review 

(67386, text     房山顺成嘉苑在售91平米通透2居均价8200元(图)
label                            房产
y                                 1
Name: 67386, dtype: object)


'房山顺成嘉苑在售91平米通透2居均价8200元(图)'

In [8]:
def convert_example_to_feature(review):
    """Encode one text into BERT inputs using the module-level ``tokenizer``.

    A single call handles tokenization, id mapping, insertion of the
    special tokens, truncation to 32 tokens and padding up to 32 tokens.

    Args:
        review: the text to encode.

    Returns:
        The value returned by ``tokenizer.encode_plus``; in this notebook
        it exposes 'input_ids', 'token_type_ids' and 'attention_mask'.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,      # add [CLS], [SEP]
        max_length=32,                # max length of the text that can go to BERT
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is the
        # equivalent spelling already used by the other cells of this notebook.
        padding='max_length',         # add [PAD] tokens up to max_length
        return_attention_mask=True,   # mask so the model ignores pad tokens
        truncation=True,
    )

In [218]:
# Load the tokenizer from a local model directory and encode the sample text.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
model_path = "D:/My_Document/Data_science/NLP/demo/Transformers/model_dirs/bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_path)
# Despite its name, `input_ids` holds the whole encoding returned by
# convert_example_to_feature (input_ids, token_type_ids and attention_mask).
input_ids = convert_example_to_feature(review)
input_ids

{'input_ids': [101, 2791, 2255, 7556, 2768, 1649, 5723, 1762, 1545, 8440, 2398, 5101, 6858, 6851, 123, 2233, 1772, 817, 10398, 8129, 1039, 113, 1745, 114, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [217]:
# Same encoding through the tokenizer's __call__ API.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
tokenizer(review, max_length = 32, padding='max_length', truncation=True)

{'input_ids': [101, 2791, 2255, 7556, 2768, 1649, 5723, 1762, 1545, 8440, 2398, 5101, 6858, 6851, 123, 2233, 1772, 817, 10398, 8129, 1039, 113, 1745, 114, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [216]:
# Same encoding again, this time returned as TensorFlow tensors with a leading batch axis.
tokenizer(review, max_length = 32,  padding = 'max_length', truncation=True, return_tensors='tf')

{'input_ids': <tf.Tensor: shape=(1, 32), dtype=int32, numpy=
array([[  101,  2791,  2255,  7556,  2768,  1649,  5723,  1762,  1545,
         8440,  2398,  5101,  6858,  6851,   123,  2233,  1772,   817,
        10398,  8129,  1039,   113,  1745,   114,   102,     0,     0,
            0,     0,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(1, 32), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 32), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])>}

In [327]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the separate encoding fields into a ``(features, label)`` pair.

    Args:
        input_ids: token ids of one example.
        attention_masks: attention mask of the same example.
        token_type_ids: segment ids of the same example.
        label: target stored alongside the example.

    Returns:
        Tuple ``(features, label)`` where ``features`` is a dict keyed by
        'input_ids', 'token_type_ids' and 'attention_mask'.
    """
    features = {}
    features["input_ids"] = input_ids
    features["token_type_ids"] = token_type_ids
    features["attention_mask"] = attention_masks
    return features, label

def encode_examples(ds, limit=-1):
    """Tokenize the rows of a DataFrame and wrap them in a ``tf.data.Dataset``.

    Args:
        ds: pandas DataFrame with a "text" column (the string to tokenize)
            and a "y" column (the label kept alongside each example).
        limit: when > 0, only the first ``limit`` rows are encoded; any
            other value (default -1) encodes every row.

    Returns:
        A ``tf.data.Dataset`` built from the per-example input ids,
        attention masks, token type ids and labels, mapped through
        ``map_example_to_dict`` so each element is ``(features, label)``.
    """
    if limit > 0:
        # DataFrame.take() selects rows by positional *indices* (unlike
        # tf.data.Dataset.take(n)); head(n) is what keeps the first n rows.
        ds = ds.head(limit)

    # Per-field lists, so the final dataset can be built from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for _, row in ds.iterrows():
        bert_input = tokenizer(row["text"], max_length=32, padding='max_length', truncation=True)

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append(row["y"])

    # Tuple order must match map_example_to_dict's positional parameters.
    slices = (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    return tf.data.Dataset.from_tensor_slices(slices).map(map_example_to_dict)


In [328]:
# Number of examples per batch for the encoded datasets.
batch_size = 10
# tokenizer = BertTokenizer.from_pretrained(model_path)
# Only the validation split is encoded here; the train and test pipelines
# below are left disabled (commented out).
# train dataset
# ds_train_encoded = encode_examples(train_data).shuffle(10000).batch(batch_size)
# # val dataset
ds_val_encoded = encode_examples(val_data).batch(batch_size)
# # test dataset
# ds_test_encoded = encode_examples(test_data).batch(batch_size)

In [232]:
# Load the 10-label sequence classifier from the local 'fine_tune_model/' directory
# (presumably the checkpoint saved by an earlier fine-tuning run — confirm).
model = TFBertForSequenceClassification.from_pretrained('fine_tune_model/',num_labels=10)

Some layers from the model checkpoint at fine_tune_model/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at fine_tune_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [329]:
# Show the first ten validation rows (text, label name, integer id).
val_data[:10]

Unnamed: 0,text,label,y
67386,房山顺成嘉苑在售91平米通透2居均价8200元(图),房产,1
150820,国内首只对冲基金成立一个月获益近2%,财经,0
74822,TD数字无绳电话年内上市：欲成杀手级产品,科技,4
146018,《无限传说》公布新登场角色画面欣赏,游戏,8
122072,坦普：油价走高趋势不改,财经,0
29755,羚牛争偶战败独自下山滋事 闯进农家院3人被困,社会,5
52606,尼日尔总统任命加马蒂耶为总理,时政,6
128983,《QQ三国》名人堂 低调也是炫耀�,游戏,8
938,中信银行与支付宝推快捷支付业务,科技,4
21446,女性网上创业平台推出,科技,4


In [330]:
# Texts of the first ten validation rows as a plain Python list.
val_data[:10]['text'].tolist()

['房山顺成嘉苑在售91平米通透2居均价8200元(图)',
 '国内首只对冲基金成立一个月获益近2%',
 'TD数字无绳电话年内上市：欲成杀手级产品',
 '《无限传说》公布新登场角色画面欣赏',
 '坦普：油价走高趋势不改',
 '羚牛争偶战败独自下山滋事 闯进农家院3人被困',
 '尼日尔总统任命加马蒂耶为总理',
 '《QQ三国》名人堂 低调也是炫耀�',
 '中信银行与支付宝推快捷支付业务',
 '女性网上创业平台推出']

In [331]:
# Tokenize the first ten validation texts in one call, padded/truncated to
# 32 tokens and returned as TensorFlow tensors.
input_id = (tokenizer(val_data[:10]['text'].tolist(), max_length = 32,  padding = 'max_length', truncation=True, return_tensors='tf'))

In [332]:
# Token ids of the first example in the directly tokenized batch.
input_id['input_ids'][0]

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([  101,  2791,  2255,  7556,  2768,  1649,  5723,  1762,  1545,
        8440,  2398,  5101,  6858,  6851,   123,  2233,  1772,   817,
       10398,  8129,  1039,   113,  1745,   114,   102,     0,     0,
           0,     0,     0,     0,     0])>

In [324]:
# Decode the first example of the first validation batch back into tokens.
# next(iter(...)) pulls just that batch instead of materialising the whole
# dataset with list(...).
''.join(tokenizer.convert_ids_to_tokens(next(iter(ds_val_encoded))[0]['input_ids'][0]))

'[CLS]房山顺成嘉苑在售91平米通透2居均价820##0元(图)[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [325]:
# Decode the first directly tokenized example back into tokens, for
# comparison with the dataset-pipeline encoding of the same text.
''.join(tokenizer.convert_ids_to_tokens(input_id['input_ids'][0]))

'[CLS]房山顺成嘉苑在售91平米通透2居均价820##0元(图)[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [333]:
# Run the classifier on the ten tokenized texts and show the raw logits
# (one row per text, one column per class) for the first five.
outputs = model(input_id)
logits = outputs.logits
logits[:5]

<tf.Tensor: shape=(5, 10), dtype=float32, numpy=
array([[-0.06825325, 10.845659  , -1.4365877 , -1.1311263 , -2.2768779 ,
         1.2029951 , -1.2820653 , -1.1390705 , -1.3907384 , -4.198648  ],
       [ 8.0618515 ,  0.9518317 ,  2.5615802 , -1.6324717 , -0.25369108,
        -1.3738074 , -1.5680287 , -2.930448  , -4.0349464 , -4.084395  ],
       [-0.9568867 , -2.1097174 ,  0.18050869, -0.76184905,  6.479282  ,
        -1.5816749 , -1.2020916 , -1.1032    , -0.3342142 , -2.411711  ],
       [-2.7709982 ,  0.8198834 , -1.387241  , -2.170227  ,  1.0532678 ,
        -3.150462  , -2.0198505 , -1.335979  , 10.740458  , -0.4670856 ],
       [ 7.5408697 ,  1.4736296 ,  4.245677  , -2.4453602 , -0.9757267 ,
        -2.1691797 , -1.2424012 , -2.7686968 , -3.8558693 , -3.9255178 ]],
      dtype=float32)>

In [334]:
def y_lable(logits):
    """Lazily yield the predicted class index for each row of ``logits``.

    Args:
        logits: indexable sequence of per-class score rows; ``len()`` and
            integer indexing along the first axis are used.

    Yields:
        For every row, the index of the largest softmax probability,
        converted with ``.numpy()``.
    """
    row_count = len(logits)
    for row_idx in range(row_count):
        probabilities = tf.nn.softmax(logits[row_idx], axis=-1)
        best_class = tf.argmax(probabilities)
        yield best_class.numpy()
# Predicted class ids for every row of `logits`.
list(y_lable(logits))

[1, 0, 4, 8, 0, 5, 6, 8, 4, 4]

In [235]:
# Inspect the type of the object returned by the tokenizer call.
type(input_id)

transformers.tokenization_utils_base.BatchEncoding

In [293]:
# Type of the features element of the first dataset batch.
# next(iter(...)) fetches only that batch rather than listing the whole dataset.
type(next(iter(ds_val_encoded))[0])

dict

In [236]:
# dict() turns the tokenizer output into a plain dict, matching the type of
# the features the dataset pipeline yields.
type(dict(input_id))

dict

In [295]:
# Same prediction through Keras predict(); the tokenizer output is converted
# to a plain dict first.
# [0] takes the first element of predict()'s return — presumably the logits; verify.
pred=model.predict(dict(input_id))[0]
list(y_lable(pred))

[1, 0, 4, 8, 0, 5, 6, 8, 4, 4]

In [323]:
# Run the classifier on the features of the first dataset batch.
# next(iter(...)) fetches only that batch instead of materialising the
# entire validation dataset with list(...).
outputs = model(next(iter(ds_val_encoded))[0])
logits = outputs.logits
list(y_lable(logits))

[1, 0, 4, 8, 0, 5, 6, 8, 4, 4]

In [340]:
# Ground-truth labels of the first dataset batch, for comparison with the
# predictions; only the first batch is fetched.
next(iter(ds_val_encoded))[1].numpy().tolist()

[1, 0, 4, 8, 0, 5, 6, 8, 4, 4]