In [1]:
import os

# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModelForPreTraining
# from transformers import TFElectraForSequenceClassification, ElectraConfig
from transformers import TFAutoModelForSequenceClassification, AutoConfig
from transformers import TFTrainer, TFTrainingArguments
from transformers import training_args

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import yaml

# Load notebook settings (model paths, batch sizes, step counts, ...) from config.yaml.
# The context manager closes the file handle as soon as parsing is done.
# NOTE(review): FullLoader can construct more than plain data types; config.yaml
# must be a trusted local file -- consider yaml.safe_load if only scalars/lists/dicts are used.
with open('config.yaml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# model = TFAutoModelForPreTraining.from_pretrained("hfl/chinese-electra-180g-small-discriminator")
# model = TFElectraForSequenceClassification.from_pretrained("hfl/chinese-electra-180g-small-discriminator")

# inputs = tokenizer("你听明白了吗", return_tensors="tf")
# outputs = model(**inputs)
# print(inputs, outputs)

In [2]:
# Tokenizer that matches the pretrained checkpoint named in config.yaml.
tokenizer = AutoTokenizer.from_pretrained(config['pretrain_model_dir'])
# tokenizer = AutoTokenizer.from_pretrained('/home/jasoncheung/project/trans/trans_models/trans_datas/pretrained_models/roberta_chinese_4_512')

MAX_SEQ_LEN = 150   # every text is truncated / padded to exactly this many tokens
SPLIT_SEED = 42     # fixed seed so the train/val/test split is identical on every run


def encode_dataset(texts, targets):
    """Tokenize `texts` and pair the result with `targets` in a tf.data.Dataset.

    Args:
        texts: list of raw strings to tokenize.
        targets: list of labels, aligned one-to-one with `texts`.

    Returns:
        (encodings, dataset): the tokenizer output and a `tf.data.Dataset`
        yielding `(feature_dict, label)` pairs built from it.
    """
    encodings = tokenizer(texts, return_tensors="tf", truncation=True,
                          padding='max_length', max_length=MAX_SEQ_LEN)
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), targets))
    return encodings, dataset


# Load the CSV; the `review` column holds the text and `label` the target.
# NOTE(review): absolute, user-specific path -- consider moving it into config.yaml.
path_datas = '/home/jasoncheung/project/trans/trans_datas/weibo_senti_100k.csv'
df = pd.read_csv(path_datas)
datas = df.review.tolist()
labels = df.label.tolist()

# Hold out 10% as the test set, then 10% of the remainder as the validation set.
# Seeding both splits keeps the reported metrics comparable between runs.
train_datas, test_datas, train_labels, test_labels = train_test_split(
    datas, labels, test_size=0.1, random_state=SPLIT_SEED)
train_datas, val_datas, train_labels, val_labels = train_test_split(
    train_datas, train_labels, test_size=0.1, random_state=SPLIT_SEED)

train_encodings, train_dataset = encode_dataset(train_datas, train_labels)
val_encodings, val_dataset = encode_dataset(val_datas, val_labels)
test_encodings, test_dataset = encode_dataset(test_datas, test_labels)


In [3]:
# training
# Gather every trainer option in one mapping, then unpack it into
# TFTrainingArguments. Values read from `config` come from config.yaml; the
# rest are fixed in this notebook.
# Note: this assignment rebinds the name `training_args`, which the imports
# cell also pulls in from `transformers`.
training_options = {
    'do_train': config['do_train'],
    'do_eval': config['do_eval'],
    'output_dir': config['output_dir'],                            # where outputs are written
    'num_train_epochs': config['num_train_epochs'],                # total number of training epochs
    'per_device_train_batch_size': config['train_batch_size'],     # per-device batch size while training
    'per_device_eval_batch_size': config['eval_batch_size'],       # per-device batch size while evaluating
    'warmup_steps': 500,                                           # warmup steps for the learning-rate scheduler
    'weight_decay': 0.01,                                          # weight-decay strength
    'logging_dir': config['logging_dir'],                          # where logs are stored
    'save_total_limit': config['save_total_limit'],
    'evaluation_strategy': 'steps',                                # evaluate on a step interval ...
    'eval_steps': config['eval_steps'],                            # ... of this many steps
    'load_best_model_at_end': True,
    'disable_tqdm': False,
    'max_steps': config['max_steps'],
    'save_steps': config['save_steps'],
}
training_args = TFTrainingArguments(**training_options)

# load model

# Build the model inside the distribution-strategy scope supplied by the
# training arguments, so its variables are created under that strategy.
with training_args.strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(
        config['pretrain_model_dir'],
        num_labels=config['num_labels'],
    )
    # Freeze the pretrained encoder and train only the output (classification)
    # layer; this is roughly 3x faster than training the whole network.
    if config['model_name'] in ('electra', 'bert'):
        # `base_model_prefix` is the attribute name under which the loaded
        # architecture stores its encoder ("electra", "bert", "roberta", ...).
        # Resolving it dynamically avoids the AttributeError the previous
        # hard-coded `model.roberta` raised for BERT-architecture checkpoints.
        encoder = getattr(model, model.base_model_prefix)
        encoder.trainable = False

def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: prediction object exposing `label_ids` (gold labels) and
            `predictions` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values as the default ('warn' also
    # reports 0) but without emitting an undefined-metric warning on every
    # evaluation in which no positive class is predicted or present.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Wire the model, training arguments, datasets and metric callback into the trainer.
trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,    # tf.data.Dataset built from the training split
    eval_dataset=val_dataset,       # tf.data.Dataset built from the validation split
    compute_metrics=compute_metrics,     # maps predictions to accuracy / f1 / precision / recall

)

Some layers from the model checkpoint at ./trans_datas/pretrained_models/electra_chinese_small were not used when initializing TFElectraForSequenceClassification: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForSequenceClassification were not initialized from the model checkpoint at ./trans_datas/pretrained_models/electra_chinese_small and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [4]:
import time

# Time the whole training run. perf_counter() is a monotonic, high-resolution
# clock, so the elapsed value cannot be skewed by system clock adjustments
# (NTP sync, manual changes) during a multi-hour run the way time.time() can.
tic = time.perf_counter()
trainer.train()
toc = time.perf_counter()
print('time use: ', toc - tic)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


time use:  11668.286463022232


In [7]:
# Export the trained model to ./results/model (the recorded output below shows
# assets being written to ./results/model/assets).
model.save('./results/model')
# Disabled alternative: rebuild the classifier and restore previously saved
# weights instead of retraining.
# NOTE(review): `dir_path` is only assigned in a later cell -- define it first
# before re-enabling these lines.
# model = TFElectraForSequenceClassification.from_pretrained(dir_path, 
#                                                            num_labels=2, 
#                                                            )
# model.load_weights('./results/model/')
# model.load_weights('./results/checkpoint/ckpt-6.index')





INFO:tensorflow:Assets written to: ./results/model/assets


INFO:tensorflow:Assets written to: ./results/model/assets


In [8]:
# Score the model on the held-out test split; the result is a dict of
# `eval_*` metrics (loss plus the values produced by compute_metrics).
res = trainer.evaluate(test_dataset)









In [16]:
# Display the test-set metrics stored in `res`.
res

{'eval_loss': 0.6538525033504405,
 'eval_accuracy': 0.6857546542553191,
 'eval_f1': 0.679549114331723,
 'eval_precision': 0.7034567467976838,
 'eval_recall': 0.6572131147540984}

In [None]:
from transformers import pipeline

# Allocate a pipeline for sentiment-analysis
# NOTE(review): no `model=` / `tokenizer=` is passed, so this pipeline is not
# built from the `model` fine-tuned above -- presumably the library's default
# checkpoint is loaded; confirm that this is intended.
classifier = pipeline('sentiment-analysis')
# Smoke-test the pipeline on a single English sentence.
classifier('We are very happy to introduce pipeline to the transformers repository.')

In [None]:
# Target directory for the saved pipeline.
# NOTE(review): absolute, user-specific path -- consider reading it from config.yaml.
dir_path = '/home/jasoncheung/project/trans/trans_models/sentiment_analysis/'
# Persist the `classifier` pipeline into `dir_path`.
classifier.save_pretrained(dir_path)

In [None]:
import transformers
# IPython-only introspection: `??` shows the source of TextClassificationPipeline.
# This is interactive exploration and is not valid plain-Python syntax; remove
# it before running the notebook as a script.
transformers.TextClassificationPipeline??