Dygraph fix4 (PaddlePaddle#464)
* upgrade version no

* seq2seq + length penalty

* upgrade to paddle 1.8

* fix readme

* update seq2seq to 1.8

* fix seq2seq beam-search

* + aistudio tutorial: loading old-styled checkpoint

* add missing seq2seq eval file for cnndm

* fix seq2seq decode post process

* + handlers
Meiyim committed May 27, 2020
1 parent 655cf25 commit 241c028
Showing 15 changed files with 112 additions and 57 deletions.
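The "seq2seq + length penalty" and "fix seq2seq beam-search" items above touch the seq2seq demo, whose diff is not shown below. For orientation, a common GNMT-style length penalty for re-ranking beam hypotheses looks like the sketch here; this is a generic illustration, not necessarily the exact formula the commit adds.

```python
def length_penalty(length, alpha=0.6):
    # GNMT-style penalty ((5 + |Y|) / 6) ** alpha; alpha=0.6 is a hypothetical default.
    return ((5.0 + length) / 6.0) ** alpha

def rescore(sum_log_prob, length, alpha=0.6):
    # Beam hypotheses are ranked by length-normalized log-probability,
    # which keeps the search from always preferring short outputs.
    return sum_log_prob / length_penalty(length, alpha)
```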
9 changes: 5 additions & 4 deletions README.en.md
@@ -69,10 +69,11 @@ Don't have GPU? try ERNIE in [AIStudio](https://aistudio.baidu.com/aistudio/inde
(please choose the latest version and apply for a GPU environment)

1. [ERNIE for beginners](https://aistudio.baidu.com/studio/edu/group/quick/join/314947)
- 1. [Sementic Analysis](https://aistudio.baidu.com/aistudio/projectdetail/427482)
- 2. [Cloze Test](https://aistudio.baidu.com/aistudio/projectdetail/433491)
- 3. [Knowledge Distillation](https://aistudio.baidu.com/aistudio/projectdetail/439460)
- 4. [Ask Ernie](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+ 1. [Sementic analysis](https://aistudio.baidu.com/aistudio/projectdetail/427482)
+ 2. [Cloze test](https://aistudio.baidu.com/aistudio/projectdetail/433491)
+ 3. [Knowledge distillation](https://aistudio.baidu.com/aistudio/projectdetail/439460)
+ 4. [Ask ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+ 5. [Loading old-styled checkpoint](https://aistudio.baidu.com/aistudio/projectdetail/493415)

# Setup

3 changes: 2 additions & 1 deletion README.zh.md
@@ -70,6 +70,7 @@ print(pooled.numpy()) # convert results to numpy
2. [Cloze test](https://aistudio.baidu.com/aistudio/projectdetail/433491)
3. [Knowledge distillation](https://aistudio.baidu.com/aistudio/projectdetail/439460)
4. [When in doubt, ask ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+ 5. [Loading and reading old-style checkpoints](https://aistudio.baidu.com/aistudio/projectdetail/493415)

# Setup

@@ -230,7 +231,7 @@ sids = np.expand_dims(sids, 0)
result = client(ids, sids)
```

- You can also download a pre-built `inference_model` of the ernie-1.0 base model from [here]((https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz.).
+ You can also download a pre-built `inference_model` of the ernie-1.0 base model from [here](https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz.).
This model has not been finetuned; it is typically used for feature-based finetuning of a downstream model, or as a text feature extractor.
Because this model was produced by the old API, an extra dimension needs to be appended to the input tensors when making client requests:

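The code block that the colon above introduces is truncated in this view. As a rough sketch only, appending a trailing dimension could look like the snippet below; the assumption that the old-API `inference_model` expects `[batch, seq_len, 1]`-shaped inputs, and the placeholder `ids`/`sids` values, are illustrative, with `client` standing for the client object built earlier in the README.

```python
import numpy as np

ids = np.array([1, 2, 3, 4], dtype='int64')   # placeholder token ids
sids = np.zeros_like(ids)                     # placeholder sentence ids
ids = np.expand_dims(np.expand_dims(ids, 0), -1)    # [seq_len] -> [1, seq_len, 1]
sids = np.expand_dims(np.expand_dims(sids, 0), -1)
# result = client(ids, sids)   # `client` as constructed earlier in the README
```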
1 change: 0 additions & 1 deletion demo/finetune_classifier.py
@@ -43,7 +43,6 @@


log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

def model_fn(features, mode, params, run_config):
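The single deleted line above (and the identical deletions in the demo scripts that follow) removes a duplicate handler: propeller's `log` already owns a handler and still propagates to the root logger, so attaching that same handler to the root made every record print twice. A standalone sketch of the effect, not the repo's code:

```python
import logging
import sys

log = logging.getLogger('propeller')              # stands in for propeller's `log`
log.addHandler(logging.StreamHandler(sys.stderr))
log.setLevel(logging.DEBUG)

logging.getLogger().addHandler(log.handlers[0])   # the line this commit deletes
logging.getLogger().setLevel(logging.DEBUG)

log.debug('hello')  # printed once by the child's handler, then again when the
                    # record propagates to the root logger's (identical) handler
```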
9 changes: 4 additions & 5 deletions demo/finetune_classifier_dygraph.py
@@ -34,7 +34,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)


@@ -104,20 +103,20 @@ def map_fn(seg_a, seg_b, label):
with FD.guard(place):
model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='')

+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
if args.use_lr_decay:
- opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd)
+ opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)
else:
- opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd)
+ opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)

- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
for epoch in range(args.epoch):
for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
loss, _ = model(ids, sids, labels=label)
loss.backward()
if step % 10 == 0:
log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr()))
- opt.minimize(loss, grad_clip=g_clip)
+ opt.minimize(loss)
model.clear_gradients()
if step % 100 == 0:
acc = []
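The same migration repeats in most files below: the Paddle 1.7 dygraph clip (`F.dygraph_grad_clip.GradClipByGlobalNorm`, passed to `minimize`) becomes the Paddle 1.8 API, where `F.clip.GradientClipByGlobalNorm` is handed to the optimizer through `grad_clip`. A minimal standalone sketch of the 1.8 pattern, with a toy `Linear` layer and the stock `AdamOptimizer` standing in for ERNIE and the repo's `AdamW`:

```python
import numpy as np
import paddle.fluid as F
import paddle.fluid.dygraph as FD

with FD.guard():
    model = FD.Linear(4, 2)

    # paddle <= 1.7 (removed by this commit):
    #   g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0)
    #   opt.minimize(loss, grad_clip=g_clip)

    # paddle 1.8: clipping is attached to the optimizer itself.
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = F.optimizer.AdamOptimizer(learning_rate=1e-3,
                                    parameter_list=model.parameters(),
                                    grad_clip=g_clip)

    x = FD.to_variable(np.ones([8, 4], dtype='float32'))
    loss = F.layers.reduce_mean(model(x))
    loss.backward()
    opt.minimize(loss)          # no grad_clip argument here anymore
    model.clear_gradients()
```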
14 changes: 10 additions & 4 deletions demo/finetune_classifier_dygraph_distributed.py
@@ -33,7 +33,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)


@@ -102,15 +101,22 @@ def map_fn(seg_a, seg_b, label):
model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='')
model = FD.parallel.DataParallel(model, ctx)

- opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+ opt = AdamW(learning_rate=LinearDecay(
+     args.lr,
+     int(args.warmup_proportion * args.max_steps),
+     args.max_steps),
+     parameter_list=model.parameters(),
+     weight_decay=args.wd,
+     grad_clip=g_clip)

for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
loss, _ = model(ids, sids, labels=label)
scaled_loss = model.scale_loss(loss)
scaled_loss.backward()
model.apply_collective_grads()
- opt.minimize(scaled_loss, grad_clip=g_clip)
+ opt.minimize(scaled_loss)
model.clear_gradients()
if step % 10 == 0:
log.debug('train loss %.5f, lr %.e3' % (loss.numpy(), opt.current_step_lr()))
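For context, the loop above follows the usual Paddle 1.8 dygraph data-parallel recipe: wrap the model in `DataParallel`, scale the loss, all-reduce the gradients, then step the optimizer. A self-contained sketch with a toy `Linear` layer in place of ERNIE (intended to be launched with `python -m paddle.distributed.launch`; the `ParallelEnv` device lookup is an assumption about how `place` is built):

```python
import numpy as np
import paddle.fluid as F
import paddle.fluid.dygraph as FD

place = F.CUDAPlace(FD.ParallelEnv().dev_id)    # one GPU per launched trainer
with FD.guard(place):
    strategy = FD.prepare_context()
    model = FD.DataParallel(FD.Linear(4, 2), strategy)
    opt = F.optimizer.AdamOptimizer(learning_rate=1e-3,
                                    parameter_list=model.parameters())
    for _ in range(10):
        x = FD.to_variable(np.random.rand(8, 4).astype('float32'))
        loss = F.layers.reduce_mean(model(x))
        scaled_loss = model.scale_loss(loss)    # divide by the trainer count
        scaled_loss.backward()
        model.apply_collective_grads()          # all-reduce gradients across trainers
        opt.minimize(scaled_loss)
        model.clear_gradients()
```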
3 changes: 1 addition & 2 deletions demo/finetune_mrc_dygraph.py
@@ -48,7 +48,6 @@
from demo.mrc import mrc_metrics

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)


@@ -84,7 +83,7 @@ def train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokeniz

max_steps = len(train_features) * args.epoch // args.bsz
opt = AdamW(learning_rate=args.lr, parameter_list=model.parameters(), weight_decay=args.wd)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental

train_dataset = train_dataset \
.repeat() \
8 changes: 4 additions & 4 deletions demo/finetune_ner_dygraph.py
@@ -39,7 +39,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification
@@ -127,13 +126,14 @@ def before(seg, label):
test_ds.data_shapes = shapes
test_ds.data_types = types

- with FD.guard():
+ place = F.CUDAPlace(0)
+ with FD.guard(place):
model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=7, name='')

opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters(), weight_decay=0.01)
#opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
for epoch in range(args.epoch):
- for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start())):
+ for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))):
loss, _ = model(ids, sids, labels=aligned_label)
loss.backward()
if step % 10 == 0 :
@@ -144,7 +144,7 @@ def before(seg, label):
all_pred, all_label = [], []
with FD.base._switch_tracer_mode_guard_(is_train=False):
model.eval()
- for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dev_ds.start())):
+ for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dev_ds.start(place))):
loss, logits = model(ids, sids, labels=aligned_label)
#print('\n'.join(map(str, logits.numpy().tolist())))

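The remaining NER changes are about device placement: the dygraph guard and the propeller dataset iterators now share an explicit `place` instead of relying on defaults. A small sketch of the pattern (the CPU fallback is added here for illustration; `train_ds` stands for the dataset built earlier in the script):

```python
import paddle.fluid as F
import paddle.fluid.dygraph as FD

place = F.CUDAPlace(0) if F.is_compiled_with_cuda() else F.CPUPlace()
with FD.guard(place):
    # Passing the same `place` to Dataset.start() keeps the propeller input
    # pipeline and the dygraph computation on the same device.
    for ids, sids, aligned_label, label, orig_pos in train_ds.start(place):
        pass  # forward/backward as in the demo above
```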
3 changes: 1 addition & 2 deletions demo/finetune_sentiment_analysis_dygraph.py
@@ -34,7 +34,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)
log = logging.getLogger()

@@ -101,7 +100,7 @@ def map_fn(seg_a, label):
int(args.warmup_proportion * args.max_steps), args.max_steps),
parameter_list=model.parameters(),
weight_decay=args.wd)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
for epoch in range(args.epoch):
for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
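`LinearDecay` above comes from the demo's optimization helpers (the import is outside this hunk) and pairs warmup with linear decay. A plain-Python sketch of the schedule it is assumed to implement, for orientation only:

```python
def linear_warmup_then_decay(base_lr, warmup_steps, max_steps, step):
    # Assumed behaviour: ramp the LR up linearly over `warmup_steps`,
    # then decay it linearly towards zero at `max_steps`.
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)
    return base_lr * max(0.0, max_steps - step) / max(1, max_steps - warmup_steps)
```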
1 change: 0 additions & 1 deletion demo/pretrain/pretrain.py
@@ -48,7 +48,6 @@
from propeller import log

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

if six.PY3:
1 change: 0 additions & 1 deletion demo/pretrain/pretrain_dygraph.py
@@ -49,7 +49,6 @@
from propeller import log

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

if six.PY3:
4 changes: 2 additions & 2 deletions distill/distill.py
@@ -100,7 +100,7 @@ def evaluate_teacher(model, dataset):
teacher_model.train()
if not os.path.exists('./teacher_model.pdparams'):
opt = AdamW(learning_rate=LinearDecay(LR, 9600*EPOCH*0.1/BATCH, 9600*EPOCH/BATCH), parameter_list=teacher_model.parameters(), weight_decay=0.01)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0)
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0)
for epoch in range(EPOCH):
for step, (ids_student, ids, sids, labels) in enumerate(train_ds.start(place)):
loss, logits = teacher_model(ids, labels=labels)
@@ -200,7 +200,7 @@ def KL(pred, target):
teacher_model.eval()
model = BOW()
opt = AdamW(learning_rate=LR, parameter_list=model.parameters(), weight_decay=0.01)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
model.train()
for epoch in range(EPOCH):
for step, (ids_student, ids, sids, _ ) in enumerate(train_ds.start(place)):
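The second distill hunk sits next to the `KL(pred, target)` helper named in its header. As a generic orientation only (not the repo's exact implementation), a soft-label KL distillation term in fluid could look like:

```python
import paddle.fluid.layers as L

def kl_soft_label(student_logits, teacher_logits, temperature=1.0):
    # Generic KL(teacher || student) over class probabilities; the repo's
    # KL(pred, target) may differ in temperature, direction, or reduction.
    t = L.softmax(teacher_logits / temperature)
    s = L.softmax(student_logits / temperature)
    kl = L.reduce_sum(t * (L.log(t + 1e-8) - L.log(s + 1e-8)), dim=1)
    return L.reduce_mean(kl)
```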
