Dygraph fix4 (PaddlePaddle#464)
* upgrade version no

* seq2seq + length penalty

* upgrade to paddle 1.8

* fix readme

* update seq2seq to 1.8

* fix seq2seq beam-search

* + aistudio tutorial: loading old-styled checkpoint

* add missing seq2seq eval file for cnndm

* fix seq2seq decode post process

* + handlers
Meiyim committed May 27, 2020
1 parent 655cf25 commit 241c028
Showing 15 changed files with 112 additions and 57 deletions.
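The "seq2seq + length penalty" and "fix seq2seq beam-search" items above touch the seq2seq demo, whose diff is not shown below. For orientation, a common GNMT-style length penalty for re-ranking beam hypotheses looks like the sketch here; this is a generic illustration, not necessarily the exact formula the commit adds.

```python
def length_penalty(length, alpha=0.6):
    # GNMT-style penalty ((5 + |Y|) / 6) ** alpha; alpha=0.6 is a hypothetical default.
    return ((5.0 + length) / 6.0) ** alpha

def rescore(sum_log_prob, length, alpha=0.6):
    # Beam hypotheses are ranked by length-normalized log-probability,
    # which keeps the search from always preferring short outputs.
    return sum_log_prob / length_penalty(length, alpha)
```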
9 changes: 5 additions & 4 deletions README.en.md
@@ -69,10 +69,11 @@ Don't have GPU? try ERNIE in [AIStudio](https://aistudio.baidu.com/aistudio/inde
(please choose the latest version and apply for a GPU environment)

1. [ERNIE for beginners](https://aistudio.baidu.com/studio/edu/group/quick/join/314947)
- 1. [Sementic Analysis](https://aistudio.baidu.com/aistudio/projectdetail/427482)
- 2. [Cloze Test](https://aistudio.baidu.com/aistudio/projectdetail/433491)
- 3. [Knowledge Distillation](https://aistudio.baidu.com/aistudio/projectdetail/439460)
- 4. [Ask Ernie](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+ 1. [Sementic analysis](https://aistudio.baidu.com/aistudio/projectdetail/427482)
+ 2. [Cloze test](https://aistudio.baidu.com/aistudio/projectdetail/433491)
+ 3. [Knowledge distillation](https://aistudio.baidu.com/aistudio/projectdetail/439460)
+ 4. [Ask ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+ 5. [Loading old-styled checkpoint](https://aistudio.baidu.com/aistudio/projectdetail/493415)

# Setup

3 changes: 2 additions & 1 deletion README.zh.md
@@ -70,6 +70,7 @@ print(pooled.numpy()) # convert results to numpy
2. [Cloze test](https://aistudio.baidu.com/aistudio/projectdetail/433491)
3. [Knowledge distillation](https://aistudio.baidu.com/aistudio/projectdetail/439460)
4. [When in doubt, ask ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+ 5. [Loading and reading old-style checkpoints](https://aistudio.baidu.com/aistudio/projectdetail/493415)

# Setup

@@ -230,7 +231,7 @@ sids = np.expand_dims(sids, 0)
result = client(ids, sids)
```

- You can also download a pre-built `inference_model` of the ernie-1.0 base model from [here]((https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz.).
+ You can also download a pre-built `inference_model` of the ernie-1.0 base model from [here](https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz.).
This model has not been finetuned; it is typically used for feature-based finetuning of a downstream model, or as a text feature extractor.
Because this model was produced by the old API, an extra dimension needs to be appended to the input tensors when making client requests:

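The code block that the colon above introduces is truncated in this view. As a rough sketch only, appending a trailing dimension could look like the snippet below; the assumption that the old-API `inference_model` expects `[batch, seq_len, 1]`-shaped inputs, and the placeholder `ids`/`sids` values, are illustrative, with `client` standing for the client object built earlier in the README.

```python
import numpy as np

ids = np.array([1, 2, 3, 4], dtype='int64')   # placeholder token ids
sids = np.zeros_like(ids)                     # placeholder sentence ids
ids = np.expand_dims(np.expand_dims(ids, 0), -1)    # [seq_len] -> [1, seq_len, 1]
sids = np.expand_dims(np.expand_dims(sids, 0), -1)
# result = client(ids, sids)   # `client` as constructed earlier in the README
```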
1 change: 0 additions & 1 deletion demo/finetune_classifier.py
@@ -43,7 +43,6 @@


log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

def model_fn(features, mode, params, run_config):
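The single deleted line above (and the identical deletions in the demo scripts that follow) removes a duplicate handler: propeller's `log` already owns a handler and still propagates to the root logger, so attaching that same handler to the root made every record print twice. A standalone sketch of the effect, not the repo's code:

```python
import logging
import sys

log = logging.getLogger('propeller')              # stands in for propeller's `log`
log.addHandler(logging.StreamHandler(sys.stderr))
log.setLevel(logging.DEBUG)

logging.getLogger().addHandler(log.handlers[0])   # the line this commit deletes
logging.getLogger().setLevel(logging.DEBUG)

log.debug('hello')  # printed once by the child's handler, then again when the
                    # record propagates to the root logger's (identical) handler
```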
9 changes: 4 additions & 5 deletions demo/finetune_classifier_dygraph.py
@@ -34,7 +34,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)


@@ -104,20 +103,20 @@ def map_fn(seg_a, seg_b, label):
with FD.guard(place):
model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='')

+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
if args.use_lr_decay:
- opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd)
+ opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)
else:
- opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd)
+ opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)

- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
for epoch in range(args.epoch):
for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
loss, _ = model(ids, sids, labels=label)
loss.backward()
if step % 10 == 0:
log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr()))
- opt.minimize(loss, grad_clip=g_clip)
+ opt.minimize(loss)
model.clear_gradients()
if step % 100 == 0:
acc = []
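The same migration repeats in most files below: the Paddle 1.7 dygraph clip (`F.dygraph_grad_clip.GradClipByGlobalNorm`, passed to `minimize`) becomes the Paddle 1.8 API, where `F.clip.GradientClipByGlobalNorm` is handed to the optimizer through `grad_clip`. A minimal standalone sketch of the 1.8 pattern, with a toy `Linear` layer and the stock `AdamOptimizer` standing in for ERNIE and the repo's `AdamW`:

```python
import numpy as np
import paddle.fluid as F
import paddle.fluid.dygraph as FD

with FD.guard():
    model = FD.Linear(4, 2)

    # paddle <= 1.7 (removed by this commit):
    #   g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0)
    #   opt.minimize(loss, grad_clip=g_clip)

    # paddle 1.8: clipping is attached to the optimizer itself.
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = F.optimizer.AdamOptimizer(learning_rate=1e-3,
                                    parameter_list=model.parameters(),
                                    grad_clip=g_clip)

    x = FD.to_variable(np.ones([8, 4], dtype='float32'))
    loss = F.layers.reduce_mean(model(x))
    loss.backward()
    opt.minimize(loss)          # no grad_clip argument here anymore
    model.clear_gradients()
```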
14 changes: 10 additions & 4 deletions demo/finetune_classifier_dygraph_distributed.py
@@ -33,7 +33,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)


@@ -102,15 +101,22 @@ def map_fn(seg_a, seg_b, label):
model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='')
model = FD.parallel.DataParallel(model, ctx)

- opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+ opt = AdamW(learning_rate=LinearDecay(
+     args.lr,
+     int(args.warmup_proportion * args.max_steps),
+     args.max_steps),
+     parameter_list=model.parameters(),
+     weight_decay=args.wd,
+     grad_clip=g_clip)

for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
loss, _ = model(ids, sids, labels=label)
scaled_loss = model.scale_loss(loss)
scaled_loss.backward()
model.apply_collective_grads()
- opt.minimize(scaled_loss, grad_clip=g_clip)
+ opt.minimize(scaled_loss)
model.clear_gradients()
if step % 10 == 0:
log.debug('train loss %.5f, lr %.e3' % (loss.numpy(), opt.current_step_lr()))
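For context, the loop above follows the usual Paddle 1.8 dygraph data-parallel recipe: wrap the model in `DataParallel`, scale the loss, all-reduce the gradients, then step the optimizer. A self-contained sketch with a toy `Linear` layer in place of ERNIE (intended to be launched with `python -m paddle.distributed.launch`; the `ParallelEnv` device lookup is an assumption about how `place` is built):

```python
import numpy as np
import paddle.fluid as F
import paddle.fluid.dygraph as FD

place = F.CUDAPlace(FD.ParallelEnv().dev_id)    # one GPU per launched trainer
with FD.guard(place):
    strategy = FD.prepare_context()
    model = FD.DataParallel(FD.Linear(4, 2), strategy)
    opt = F.optimizer.AdamOptimizer(learning_rate=1e-3,
                                    parameter_list=model.parameters())
    for _ in range(10):
        x = FD.to_variable(np.random.rand(8, 4).astype('float32'))
        loss = F.layers.reduce_mean(model(x))
        scaled_loss = model.scale_loss(loss)    # divide by the trainer count
        scaled_loss.backward()
        model.apply_collective_grads()          # all-reduce gradients across trainers
        opt.minimize(scaled_loss)
        model.clear_gradients()
```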
3 changes: 1 addition & 2 deletions demo/finetune_mrc_dygraph.py
@@ -48,7 +48,6 @@
from demo.mrc import mrc_metrics

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)


@@ -84,7 +83,7 @@ def train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokeniz

max_steps = len(train_features) * args.epoch // args.bsz
opt = AdamW(learning_rate=args.lr, parameter_list=model.parameters(), weight_decay=args.wd)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental

train_dataset = train_dataset \
.repeat() \
8 changes: 4 additions & 4 deletions demo/finetune_ner_dygraph.py
@@ -39,7 +39,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification
@@ -127,13 +126,14 @@ def before(seg, label):
test_ds.data_shapes = shapes
test_ds.data_types = types

- with FD.guard():
+ place = F.CUDAPlace(0)
+ with FD.guard(place):
model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=7, name='')

opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters(), weight_decay=0.01)
#opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
for epoch in range(args.epoch):
- for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start())):
+ for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))):
loss, _ = model(ids, sids, labels=aligned_label)
loss.backward()
if step % 10 == 0 :
@@ -144,7 +144,7 @@ def before(seg, label):
all_pred, all_label = [], []
with FD.base._switch_tracer_mode_guard_(is_train=False):
model.eval()
- for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dev_ds.start())):
+ for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dev_ds.start(place))):
loss, logits = model(ids, sids, labels=aligned_label)
#print('\n'.join(map(str, logits.numpy().tolist())))

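The remaining NER changes are about device placement: the dygraph guard and the propeller dataset iterators now share an explicit `place` instead of relying on defaults. A small sketch of the pattern (the CPU fallback is added here for illustration; `train_ds` stands for the dataset built earlier in the script):

```python
import paddle.fluid as F
import paddle.fluid.dygraph as FD

place = F.CUDAPlace(0) if F.is_compiled_with_cuda() else F.CPUPlace()
with FD.guard(place):
    # Passing the same `place` to Dataset.start() keeps the propeller input
    # pipeline and the dygraph computation on the same device.
    for ids, sids, aligned_label, label, orig_pos in train_ds.start(place):
        pass  # forward/backward as in the demo above
```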
3 changes: 1 addition & 2 deletions demo/finetune_sentiment_analysis_dygraph.py
@@ -34,7 +34,6 @@
import propeller.paddle as propeller

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)
log = logging.getLogger()

@@ -101,7 +100,7 @@ def map_fn(seg_a, label):
int(args.warmup_proportion * args.max_steps), args.max_steps),
parameter_list=model.parameters(),
weight_decay=args.wd)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
for epoch in range(args.epoch):
for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
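`LinearDecay` above comes from the demo's optimization helpers (the import is outside this hunk) and pairs warmup with linear decay. A plain-Python sketch of the schedule it is assumed to implement, for orientation only:

```python
def linear_warmup_then_decay(base_lr, warmup_steps, max_steps, step):
    # Assumed behaviour: ramp the LR up linearly over `warmup_steps`,
    # then decay it linearly towards zero at `max_steps`.
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)
    return base_lr * max(0.0, max_steps - step) / max(1, max_steps - warmup_steps)
```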
1 change: 0 additions & 1 deletion demo/pretrain/pretrain.py
@@ -48,7 +48,6 @@
from propeller import log

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

if six.PY3:
1 change: 0 additions & 1 deletion demo/pretrain/pretrain_dygraph.py
@@ -49,7 +49,6 @@
from propeller import log

log.setLevel(logging.DEBUG)
- logging.getLogger().addHandler(log.handlers[0])
logging.getLogger().setLevel(logging.DEBUG)

if six.PY3:
4 changes: 2 additions & 2 deletions distill/distill.py
@@ -100,7 +100,7 @@ def evaluate_teacher(model, dataset):
teacher_model.train()
if not os.path.exists('./teacher_model.pdparams'):
opt = AdamW(learning_rate=LinearDecay(LR, 9600*EPOCH*0.1/BATCH, 9600*EPOCH/BATCH), parameter_list=teacher_model.parameters(), weight_decay=0.01)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0)
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0)
for epoch in range(EPOCH):
for step, (ids_student, ids, sids, labels) in enumerate(train_ds.start(place)):
loss, logits = teacher_model(ids, labels=labels)
@@ -200,7 +200,7 @@ def KL(pred, target):
teacher_model.eval()
model = BOW()
opt = AdamW(learning_rate=LR, parameter_list=model.parameters(), weight_decay=0.01)
- g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0) #experimental
+ g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
model.train()
for epoch in range(EPOCH):
for step, (ids_student, ids, sids, _ ) in enumerate(train_ds.start(place)):
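The second distill hunk sits next to the `KL(pred, target)` helper named in its header. As a generic orientation only (not the repo's exact implementation), a soft-label KL distillation term in fluid could look like:

```python
import paddle.fluid.layers as L

def kl_soft_label(student_logits, teacher_logits, temperature=1.0):
    # Generic KL(teacher || student) over class probabilities; the repo's
    # KL(pred, target) may differ in temperature, direction, or reduction.
    t = L.softmax(teacher_logits / temperature)
    s = L.softmax(student_logits / temperature)
    kl = L.reduce_sum(t * (L.log(t + 1e-8) - L.log(s + 1e-8)), dim=1)
    return L.reduce_mean(kl)
```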
