update NER results
LiyuanLucasLiu committed Jan 31, 2018
1 parent 0f2b5dd commit f0fed57
Showing 5 changed files with 29 additions and 47 deletions.
27 changes: 19 additions & 8 deletions README.md
@@ -101,12 +101,12 @@ The default running commands for NER and POS tagging, and NP Chunking are:

- Named Entity Recognition (NER):
```
python train_wc.py --train_file ./data/ner/train.txt --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt --checkpoint ./checkpoint/ner_ --caseless --fine_tune --high_way --co_train
python train_wc.py --train_file ./data/ner/train.txt --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt --checkpoint ./checkpoint/ner_ --caseless --fine_tune --high_way --co_train --least_iters 100
```

- Part-of-Speech (POS) Tagging:
```
python train_wc.py --train_file ./data/pos/train.txt --dev_file ./data/pos/testa.txt --test_file ./data/pos/testb.txt --eva_matrix a --checkpoint ./checkpoint/pos_ --lr 0.015 --caseless --fine_tune --high_way --co_train
python train_wc.py --train_file ./data/pos/train.txt --dev_file ./data/pos/testa.txt --test_file ./data/pos/testb.txt --eva_matrix a --checkpoint ./checkpoint/pos_ --caseless --fine_tune --high_way --co_train
```

- Noun Phrase (NP) Chunking:
@@ -120,7 +120,17 @@ For other datasets or tasks, you may wanna try different stopping parameters, es

Here we compare LM-LSTM-CRF with recent state-of-the-art models on the CoNLL 2000 Chunking dataset, the CoNLL 2003 NER dataset, and the WSJ portion of the PTB POS Tagging dataset. All experiments are conducted on a GTX 1080 GPU.

A serious bug was found on the ```bioes_to_span``` function, we are doing experiments and would update the results on NER & Chunking later.
A serious bug was found in the ```bioes_to_span``` function; we are redoing the experiments and will update the NER & Chunking results later.

### NER

When models are trained only on the CoNLL 2003 English NER training set, the results are summarized below.

|Model | Max(F1) | Mean(F1) | Std(F1) | Time(h) |
| ------------- |-------------| -----| -----| ---- |
| LM-LSTM-CRF | **91.35** | **91.24** | 0.12 | 4 |
| -- HighWay | 90.87 | 90.79 | 0.07 | 4 |
| -- Co-Train | 91.23 | 90.95 | 0.34 | 2 |
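
The two ablation rows presumably correspond to turning off the matching switches in the NER command above (```--high_way``` for the highway layers, ```--co_train``` for language-model co-training). As an illustration only, a run without the highway layers might look like the following; the checkpoint prefix ```./checkpoint/ner_nohw_``` is a hypothetical name, not one used by the repository:

```
python train_wc.py --train_file ./data/ner/train.txt --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt --checkpoint ./checkpoint/ner_nohw_ --caseless --fine_tune --co_train --least_iters 100
```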

### POS

@@ -135,18 +145,19 @@ When models are only trained on the WSJ portion of the PTB POS Tagging dataset,
## Pretrained Model

### Evaluation
We released pre-trained model on these three tasks. The checkpoint file can be downloaded at:

| WSJ-PTB POS Tagging |
| ------------------- |
| [Args](https://drive.google.com/a/illinois.edu/file/d/0B587SdKqutQmN1UwNjhHQkhUWEk/view?usp=sharing) |
| [Model](https://drive.google.com/a/illinois.edu/file/d/0B587SdKqutQmSDlJRGRNandhMGs/view?usp=sharing) |
We release pre-trained models for these three tasks. The checkpoint files can be downloaded via the links below. Note that the NER model and the Chunking model (coming soon) are trained on both the training set and the development set:

| WSJ-PTB POS Tagging | CoNLL03 NER |
| ------------------- | ----------- |
| [Args](https://drive.google.com/a/illinois.edu/file/d/0B587SdKqutQmN1UwNjhHQkhUWEk/view?usp=sharing) | [Args](https://drive.google.com/file/d/1tGAQ0hu9AsIBdrqFn5fmDQ72Pk1I-o74/view?usp=sharing) |
| [Model](https://drive.google.com/a/illinois.edu/file/d/0B587SdKqutQmSDlJRGRNandhMGs/view?usp=sharing) | [Model](https://drive.google.com/file/d/1o9kjZV5EcHAhys3GPgl7EPGE5fuXyYjr/view?usp=sharing) |

Also, ```eval_wc.py``` is provided to load and run these checkpoints. Its usage can be viewed by running ```python eval_wc.py -h```, and an example command is provided below:
```
python eval_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt
```
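
The Args file passed to ```--load_arg``` appears to be a plain JSON dump of the training configuration (the ```jd['caseless']``` and ```jd['checkpoint']``` lookups in the evaluation scripts below suggest as much). A minimal sketch for inspecting it, assuming the file is a flat JSON object and reusing the path from the command above:

```python
import json

# Path taken from the eval_wc.py example above; adjust to wherever the Args file was downloaded.
with open('checkpoint/ner/ner_4_cwlm_lstm_crf.json', 'r') as f:
    jd = json.load(f)

# List the stored settings (e.g. 'caseless', 'checkpoint', data paths) before evaluation.
for key in sorted(jd):
    print(key, '->', jd[key])
```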

### Prediction

To annotate raw text, ```seq_wc.py``` is provided. Its usage can be viewed by running ```python seq_wc.py -h```, and an example command is provided below:
20 changes: 3 additions & 17 deletions eval_w.py
@@ -26,7 +26,6 @@
parser.add_argument('--load_check_point', default='./checkpoint/soa/check_wc_p_char_lstm_crf.model', help='checkpoint path')
parser.add_argument('--gpu',type=int, default=0, help='gpu id')
parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone')
parser.add_argument('--dev_file', default='', help='path to development file, if set to none, would use dev_file path in the checkpoint file')
parser.add_argument('--test_file', default='', help='path to test file, if set to none, would use test_file path in the checkpoint file')
args = parser.parse_args()

@@ -42,13 +41,7 @@


# load corpus
if args.dev_file:
    with codecs.open(args.dev_file, 'r', 'utf-8') as f:
        dev_lines = f.readlines()
else:
    with codecs.open(jd['dev_file'], 'r', 'utf-8') as f:
        dev_lines = f.readlines()


if args.test_file:
    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()
@@ -58,14 +51,11 @@

# converting format

dev_features, dev_labels = utils.read_corpus(dev_lines)
test_features, test_labels = utils.read_corpus(test_lines)

# construct dataset
dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels, f_map, l_map, jd['caseless'])
test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, jd['caseless'])

dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

# build model
@@ -86,16 +76,12 @@

if 'f' in args.eva_matrix:

    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)

    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(ner_model, test_dataset_loader)

    print(jd['checkpoint'] + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (dev_f1, dev_rec, dev_pre, dev_acc, test_f1, test_rec, test_pre, test_acc))
    print(jd['checkpoint'] + ' test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (test_f1, test_rec, test_pre, test_acc))

else:

    dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)

    test_acc = evaluator.calc_score(ner_model, test_dataset_loader)

    print(jd['checkpoint'] + ' dev_acc: %.4f test_acc: %.4f\n' % (dev_acc, test_acc))
    print(jd['checkpoint'] + ' test_acc: %.4f\n' % (test_acc))
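
Taken together, these hunks strip every dev-set code path out of ```eval_w.py```. As a rough consolidation of what remains — written as a sketch that only uses the calls visible in this diff and takes the objects set up by the collapsed parts of the script as parameters (```jd['test_file']``` is assumed to exist, based on the ```--test_file``` help text) — the test-only flow is roughly:

```python
import codecs

import torch.utils.data


def evaluate_test_only(args, jd, f_map, l_map, utils, evaluator, ner_model):
    """Sketch of eval_w.py after this commit: only the test set is loaded and scored."""
    # Fall back to the test path stored in the checkpoint args when --test_file is empty.
    test_path = args.test_file if args.test_file else jd['test_file']
    with codecs.open(test_path, 'r', 'utf-8') as f:
        test_lines = f.readlines()

    # Convert to features/labels and bucket into padded batches, as in the hunks above.
    test_features, test_labels = utils.read_corpus(test_lines)
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, jd['caseless'])
    loaders = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    # Score the model on the test set only; the dev-set branch is gone.
    if 'f' in args.eva_matrix:
        f1, pre, rec, acc = evaluator.calc_score(ner_model, loaders)
        print(jd['checkpoint'] + ' test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f' % (f1, rec, pre, acc))
    else:
        acc = evaluator.calc_score(ner_model, loaders)
        print(jd['checkpoint'] + ' test_acc: %.4f' % acc)
```
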
19 changes: 2 additions & 17 deletions eval_wc.py
@@ -26,7 +26,6 @@
parser.add_argument('--load_check_point', default='./checkpoint/soa/check_wc_p_char_lstm_crf.model', help='path to model checkpoint file')
parser.add_argument('--gpu',type=int, default=0, help='gpu id')
parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or f1 alone')
parser.add_argument('--dev_file', default='', help='path to development file, if set to none, would use dev_file path in the checkpoint file')
parser.add_argument('--test_file', default='', help='path to test file, if set to none, would use test_file path in the checkpoint file')
args = parser.parse_args()

@@ -44,13 +43,6 @@


# load corpus
if args.dev_file:
    with codecs.open(args.dev_file, 'r', 'utf-8') as f:
        dev_lines = f.readlines()
else:
    with codecs.open(jd['dev_file'], 'r', 'utf-8') as f:
        dev_lines = f.readlines()

if args.test_file:
    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()
@@ -60,14 +52,11 @@

# converting format

dev_features, dev_labels = utils.read_corpus(dev_lines)
test_features, test_labels = utils.read_corpus(test_lines)

# construct dataset
dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, l_map, c_map, f_map, jd['caseless'])
test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, jd['caseless'])

dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

# build model
@@ -89,17 +78,13 @@
print('start')
if 'f' in args.eva_matrix:

    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)

    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(ner_model, test_dataset_loader)

    print(jd['checkpoint'] + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (dev_f1, dev_rec, dev_pre, dev_acc, test_f1, test_rec, test_pre, test_acc))
    print(jd['checkpoint'] + ' test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (test_f1, test_rec, test_pre, test_acc))

else:

    dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)

    test_acc = evaluator.calc_score(ner_model, test_dataset_loader)

    print(jd['checkpoint'] + ' dev_acc: %.4f test_acc: %.4f\n' % (dev_acc, test_acc))
    print(jd['checkpoint'] + ' test_acc: %.4f\n' % (test_acc))
print('end')
6 changes: 3 additions & 3 deletions train_w.py
@@ -31,14 +31,14 @@
parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding')
parser.add_argument('--checkpoint', default='./checkpoint/', help='path to checkpoint prefix')
parser.add_argument('--hidden', type=int, default=100, help='hidden dimension')
parser.add_argument('--drop_out', type=float, default=0.5, help='dropout ratio')
parser.add_argument('--drop_out', type=float, default=0.55, help='dropout ratio')
parser.add_argument('--epoch', type=int, default=200, help='maximum epoch number')
parser.add_argument('--start_epoch', type=int, default=0, help='start epoch idx')
parser.add_argument('--caseless', action='store_true', help='caseless or not')
parser.add_argument('--embedding_dim', type=int, default=100, help='dimension for word embedding')
parser.add_argument('--layers', type=int, default=1, help='number of lstm layers')
parser.add_argument('--lr', type=float, default=0.01, help='initial learning rate')
parser.add_argument('--lr_decay', type=float, default=0.001, help='decay ratio of learning rate')
parser.add_argument('--lr', type=float, default=0.015, help='initial learning rate')
parser.add_argument('--lr_decay', type=float, default=0.05, help='decay ratio of learning rate')
parser.add_argument('--fine_tune', action='store_false', help='fine tune pre-trained embedding dictionary')
parser.add_argument('--load_check_point', default='', help='path of checkpoint')
parser.add_argument('--load_opt', action='store_true', help='load optimizer from ')
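
This commit also bumps the word-model defaults: dropout 0.5 to 0.55, initial learning rate 0.01 to 0.015, and ```lr_decay``` 0.001 to 0.05. The decay rule itself is not shown in this diff; assuming the common inverse-time schedule lr_t = lr / (1 + lr_decay * t), a quick sketch of what the new defaults imply per epoch:

```python
def decayed_lr(base_lr: float, decay: float, epoch: int) -> float:
    """Inverse-time decay, lr_t = base_lr / (1 + decay * epoch).

    The schedule is an assumption for illustration; the repository's actual
    update rule lives in code that this diff does not show.
    """
    return base_lr / (1.0 + decay * epoch)


if __name__ == '__main__':
    # New train_w.py defaults from this commit: lr = 0.015, lr_decay = 0.05.
    for epoch in (0, 10, 50, 100, 199):
        print('epoch %3d -> lr %.6f' % (epoch, decayed_lr(0.015, 0.05, epoch)))
```
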
4 changes: 2 additions & 2 deletions train_wc.py
@@ -34,7 +34,7 @@ def eprint(*args, **kwargs):
parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding')
parser.add_argument('--char_hidden', type=int, default=300, help='dimension of char-level layers')
parser.add_argument('--word_hidden', type=int, default=300, help='dimension of word-level layers')
parser.add_argument('--drop_out', type=float, default=0.5, help='dropout ratio')
parser.add_argument('--drop_out', type=float, default=0.55, help='dropout ratio')
parser.add_argument('--epoch', type=int, default=200, help='maximum epoch number')
parser.add_argument('--start_epoch', type=int, default=0, help='start point of epoch')
parser.add_argument('--checkpoint', default='./checkpoint/', help='checkpoint path')
@@ -43,7 +43,7 @@ def eprint(*args, **kwargs):
parser.add_argument('--word_dim', type=int, default=100, help='dimension of word embedding')
parser.add_argument('--char_layers', type=int, default=1, help='number of char level layers')
parser.add_argument('--word_layers', type=int, default=1, help='number of word level layers')
parser.add_argument('--lr', type=float, default=0.01, help='initial learning rate')
parser.add_argument('--lr', type=float, default=0.015, help='initial learning rate')
parser.add_argument('--lr_decay', type=float, default=0.05, help='decay ratio of learning rate')
parser.add_argument('--fine_tune', action='store_false', help='fine tune the diction of word embedding or not')
parser.add_argument('--load_check_point', default='', help='path previous checkpoint that want to be loaded')
