Skip to content

Commit

Permalink
add multi-doc support and update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
LiyuanLucasLiu committed Sep 20, 2017
1 parent c695bfb commit 7e65fd3
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 26 deletions.
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
[![Documentation Status](https://readthedocs.org/projects/lm-lstm-crf/badge/?version=latest)](http://lm-lstm-crf.readthedocs.io/en/latest/?badge=latest)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

This project provides high-performance character-aware sequence labeling tools and tutorials. Model details can be accessed [here](http://arxiv.org/abs/1709.04109), and the implementation is based on the PyTorch library.
This project provides high-performance character-aware sequence labeling tools, including [Training](#usage), [Evaluation](#evaluation) and [Prediction](#prediction).

LM-LSTM-CRF achieves F1 score of 91.71+/-0.10 on the CoNLL 2003 NER dataset, without using any additional corpus or resource.
Details about LM-LSTM-CRF can be accessed [here](http://arxiv.org/abs/1709.04109), and the implementation is based on the PyTorch library. Our model achieves F1 score of 91.71+/-0.10 on the CoNLL 2003 NER dataset, without using any additional corpus or resource.

The documentation is available [here](http://lm-lstm-crf.readthedocs.io/en/latest/).

Expand Down Expand Up @@ -202,6 +202,14 @@ to
newcomers
Uzbekistan
.
```
and the corresponding output is:

```
-DOCSTART- -DOCSTART- -DOCSTART-
But <LOC> China </LOC> saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers <LOC> Uzbekistan </LOC> .
```

## Reference
Expand Down
26 changes: 15 additions & 11 deletions model/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def decode_s(self, feature, label):

return chunks

def output_batch(self, ner_model, features, fout):
def output_batch(self, ner_model, documents, fout):
"""
decode the whole corpus in the specific format by calling apply_model to fit specific models
Expand All @@ -123,18 +123,22 @@ def output_batch(self, ner_model, features, fout):
feature (list): list of words list
fout: output file
"""
f_len = len(features)
d_len = len(documents)

for ind in tqdm( range(0, f_len, self.batch_size), mininterval=1,
for d_ind in tqdm( range(0, d_len), mininterval=1,
desc=' - Process', leave=False, file=sys.stdout):
eind = min(f_len, ind + self.batch_size)
labels = self.apply_model(ner_model, features[ind: eind])
labels = torch.unbind(labels, 1)

for ind2 in range(ind, eind):
f = features[ind2]
l = labels[ind2 - ind][0: len(f) ]
fout.write(self.decode_str(features[ind2], l) + '\n\n')
fout.write('-DOCSTART- -DOCSTART- -DOCSTART-\n\n')
features = documents[d_ind]
f_len = len(features)
for ind in range(0, f_len, self.batch_size):
eind = min(f_len, ind + self.batch_size)
labels = self.apply_model(ner_model, features[ind: eind])
labels = torch.unbind(labels, 1)

for ind2 in range(ind, eind):
f = features[ind2]
l = labels[ind2 - ind][0: len(f) ]
fout.write(self.decode_str(features[ind2], l) + '\n\n')

def apply_model(self, ner_model, features):
"""
Expand Down
48 changes: 35 additions & 13 deletions model/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,23 +239,45 @@ def read_corpus(lines):

return features, labels

def read_features(lines, multi_docs=True):
    """
    Convert an un-annotated corpus into features.

    args:
        lines (list of str): raw corpus lines; sentences are separated by
            blank lines and documents by '-DOCSTART-' lines
        multi_docs (bool): if True, group sentences per document and return
            a list of documents; if False, return a flat list of sentences

    return:
        multi_docs=True  -> list of documents, each a list of sentences
        multi_docs=False -> list of sentences
        (a sentence is a list of rstripped word lines)
    """
    documents = list()
    features = list()
    tmp_fl = list()
    for line in lines:
        # startswith also matches a bare '-DOCSTART-' line with no trailing
        # newline (the previous `len(line) > 10` check silently missed it)
        doc_start = line.startswith('-DOCSTART-')
        if not (line.isspace() or doc_start):
            tmp_fl.append(line.rstrip())
            continue
        # sentence boundary: flush the words collected so far
        if tmp_fl:
            features.append(tmp_fl)
            tmp_fl = list()
        # document boundary: flush the sentences collected so far
        if multi_docs and doc_start and features:
            documents.append(features)
            features = list()
    if tmp_fl:
        features.append(tmp_fl)

    if not multi_docs:
        return features
    # trailing document without a closing '-DOCSTART-'
    if features:
        documents.append(features)
    return documents

def shrink_embedding(feature_map, word_dict, word_embedding, caseless):
"""
Expand Down

0 comments on commit 7e65fd3

Please sign in to comment.