-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #29 from IndicoDataSolutions/ben/add_sequence_tasks
Ben/add sequence tasks
- Loading branch information
Showing
33 changed files
with
1,020 additions
and
187 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import os | ||
import json | ||
import re | ||
|
||
import nltk | ||
from nltk.corpus import brown | ||
|
||
from tqdm import tqdm | ||
|
||
from enso import config | ||
from enso.mode import ModeKeys | ||
|
||
# NOTE(review): this constant is not referenced by any code visible in this file; possibly a leftover -- confirm before relying on it. | ||
only_tags_with = "N" | ||
|
||
|
||
def label_preproc(label, regex, whole_tag=False):
    """Reduce a (possibly compound) tag to the component that matches ``regex``.

    The tag is split on ``+`` and ``-`` and the components are tested from
    last to first with ``re.match``; the first component that matches wins.

    Args:
        label: Raw tag string, e.g. ``"NN-TL"`` or ``"PPSS+MD"``.
        regex: Pattern passed to ``re.match`` for each component.
        whole_tag: When true, return the original ``label`` instead of the
            matching component.

    Returns:
        The matching component (or the whole ``label`` if ``whole_tag`` is
        set), or ``None`` when no component matches.
    """
    # Only "+" and "-" join tag components.  A comma must not be treated as a
    # separator: "," is a tag in its own right, and splitting on it turned
    # that tag into empty strings.
    pieces = [piece for piece in re.split(r"[+-]", label) if piece]
    if not pieces:
        # The tag consists solely of separator characters (e.g. "--"); keep
        # it intact rather than matching against empty fragments.
        pieces = [label]

    for sublabel in reversed(pieces):
        if re.match(regex, sublabel):
            return label if whole_tag else sublabel
    return None
|
||
|
||
def _annotate_sentence(tagged_sent, tag_regex, whole_tag):
    """Turn one tagged sentence into a ``[text, annotations]`` pair.

    Tokens are joined with single spaces. Each token whose processed label is
    not ``None`` yields an annotation dict with ``start``, ``end`` and
    ``label`` keys; consecutive tokens with the same label are merged into
    one annotation.
    """
    pieces = []
    offset = 0  # length of the text assembled so far
    annotations = []
    last_label = None
    for sub_str, raw_label in tagged_sent:
        label = label_preproc(raw_label, tag_regex, whole_tag)

        # Separate tokens with one space, but never lead with one.
        if offset:
            pieces.append(" ")
            offset += 1

        start = offset
        pieces.append(sub_str)
        offset += len(sub_str)

        if label is not None:
            if annotations and label == last_label:
                # Same label as the previous token: extend that annotation.
                annotations[-1]["end"] = offset
            else:
                annotations.append(
                    {
                        "start": start,
                        "end": offset,
                        "label": label,
                    }
                )

        last_label = label
    return ["".join(pieces), annotations]


def brown_corpus_tags(task_name, tag_regex, whole_tag=False):
    """Write a sequence-labelling dataset built from the tagged Brown corpus.

    Every sentence from ``brown.tagged_sents()`` becomes a
    ``[text, annotations]`` entry, and the list of entries is dumped as JSON
    to ``brown_<task_name>.json`` under
    ``config.DATA_DIRECTORY/<ModeKeys.SEQUENCE value>``.

    Args:
        task_name: Used in the output filename and as the progress-bar label.
        tag_regex: Pattern selecting which tag components are kept
            (forwarded to ``label_preproc``).
        whole_tag: Forwarded to ``label_preproc``; keep the full tag rather
            than only the matching component.
    """
    task_type = ModeKeys.SEQUENCE
    filename = "brown_{}.json".format(task_name)
    save_dir = os.path.join(config.DATA_DIRECTORY, task_type.value)
    save_path = os.path.join(save_dir, filename)

    docs = [
        _annotate_sentence(tagged_sent, tag_regex, whole_tag)
        for tagged_sent in tqdm(brown.tagged_sents(), desc=task_name)
    ]

    # The target directory may not exist yet on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "wt") as fp:
        json.dump(docs, fp, indent=1)
|
||
|
||
if __name__ == "__main__":
    # Fetch the Brown corpus through nltk, then write one dataset per
    # (task name, tag pattern) pair below.
    nltk.download("brown")
    tag_patterns = (
        ("nouns", r'^N[A-Z]*'),
        ("verbs", r'^V[A-Z]*'),
        ("adverbs", r'^R[A-Z]*'),
        ("pronouns", r'^P[A-Z]*'),
        ("all", r'[A-Z]*'),
    )

    for name, pattern in tag_patterns:
        # Every task keeps only the matching tag component (whole_tag=False).
        brown_corpus_tags(name, pattern, False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import os | ||
import requests | ||
|
||
from bs4 import BeautifulSoup as bs | ||
from bs4.element import Tag | ||
import json | ||
from nltk.tokenize import sent_tokenize | ||
|
||
from enso import config | ||
from enso.mode import ModeKeys | ||
|
||
if __name__ == "__main__":
    # Download the Reuters-128 XML file and convert it into a JSON list of
    # [text, annotations] entries.  Each annotation is a dict with "start",
    # "end" and "label" keys; every named entity is labelled "NAME".
    task_type = ModeKeys.SEQUENCE
    filename = "Reuters-128.json"
    save_dir = os.path.join(config.DATA_DIRECTORY, task_type.value)
    save_path = os.path.join(save_dir, filename)
    if os.path.exists(save_path):
        print("{} already downloaded, skipping...".format(filename))
    else:
        url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
        r = requests.get(url, timeout=60)
        # Fail loudly instead of parsing an HTTP error page as the corpus.
        r.raise_for_status()
        soup = bs(r.content.decode("utf-8"), "html5lib")
        docs = []
        for elem in soup.find_all("document"):
            single_entry = ["", []]
            for c in elem.find("textwithnamedentities").children:
                if not isinstance(c, Tag):
                    continue

                sent_parts = sent_tokenize(c.text)
                if len(sent_parts) == 1:
                    # A single sentence: use the raw text so that the
                    # original surrounding whitespace is kept.
                    sent_parts = [c.text]

                for i, text in enumerate(sent_parts):
                    if i >= 1:
                        # Start a new entry at every sentence boundary.
                        # Flushing only at the first boundary glued later
                        # sentences together with no separator.
                        docs.append(single_entry)
                        single_entry = ["", []]

                    if c.name == "namedentityintext":
                        start = len(single_entry[0])
                        single_entry[1].append(
                            {
                                "start": start,
                                # Span exactly the fragment appended below;
                                # len(c.text) overshoots when the entity text
                                # was split into several fragments.
                                "end": start + len(text),
                                "label": "NAME",
                            }
                        )
                    single_entry[0] += text

            docs.append(single_entry)

        # The target directory may not exist yet on a fresh checkout.
        os.makedirs(save_dir, exist_ok=True)
        with open(save_path, "wt") as fp:
            json.dump(docs, fp, indent=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.