Skip to content

Commit

Permalink
Merge pull request #29 from IndicoDataSolutions/ben/add_sequence_tasks
Browse files Browse the repository at this point in the history
Ben/add sequence tasks
  • Loading branch information
madisonmay committed Sep 13, 2018
2 parents edffa89 + f86ec61 commit 484e32c
Show file tree
Hide file tree
Showing 33 changed files with 1,020 additions and 187 deletions.
1 change: 1 addition & 0 deletions enso/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from enso.featurize import Featurizer
from enso.visualize import Visualizer
from enso.sample import Sampler
from enso.resample import Resampler
from enso.metrics import Metric


Expand Down
45 changes: 36 additions & 9 deletions enso/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import indicoio
from enso.mode import ModeKeys
import multiprocessing

"""Constants to configure the rest of Enso."""
Expand All @@ -13,40 +14,61 @@
FEATURES_DIRECTORY = "Features"

# Directory for storing experiment results
EXPERIMENT_NAME = "Demo"
EXPERIMENT_NAME = "TestSamplersSequence"

# Datasets to featurize or run experiments on
DATA = {
# Classification
'Classify/AirlineSentiment',
'Classify/MovieReviews',
'Classify/MPQA',
'Classify/PoliticalTweetSubjectivity',

# Sequence

# 'SequenceLabeling/Reuters-128',
# 'SequenceLabeling/brown_all',
# 'SequenceLabeling/brown_nouns',
# 'SequenceLabeling/brown_verbs',
# 'SequenceLabeling/brown_pronouns',
# 'SequenceLabeling/brown_adverbs',
}

# Featurizers to activate
FEATURIZERS = {
"PlainTextFeaturizer",
# "IndicoStandard",
"SpacyGloveFeaturizer",
"SpacyCNNFeaturizer"
# "SpacyCNNFeaturizer",
}

# Experiments to run
EXPERIMENTS = {
"LogisticRegressionCV",
# "FinetuneSequenceLabel",
# "IndicoSequenceLabel"
# "Finetune",
# "SpacyGlove"
"LogisticRegressionCV"
}

# Metrics to compute
METRICS = {
# "OverlapAccuracy",
# "OverlapPrecision",
# "OverlapRecall",
"Accuracy",
"MacroRocAuc",
}

# Test setup metadata
TEST_SETUP = {
"train_sizes": range(50, 550, 50),
"n_splits": 25,
"train_sizes": range(50, 500, 50),
"n_splits": 2,
# "samplers": ['RandomSequence', 'NoSampler'],
"samplers": ['Random'],
"sampling_size": .3,
"resamplers": ['RandomOverSampler']
# "resamplers": ["SequenceOverSampler", 'NoResampler']
"resamplers": ["NoResampler", "RandomOverSampler"]
}

# Visualizations to display
Expand All @@ -63,13 +85,18 @@
'y_tile': 'Dataset',
'x_axis': 'TrainSize',
'y_axis': 'Result',
'lines': ['Experiment', 'Featurizer'],
'lines': ['Experiment', 'Featurizer', "Sampler", "Resampler"],
'category': 'merge',
'cv': 'mean',
'filename': 'TestResult'
}
}

MODE = ModeKeys.CLASSIFY

N_GPUS = 1
N_CORES = 1 # multiprocessing.cpu_count()

FIX_REQUIREMENTS = True

N_GPUS = 3
N_CORES = 1 # multiprocessing.cpu_count()
indicoio.config.api_key = ""
6 changes: 3 additions & 3 deletions enso/download/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from bs4 import BeautifulSoup

from enso import config
from enso.mode import ModeKeys


def generic_download(url, text_column, target_column, filename, save=True, task_type='Classify', text_transformation=None, target_transformation=None):
def generic_download(url, text_column, target_column, filename, save=True, task_type=ModeKeys.CLASSIFY, text_transformation=None, target_transformation=None):

save_path = os.path.join(config.DATA_DIRECTORY, task_type, filename)
save_path = os.path.join(config.DATA_DIRECTORY, task_type.value, filename)
if os.path.exists(save_path):
print("{} already downloaded, skipping...".format(filename))
return
Expand All @@ -32,6 +33,5 @@ def generic_download(url, text_column, target_column, filename, save=True, task_

return new_df


def html_to_text(text):
    """Parse ``text`` as HTML with the html5lib parser and return its text content."""
    soup = BeautifulSoup(text, "html5lib")
    return soup.get_text()
75 changes: 75 additions & 0 deletions enso/download/brown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os
import json
import re

import nltk
from nltk.corpus import brown

from tqdm import tqdm

from enso import config
from enso.mode import ModeKeys

# NOTE(review): not referenced anywhere else in this module; presumably a
# leftover from an earlier noun-only version of the script — confirm nothing
# imports it before removing.
only_tags_with = "N"


def label_preproc(label, regex, whole_tag=False):
    """Reduce a compound tag to the sub-tag selected by ``regex``.

    ``label`` is split on ``+``, ``,`` and ``-`` and the resulting pieces are
    scanned from last to first.  The first piece that ``regex`` matches
    (anchored at the start of the piece, as with ``re.match``) decides the
    result.

    :param label: raw tag string, e.g. ``"NN-TL"``.
    :param regex: pattern string (or compiled pattern) tested against each piece.
    :param whole_tag: when true, return the untouched ``label`` instead of the
        matching piece.
    :return: the matching piece, the whole ``label`` if ``whole_tag`` is set,
        or ``None`` when no piece matches.
    """
    # The pattern does not change between pieces, so resolve it once up front
    # instead of handing the raw pattern to re.match on every iteration.
    matches = re.compile(regex).match
    for sublabel in reversed(re.split("[+,-]", label)):
        if matches(sublabel):
            return label if whole_tag else sublabel
    return None


def brown_corpus_tags(task_name, tag_regex, whole_tag=False):
    """Write a sequence-labeling dataset built from the tagged Brown corpus.

    Every tagged sentence becomes one ``[text, annotations]`` document: its
    tokens are joined with single spaces, and each token whose tag survives
    ``label_preproc`` is recorded as a ``{"start", "end", "label"}`` character
    span.  Consecutive tokens that resolve to the same label are merged into a
    single span (which then also covers the separating space).  The documents
    are dumped as JSON to ``brown_<task_name>.json`` inside
    ``config.DATA_DIRECTORY/<ModeKeys.SEQUENCE value>``.

    :param task_name: suffix of the output filename; also used as the
        progress-bar description.
    :param tag_regex: pattern forwarded to ``label_preproc`` to select tags.
    :param whole_tag: forwarded to ``label_preproc``.
    """
    task_type = ModeKeys.SEQUENCE
    filename = "brown_{}.json".format(task_name)
    save_dir = os.path.join(config.DATA_DIRECTORY, task_type.value)
    save_path = os.path.join(save_dir, filename)

    # Create the target directory before the corpus pass so that a missing
    # folder cannot make the final write fail after all the work is done.
    os.makedirs(save_dir, exist_ok=True)

    docs = []
    for tagged_sent in tqdm(brown.tagged_sents(), desc=task_name):
        doc_text = ""
        doc_annotations = []
        # Label of the previous token; None means "previous token unlabeled".
        last_label = None
        for sub_str, label in tagged_sent:
            label = label_preproc(label, tag_regex, whole_tag)

            # Tokens are separated by a single space (none before the first).
            if doc_text:
                doc_text += " "

            doc_location = len(doc_text)
            doc_text += sub_str
            doc_end = len(doc_text)

            if label is not None:
                if doc_annotations and label == last_label:
                    # Same label as the previous token: grow the open span.
                    doc_annotations[-1]["end"] = doc_end
                else:
                    doc_annotations.append(
                        {
                            "start": doc_location,
                            "end": doc_end,
                            "label": label
                        }
                    )

            last_label = label
        docs.append([doc_text, doc_annotations])

    with open(save_path, "wt") as fp:
        json.dump(docs, fp, indent=1)


if __name__ == "__main__":
    # Fetch the Brown corpus through nltk before any dataset is generated.
    nltk.download("brown")

    # One (task name, tag regex, whole_tag) triple per dataset to generate.
    brown_tasks = [
        ("nouns", r'^N[A-Z]*', False),
        ("verbs", r'^V[A-Z]*', False),
        ("adverbs", r'^R[A-Z]*', False),
        ("pronouns", r'^P[A-Z]*', False),
        ("all", r'[A-Z]*', False),
    ]

    for task_name, tag_regex, whole_tag in brown_tasks:
        brown_corpus_tags(task_name, tag_regex, whole_tag)
48 changes: 48 additions & 0 deletions enso/download/reuters-128.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import requests

from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import json
from nltk.tokenize import sent_tokenize

from enso import config
from enso.mode import ModeKeys

# Script entry point: download the Reuters-128 XML, convert it into
# [text, annotations] entries and dump them as JSON to save_path.
# NOTE(review): this view of the file has lost its indentation, so the exact
# nesting of the statements below cannot be confirmed from here.
if __name__ == "__main__":
task_type = ModeKeys.SEQUENCE
filename = "Reuters-128.json"
save_path = os.path.join(config.DATA_DIRECTORY, task_type.value, filename)
# Nothing to do when the output file is already present.
if os.path.exists(save_path):
print("{} already downloaded, skipping...".format(filename))
else:
url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
r = requests.get(url)
# The XML is parsed with the html5lib parser; the tag names looked up below
# are all spelled in lowercase.
soup = bs(r.content.decode("utf-8"), "html5lib")
docs = []
for elem in soup.find_all("document"):
# An entry is [text, list of {"start", "end", "label"} dicts].
single_entry = ["", []]
for c in elem.find("textwithnamedentities").children:
# Only Tag children are processed; other node types are ignored.
if type(c) == Tag:
sent_parts = sent_tokenize(c.text)
# When the tokenizer returns a single sentence, use the untouched text instead.
if len(sent_parts) == 1:
sent_parts = [c.text]

for i, text in enumerate(sent_parts):
# At the second part (i == 1) the entry built so far is stored and a new
# one is started. NOTE(review): only index 1 triggers this, so a third or
# later part stays in the same entry as the second — confirm intended.
if i == 1:
docs.append(single_entry)
single_entry = ["", []]

# Children named "namedentityintext" are recorded as "NAME" spans starting
# at the current end of the entry text.
# NOTE(review): "end" is computed from len(c.text) while only `text` is
# appended below; the two differ whenever c.text was split into several
# parts — confirm.
if c.name == "namedentityintext":
single_entry[1].append(
{
"start": len(single_entry[0]),
"end": len(single_entry[0]) + len(c.text),
"label": "NAME"
}
)
single_entry[0] += text

docs.append(single_entry)
with open(save_path, "wt") as fp:
json.dump(docs, fp, indent=1)
9 changes: 8 additions & 1 deletion enso/experiment/NB.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@

from enso.experiment import Experiment

from enso.registry import Registry, ModeKeys


@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "not PlainTextFeaturizer")])
class NaiveGaussianBayes(Experiment):
"""Basic implementation of a Gaussian Naive Bayes classifier."""

Expand All @@ -25,6 +28,8 @@ def predict(self, dataset):
probabilities = self.active_model.predict_proba(dataset)
return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})


@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "not PlainTextFeaturizer")])
class NaiveMultinomialBayes(Experiment):
"""Basic implementation of a Multinomial Naive Bayes classifier."""

Expand All @@ -48,6 +53,8 @@ def predict(self, dataset):
probabilities = self.active_model.predict_proba(dataset)
return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})


@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "not PlainTextFeaturizer")])
class NativeBernoulliBayes(Experiment):
"""Basic implementation of a Bernoulli Naive Bayes classifier."""

Expand All @@ -69,4 +76,4 @@ def predict(self, dataset):
"""Predict results on test set based on current internal model."""
labels = self.active_model.classes_
probabilities = self.active_model.predict_proba(dataset)
return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})
return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})

0 comments on commit 484e32c

Please sign in to comment.