Merge pull request #29 from IndicoDataSolutions/ben/add_sequence_tasks

Ben/add sequence tasks
IndicoDataSolutions · Sep 13, 2018 · 484e32c · 484e32c
2 parents edffa89 + f86ec61
commit 484e32c
Show file tree

Hide file tree

Showing 33 changed files with 1,020 additions and 187 deletions.
diff --git a/enso/__init__.py b/enso/__init__.py
@@ -2,6 +2,7 @@
 from enso.featurize import Featurizer
 from enso.visualize import Visualizer
 from enso.sample import Sampler
+from enso.resample import Resampler
 from enso.metrics import Metric
 
 

diff --git a/enso/config.py b/enso/config.py
@@ -1,4 +1,5 @@
 import indicoio
+from enso.mode import ModeKeys
 import multiprocessing
 
 """Constants to configure the rest of Enso."""
@@ -13,40 +14,61 @@
 FEATURES_DIRECTORY = "Features"
 
 # Directory for storing experiment results
-EXPERIMENT_NAME = "Demo"
+EXPERIMENT_NAME = "TestSamplersSequence"
 
 # Datasets to featurize or run experiments on
 DATA = {
+    # Classification
     'Classify/AirlineSentiment',
     'Classify/MovieReviews',
     'Classify/MPQA',
     'Classify/PoliticalTweetSubjectivity',
+
+    # Sequence
+
+    # 'SequenceLabeling/Reuters-128',
+    # 'SequenceLabeling/brown_all',
+    # 'SequenceLabeling/brown_nouns',
+    # 'SequenceLabeling/brown_verbs',
+    # 'SequenceLabeling/brown_pronouns',
+    # 'SequenceLabeling/brown_adverbs',
 }
 
 # Featurizers to activate
 FEATURIZERS = {
+    "PlainTextFeaturizer",
+    # "IndicoStandard",
     "SpacyGloveFeaturizer",
-    "SpacyCNNFeaturizer"
+    # "SpacyCNNFeaturizer",
 }
 
 # Experiments to run
 EXPERIMENTS = {
-    "LogisticRegressionCV",
+    # "FinetuneSequenceLabel",
+    # "IndicoSequenceLabel"
+    # "Finetune",
+    # "SpacyGlove"
+    "LogisticRegressionCV"
 }
 
 # Metrics to compute
 METRICS = {
+    # "OverlapAccuracy",
+    # "OverlapPrecision",
+    # "OverlapRecall",
     "Accuracy",
     "MacroRocAuc",
 }
 
 # Test setup metadata
 TEST_SETUP = {
-    "train_sizes": range(50, 550, 50),
-    "n_splits": 25,
+    "train_sizes": range(50, 500, 50),
+    "n_splits": 2,
+    # "samplers": ['RandomSequence', 'NoSampler'],
     "samplers": ['Random'],
     "sampling_size": .3,
-    "resamplers": ['RandomOverSampler']
+    # "resamplers": ["SequenceOverSampler", 'NoResampler']
+    "resamplers": ["NoResampler", "RandomOverSampler"]
 }
 
 # Visualizations to display
@@ -63,13 +85,18 @@
         'y_tile': 'Dataset',
         'x_axis': 'TrainSize',
         'y_axis': 'Result',
-        'lines': ['Experiment', 'Featurizer'],
+        'lines': ['Experiment', 'Featurizer', "Sampler", "Resampler"],
         'category': 'merge',
         'cv': 'mean',
         'filename': 'TestResult'
     }
 }
 
+MODE = ModeKeys.CLASSIFY
+
+N_GPUS = 1
+N_CORES = 1  # multiprocessing.cpu_count()
+
+FIX_REQUIREMENTS = True
 
-N_GPUS = 3
-N_CORES = 1 # multiprocessing.cpu_count()
+indicoio.config.api_key = ""
diff --git a/enso/download/__init__.py b/enso/download/__init__.py
@@ -5,11 +5,12 @@
 from bs4 import BeautifulSoup
 
 from enso import config
+from enso.mode import ModeKeys
 
 
-def generic_download(url, text_column, target_column, filename, save=True, task_type='Classify', text_transformation=None, target_transformation=None):
+def generic_download(url, text_column, target_column, filename, save=True, task_type=ModeKeys.CLASSIFY, text_transformation=None, target_transformation=None):
 
-    save_path = os.path.join(config.DATA_DIRECTORY, task_type, filename)
+    save_path = os.path.join(config.DATA_DIRECTORY, task_type.value, filename)
     if os.path.exists(save_path):
         print("{} already downloaded, skipping...".format(filename))
         return
@@ -32,6 +33,5 @@ def generic_download(url, text_column, target_column, filename, save=True, task_
 
     return new_df
 
-
 def html_to_text(text):
     """Parse ``text`` with the html5lib parser and return its plain text."""
     soup = BeautifulSoup(text, "html5lib")
     return soup.get_text()
diff --git a/enso/download/brown.py b/enso/download/brown.py
@@ -0,0 +1,75 @@
+import os
+import json
+import re
+
+import nltk
+from nltk.corpus import brown
+
+from tqdm import tqdm
+
+from enso import config
+from enso.mode import ModeKeys
+
+only_tags_with = "N"
+
+
+def label_preproc(label, regex, whole_tag=False):
+    """Pick the last component of a compound tag that matches ``regex``.
+
+    ``label`` is split on ``+``, ``,`` and ``-`` and the pieces are scanned
+    from last to first using ``re.match``.
+
+    :param label: tag string, possibly made of several joined components.
+    :param regex: pattern each component is tested against.
+    :param whole_tag: when true, return the unsplit ``label`` instead of the
+        matching component.
+    :return: the matching component (or ``label`` if ``whole_tag``), or
+        ``None`` when no component matches.
+    """
+    pieces = re.split("[+,-]", label)
+    # A component may legitimately be the empty string, so compare against
+    # None rather than relying on truthiness.
+    hit = next((piece for piece in reversed(pieces) if re.match(regex, piece)), None)
+    if hit is None:
+        return None
+    return label if whole_tag else hit
+
+
+def brown_corpus_tags(task_name, tag_regex, whole_tag=False):
+    """Convert the Brown corpus into span annotations and dump them as JSON.
+
+    Each sentence from ``brown.tagged_sents()`` becomes one
+    ``[text, annotations]`` pair. The text is the sentence's tokens joined by
+    single spaces; each annotation is a dict with character offsets
+    ``start``/``end`` into that text and a ``label`` produced by
+    ``label_preproc``. Adjacent tokens with the same label are merged into
+    one span. Tokens for which ``label_preproc`` returns ``None`` are left
+    unannotated.
+
+    :param task_name: suffix of the output file (``brown_<task_name>.json``),
+        also shown as the progress-bar description.
+    :param tag_regex: pattern forwarded to ``label_preproc``.
+    :param whole_tag: forwarded to ``label_preproc``.
+    """
+    task_type = ModeKeys.SEQUENCE
+    filename = "brown_{}.json".format(task_name)
+    save_path = os.path.join(config.DATA_DIRECTORY, task_type.value, filename)
+    # Create the output directory before the slow corpus pass so a missing
+    # directory cannot make the final write fail.
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+    docs = []
+    for tagged_sent in tqdm(brown.tagged_sents(), desc=task_name):
+        doc_text = ""
+        doc_annotations = []
+        last_label = None
+        for sub_str, label in tagged_sent:
+            label = label_preproc(label, tag_regex, whole_tag)
+
+            if doc_text:
+                doc_text += " "
+
+            doc_location = len(doc_text)
+            doc_text += sub_str
+            doc_end = len(doc_text)
+
+            if label is not None:
+                if doc_annotations and label == last_label:
+                    # Same label as the previous token: grow that span so it
+                    # also covers the separating space and this token.
+                    doc_annotations[-1]["end"] = doc_end
+                else:
+                    doc_annotations.append(
+                        {
+                            "start": doc_location,
+                            "end": doc_end,
+                            "label": label
+                        }
+                    )
+
+            last_label = label
+        docs.append([doc_text, doc_annotations])
+
+    with open(save_path, "wt") as fp:
+        json.dump(docs, fp, indent=1)
+
+
+if __name__ == "__main__":
+    # Fetch the corpus once, then write one dataset per tag family. Each
+    # entry is (task name, tag regex, whole_tag flag) for brown_corpus_tags.
+    nltk.download("brown")
+    for name, pattern, keep_whole_tag in (
+        ("nouns", r'^N[A-Z]*', False),
+        ("verbs", r'^V[A-Z]*', False),
+        ("adverbs", r'^R[A-Z]*', False),
+        ("pronouns", r'^P[A-Z]*', False),
+        ("all", r'[A-Z]*', False),
+    ):
+        brown_corpus_tags(name, pattern, keep_whole_tag)
diff --git a/enso/download/reuters-128.py b/enso/download/reuters-128.py
@@ -0,0 +1,48 @@
+import os
+import requests
+
+from bs4 import BeautifulSoup as bs
+from bs4.element import Tag
+import json
+from nltk.tokenize import sent_tokenize
+
+from enso import config
+from enso.mode import ModeKeys
+
+if __name__ == "__main__":
+    # Download the Reuters-128 XML and convert it into a JSON list of
+    # [text, annotations] entries, where each annotation is a
+    # {"start", "end", "label"} dict of character offsets into the text.
+    task_type = ModeKeys.SEQUENCE
+    filename = "Reuters-128.json"
+    save_path = os.path.join(config.DATA_DIRECTORY, task_type.value, filename)
+    if os.path.exists(save_path):
+        print("{} already downloaded, skipping...".format(filename))
+    else:
+        url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
+        r = requests.get(url)
+        # Fail loudly on an HTTP error instead of parsing an error page and
+        # writing an empty dataset.
+        r.raise_for_status()
+        soup = bs(r.content.decode("utf-8"), "html5lib")
+        docs = []
+        for elem in soup.find_all("document"):
+            single_entry = ["", []]
+            for c in elem.find("textwithnamedentities").children:
+                if not isinstance(c, Tag):
+                    continue
+
+                sent_parts = sent_tokenize(c.text)
+                if len(sent_parts) == 1:
+                    # Single sentence: keep the element's raw text rather
+                    # than the tokenizer's version of it.
+                    sent_parts = [c.text]
+
+                for i, text in enumerate(sent_parts):
+                    # Every sentence after the first starts a new entry. The
+                    # previous `i == 1` test only split once, so third and
+                    # later sentences were glued onto the second.
+                    if i >= 1:
+                        docs.append(single_entry)
+                        single_entry = ["", []]
+
+                    if c.name == "namedentityintext":
+                        start = len(single_entry[0])
+                        # Measure the piece actually appended so the span
+                        # stays inside this entry's text.
+                        single_entry[1].append(
+                            {
+                                "start": start,
+                                "end": start + len(text),
+                                "label": "NAME"
+                            }
+                        )
+                    single_entry[0] += text
+
+            docs.append(single_entry)
+
+        # Make sure the target directory exists before writing.
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        with open(save_path, "wt") as fp:
+            json.dump(docs, fp, indent=1)
diff --git a/enso/experiment/NB.py b/enso/experiment/NB.py
@@ -5,7 +5,10 @@
 
 from enso.experiment import Experiment
 
+from enso.registry import Registry, ModeKeys
 
+
+@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "not PlainTextFeaturizer")])
 class NaiveGaussianBayes(Experiment):
     """Basic implementation of a grid-search optimized Logistic Regression."""
 
@@ -25,6 +28,8 @@ def predict(self, dataset):
         probabilities = self.active_model.predict_proba(dataset)
         return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})
 
+
+@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "not PlainTextFeaturizer")])
 class NaiveMultinomialBayes(Experiment):
     """Basic implementation of a grid-search optimized Logistic Regression."""
 
@@ -48,6 +53,8 @@ def predict(self, dataset):
         probabilities = self.active_model.predict_proba(dataset)
         return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})
 
+
+@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "not PlainTextFeaturizer")])
 class NativeBernoulliBayes(Experiment):
     """Basic implementation of a grid-search optimized Logistic Regression."""
 
@@ -69,4 +76,4 @@ def predict(self, dataset):
         """Predict results on test set based on current internal model."""
         labels = self.active_model.classes_
         probabilities = self.active_model.predict_proba(dataset)
-        return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})
+        return pd.DataFrame({label: probabilities[:, i] for i, label in enumerate(labels)})