NER support for HF datasets #673

Merged
12 commits merged on Aug 1, 2023
128 changes: 107 additions & 21 deletions langtest/datahandler/datasource.py
@@ -4,7 +4,7 @@
import os
import re
from abc import ABC, abstractmethod
from typing import Dict, List
from typing import Dict, List, Optional

import jsonlines
import pandas as pd
@@ -855,7 +855,7 @@ def export_data(self, data: List[Sample], output_path: str):
class HuggingFaceDataset(_IDataset):
"""Example dataset class that loads data using the Hugging Face dataset library."""

supported_tasks = ["text-classification", "summarization"]
supported_tasks = ["text-classification", "summarization", "ner"]

LIB_NAME = "datasets"
COLUMN_NAMES = {task: COLUMN_MAPPER[task] for task in supported_tasks}
@@ -886,11 +886,41 @@ def _check_datasets_package(self):
f"The '{self.LIB_NAME}' package is not installed. Please install it using 'pip install {self.LIB_NAME}'."
)

    def load_data_ner(
        self,
        feature_column: Optional[str] = None,
        target_column: Optional[str] = None,
        split: Optional[str] = None,
        subset: Optional[str] = None,
    ) -> List[Sample]:
        """Load the specified split from the given NER dataset."""
feature_column = "text" if feature_column is None else feature_column
target_column = "label" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
dataset = self.load_dataset(self.dataset_name, split=split)

label_names = dataset.features[target_column].feature.names

        # decode integer tag ids to label names; the builtin map yields plain dicts
        dataset = map(
            lambda example: {
                "tokens": example[feature_column],
                "ner_tags": [label_names[x] for x in example[target_column]],
            },
            dataset,
        )

samples = [self._row_to_ner_sample(example) for example in dataset]
return samples

def load_data_classification(
self,
feature_column: str = "text",
target_column: str = "label",
split: str = "test",
        feature_column: Optional[str] = None,
        target_column: Optional[str] = None,
        split: Optional[str] = None,
        subset: Optional[str] = None,
    ) -> List[Sample]:
"""Load the specified split from the dataset library.
@@ -909,30 +939,33 @@ def load_data_classification(
List[Sample]:
Loaded split as a list of Sample objects.
"""
feature_column = "text" if feature_column is None else feature_column
target_column = "label" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
dataset = self.load_dataset(self.dataset_name, split=split)

if feature_column and target_column:
dataset = dataset.map(
lambda example: {
"text": example[feature_column],
"label": example[target_column],
}
)
dataset = dataset.map(
lambda example: {
"text": example[feature_column],
"label": example[target_column],
}
)

samples = [self._row_to_sample_classification(example) for example in dataset]
return samples

def load_data_summarization(
self,
feature_column: str = "document",
target_column: str = "summary",
split: str = "test",
        feature_column: Optional[str] = None,
        target_column: Optional[str] = None,
        split: Optional[str] = None,
        subset: Optional[str] = None,
    ) -> List[Sample]:
"""Load the specified split from the dataset library for summarization task.
"""Load the specified split from the dataset for summarization task.

Args:
feature_column (str):
@@ -948,6 +981,10 @@
List[Sample]:
Loaded split as a list of Sample objects for summarization task.
"""
feature_column = "document" if feature_column is None else feature_column
target_column = "summary" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
@@ -981,10 +1018,10 @@ def load_raw_data(

def load_data(
self,
feature_column: str = "text",
target_column: str = "label",
split: str = "test",
subset: str = None,
feature_column: Optional[str] = None,
target_column: Optional[str] = None,
split: Optional[str] = None,
subset: Optional[str] = None,
) -> List[Sample]:
"""Load the specified data based on the task.

@@ -1014,8 +1051,10 @@ def load_data(
return self.load_data_summarization(
feature_column, target_column, split, subset
)
elif self.task == "ner":
return self.load_data_ner(feature_column, target_column, split, subset)
else:
raise ValueError(f"Unsupported task: {self.task}")
raise ValueError(f"Unsupported task for HF datasets: {self.task}")

@staticmethod
def _row_to_sample_summarization(data_row: Dict[str, str]) -> Sample:
@@ -1088,3 +1127,50 @@ def _row_to_sample_classification(self, data_row: Dict[str, str]) -> Sample:
original=original,
expected_results=SequenceClassificationOutput(predictions=[label]),
)

def _row_to_ner_sample(self, data_row: dict) -> Sample:
"""Convert a row from the dataset into a Sample for NER.

Args:
data_row (Dict[str, str]):
Single row of the dataset.

Returns:
Sample:
Row formatted into a Sample object.
"""
input_column = next(
(col for col in self.COLUMN_NAMES["ner"]["text"] if col in data_row),
None,
)
output_column = next(
(col for col in self.COLUMN_NAMES["ner"]["ner"] if col in data_row),
None,
)

tokens = data_row.get(input_column, [])
labels = data_row.get(output_column, [])

        # build a character-level span for each token/label pair
ner_labels = []
cursor = 0
for token, label in zip(tokens, labels):
ner_labels.append(
NERPrediction.from_span(
entity=label,
word=token,
start=cursor,
end=cursor + len(token),
doc_id=0,
doc_name="",
pos_tag="XX",
chunk_tag="XX",
)
)
# +1 to account for the white space
cursor += len(token) + 1

original = " ".join(tokens)
return NERSample(
original=original, expected_results=NEROutput(predictions=ner_labels)
)
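The cursor arithmetic above assigns each token a character span inside the whitespace-joined sentence, which is what NERPrediction.from_span consumes. The same offset logic in isolation (plain Python; the tokens and labels are illustrative):

    tokens = ["John", "lives", "in", "Berlin"]
    labels = ["B-PER", "O", "O", "B-LOC"]

    cursor = 0
    spans = []
    for token, label in zip(tokens, labels):
        spans.append((label, token, cursor, cursor + len(token)))
        cursor += len(token) + 1  # +1 for the single joining space

    print(" ".join(tokens))  # John lives in Berlin
    print(spans)  # ('B-PER', 'John', 0, 4) ... ('B-LOC', 'Berlin', 14, 20)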
17 changes: 15 additions & 2 deletions langtest/langtest.py
@@ -60,6 +60,7 @@ class Harness:
"johnsnowlabs",
): "imdb/sample.csv",
}
SUPPORTED_HUBS_HF_DATASET_NER = ["johnsnowlabs", "huggingface", "spacy"]
SUPPORTED_HUBS_HF_DATASET_CLASSIFICATION = ["johnsnowlabs", "huggingface", "spacy"]
SUPPORTED_HUBS_HF_DATASET_SUMMARIZATION = [
"openai",
@@ -141,7 +142,7 @@ def __init__(
logging.info("Default dataset '%s' successfully loaded.", (task, model, hub))

elif (
type(data) is dict
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_CLASSIFICATION
and task == "text-classification"
):
@@ -164,7 +165,19 @@ def __init__(
model = resource_filename("langtest", "data/textcat_imdb")

elif (
type(data) is dict
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_NER
and task == "ner"
):
self.data = HuggingFaceDataset(data["name"], task=task).load_data(
feature_column=data.get("feature_column", "tokens"),
target_column=data.get("target_column", "ner_tags"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)

elif (
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_SUMMARIZATION
and task == "summarization"
):
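The new branch above lets a dict-style data argument route HF NER datasets into the harness. A hedged end-to-end sketch, assuming the Harness keyword signature implied by the surrounding __init__ (the model name is illustrative, not from this PR; any NER model on the chosen hub should work):

    from langtest import Harness

    harness = Harness(
        task="ner",
        model="dslim/bert-base-NER",  # illustrative Hugging Face NER model
        hub="huggingface",
        data={
            "name": "wikiann",
            "subset": "fo",
            "feature_column": "tokens",
            "target_column": "ner_tags",
            "split": "test",
        },
    )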
21 changes: 15 additions & 6 deletions tests/test_datasource.py
@@ -69,15 +69,24 @@ def test_load_raw_data(self, dataset, feature_col, target_col):
assert isinstance(label, str)

@pytest.mark.parametrize(
"dataset",
"dataset,params",
[
CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"),
ConllDataset(file_path="tests/fixtures/test.conll", task="ner"),
(
HuggingFaceDataset(dataset_name="wikiann", task="ner"),
{
"subset": "fo",
"feature_column": "tokens",
"target_column": "ner_tags",
"split": "test",
},
),
(CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"), {}),
(ConllDataset(file_path="tests/fixtures/test.conll", task="ner"), {}),
],
)
def test_load_data(self, dataset):
def test_load_data(self, dataset, params):
""""""
samples = dataset.load_data()
samples = dataset.load_data(**params)

assert isinstance(samples, list)

@@ -165,7 +174,7 @@ def test_load_raw_data(self, dataset, feature_col, target_col):
def test_load_data(self, dataset, feature_col, target_col):
""""""
if isinstance(dataset, HuggingFaceDataset):
samples = dataset.load_data(split="test[:30]")
samples = dataset.load_data(split="test")
else:
samples = dataset.load_data()

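The HF case in the parametrization boils down to the following standalone check, mirroring the params dict above (assuming network access to download wikiann):

    from langtest.datahandler.datasource import HuggingFaceDataset

    dataset = HuggingFaceDataset(dataset_name="wikiann", task="ner")
    samples = dataset.load_data(
        subset="fo",
        feature_column="tokens",
        target_column="ner_tags",
        split="test",
    )
    assert isinstance(samples, list)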