NER support for HF datasets #673

Merged
12 commits merged on Aug 1, 2023
128 changes: 107 additions & 21 deletions langtest/datahandler/datasource.py
@@ -4,7 +4,7 @@
import os
import re
from abc import ABC, abstractmethod
from typing import Dict, List
from typing import Dict, List, Optional

import jsonlines
import pandas as pd
@@ -855,7 +855,7 @@ def export_data(self, data: List[Sample], output_path: str):
class HuggingFaceDataset(_IDataset):
"""Example dataset class that loads data using the Hugging Face dataset library."""

supported_tasks = ["text-classification", "summarization"]
supported_tasks = ["text-classification", "summarization", "ner"]

LIB_NAME = "datasets"
COLUMN_NAMES = {task: COLUMN_MAPPER[task] for task in supported_tasks}
@@ -886,11 +886,41 @@ def _check_datasets_package(self):
f"The '{self.LIB_NAME}' package is not installed. Please install it using 'pip install {self.LIB_NAME}'."
)

    def load_data_ner(
        self,
        feature_column: Optional[str] = None,
        target_column: Optional[str] = None,
        split: Optional[str] = None,
        subset: Optional[str] = None,
    ) -> List[Sample]:
        """Load the specified split from the given NER dataset."""
feature_column = "text" if feature_column is None else feature_column
target_column = "label" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
dataset = self.load_dataset(self.dataset_name, split=split)

label_names = dataset.features[target_column].feature.names

        # decode integer tag ids to label names; the builtin map yields plain dicts
        dataset = map(
            lambda example: {
                "tokens": example[feature_column],
                "ner_tags": [label_names[x] for x in example[target_column]],
            },
            dataset,
        )

samples = [self._row_to_ner_sample(example) for example in dataset]
return samples

def load_data_classification(
self,
feature_column: str = "text",
target_column: str = "label",
split: str = "test",
        feature_column: Optional[str] = None,
        target_column: Optional[str] = None,
        split: Optional[str] = None,
        subset: Optional[str] = None,
    ) -> List[Sample]:
"""Load the specified split from the dataset library.
@@ -909,30 +939,33 @@ def load_data_classification(
List[Sample]:
Loaded split as a list of Sample objects.
"""
feature_column = "text" if feature_column is None else feature_column
target_column = "label" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
dataset = self.load_dataset(self.dataset_name, split=split)

if feature_column and target_column:
dataset = dataset.map(
lambda example: {
"text": example[feature_column],
"label": example[target_column],
}
)
dataset = dataset.map(
lambda example: {
"text": example[feature_column],
"label": example[target_column],
}
)

samples = [self._row_to_sample_classification(example) for example in dataset]
return samples

def load_data_summarization(
self,
feature_column: str = "document",
target_column: str = "summary",
split: str = "test",
        feature_column: Optional[str] = None,
        target_column: Optional[str] = None,
        split: Optional[str] = None,
        subset: Optional[str] = None,
    ) -> List[Sample]:
"""Load the specified split from the dataset library for summarization task.
"""Load the specified split from the dataset for summarization task.

Args:
feature_column (str):
@@ -948,6 +981,10 @@
List[Sample]:
Loaded split as a list of Sample objects for summarization task.
"""
feature_column = "document" if feature_column is None else feature_column
target_column = "summary" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
@@ -981,10 +1018,10 @@ def load_raw_data(

def load_data(
self,
feature_column: str = "text",
target_column: str = "label",
split: str = "test",
subset: str = None,
feature_column: Optional[str] = None,
target_column: Optional[str] = None,
split: Optional[str] = None,
subset: Optional[str] = None,
) -> List[Sample]:
"""Load the specified data based on the task.

@@ -1014,8 +1051,10 @@ def load_data(
return self.load_data_summarization(
feature_column, target_column, split, subset
)
elif self.task == "ner":
return self.load_data_ner(feature_column, target_column, split, subset)
else:
raise ValueError(f"Unsupported task: {self.task}")
raise ValueError(f"Unsupported task for HF datasets: {self.task}")

@staticmethod
def _row_to_sample_summarization(data_row: Dict[str, str]) -> Sample:
@@ -1088,3 +1127,50 @@ def _row_to_sample_classification(self, data_row: Dict[str, str]) -> Sample:
original=original,
expected_results=SequenceClassificationOutput(predictions=[label]),
)

def _row_to_ner_sample(self, data_row: dict) -> Sample:
"""Convert a row from the dataset into a Sample for NER.

Args:
data_row (Dict[str, str]):
Single row of the dataset.

Returns:
Sample:
Row formatted into a Sample object.
"""
input_column = next(
(col for col in self.COLUMN_NAMES["ner"]["text"] if col in data_row),
None,
)
output_column = next(
(col for col in self.COLUMN_NAMES["ner"]["ner"] if col in data_row),
None,
)

tokens = data_row.get(input_column, [])
labels = data_row.get(output_column, [])

        # build a character-level span for each token/label pair
ner_labels = []
cursor = 0
for token, label in zip(tokens, labels):
ner_labels.append(
NERPrediction.from_span(
entity=label,
word=token,
start=cursor,
end=cursor + len(token),
doc_id=0,
doc_name="",
pos_tag="XX",
chunk_tag="XX",
)
)
# +1 to account for the white space
cursor += len(token) + 1

original = " ".join(tokens)
return NERSample(
original=original, expected_results=NEROutput(predictions=ner_labels)
)
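The cursor arithmetic above assigns each token a character span inside the whitespace-joined sentence, which is what NERPrediction.from_span consumes. The same offset logic in isolation (plain Python; the tokens and labels are illustrative):

    tokens = ["John", "lives", "in", "Berlin"]
    labels = ["B-PER", "O", "O", "B-LOC"]

    cursor = 0
    spans = []
    for token, label in zip(tokens, labels):
        spans.append((label, token, cursor, cursor + len(token)))
        cursor += len(token) + 1  # +1 for the single joining space

    print(" ".join(tokens))  # John lives in Berlin
    print(spans)  # ('B-PER', 'John', 0, 4) ... ('B-LOC', 'Berlin', 14, 20)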
17 changes: 15 additions & 2 deletions langtest/langtest.py
@@ -60,6 +60,7 @@ class Harness:
"johnsnowlabs",
): "imdb/sample.csv",
}
SUPPORTED_HUBS_HF_DATASET_NER = ["johnsnowlabs", "huggingface", "spacy"]
SUPPORTED_HUBS_HF_DATASET_CLASSIFICATION = ["johnsnowlabs", "huggingface", "spacy"]
SUPPORTED_HUBS_HF_DATASET_SUMMARIZATION = [
"openai",
@@ -141,7 +142,7 @@ def __init__(
logging.info("Default dataset '%s' successfully loaded.", (task, model, hub))

elif (
type(data) is dict
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_CLASSIFICATION
and task == "text-classification"
):
@@ -164,7 +165,19 @@ def __init__(
model = resource_filename("langtest", "data/textcat_imdb")

elif (
type(data) is dict
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_NER
and task == "ner"
):
self.data = HuggingFaceDataset(data["name"], task=task).load_data(
feature_column=data.get("feature_column", "tokens"),
target_column=data.get("target_column", "ner_tags"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)

elif (
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_SUMMARIZATION
and task == "summarization"
):
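The new branch above lets a dict-style data argument route HF NER datasets into the harness. A hedged end-to-end sketch, assuming the Harness keyword signature implied by the surrounding __init__ (the model name is illustrative, not from this PR; any NER model on the chosen hub should work):

    from langtest import Harness

    harness = Harness(
        task="ner",
        model="dslim/bert-base-NER",  # illustrative Hugging Face NER model
        hub="huggingface",
        data={
            "name": "wikiann",
            "subset": "fo",
            "feature_column": "tokens",
            "target_column": "ner_tags",
            "split": "test",
        },
    )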
21 changes: 15 additions & 6 deletions tests/test_datasource.py
@@ -69,15 +69,24 @@ def test_load_raw_data(self, dataset, feature_col, target_col):
assert isinstance(label, str)

@pytest.mark.parametrize(
"dataset",
"dataset,params",
[
CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"),
ConllDataset(file_path="tests/fixtures/test.conll", task="ner"),
(
HuggingFaceDataset(dataset_name="wikiann", task="ner"),
{
"subset": "fo",
"feature_column": "tokens",
"target_column": "ner_tags",
"split": "test",
},
),
(CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"), {}),
(ConllDataset(file_path="tests/fixtures/test.conll", task="ner"), {}),
],
)
def test_load_data(self, dataset):
def test_load_data(self, dataset, params):
""""""
samples = dataset.load_data()
samples = dataset.load_data(**params)

assert isinstance(samples, list)

@@ -165,7 +174,7 @@ def test_load_raw_data(self, dataset, feature_col, target_col):
def test_load_data(self, dataset, feature_col, target_col):
""""""
if isinstance(dataset, HuggingFaceDataset):
samples = dataset.load_data(split="test[:30]")
samples = dataset.load_data(split="test")
else:
samples = dataset.load_data()

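The HF case in the parametrization boils down to the following standalone check, mirroring the params dict above (assuming network access to download wikiann):

    from langtest.datahandler.datasource import HuggingFaceDataset

    dataset = HuggingFaceDataset(dataset_name="wikiann", task="ner")
    samples = dataset.load_data(
        subset="fo",
        feature_column="tokens",
        target_column="ner_tags",
        split="test",
    )
    assert isinstance(samples, list)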