Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pytest for fairness class #682

Merged
merged 7 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions langtest/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,9 +842,9 @@ def transform(self) -> List[Sample]:
lambda x: x.split("-")[-1] if isinstance(x, str) else x
)
y_true = y_true.dropna()
params["test_name"] = test_name

transformed_samples = self.supported_tests[test_name].transform(
y_true, params
test_name, y_true, params
)
end_time = time.time_ns()
for sample in transformed_samples:
Expand Down
54 changes: 41 additions & 13 deletions langtest/transform/fairness.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,26 @@ class MinGenderF1Score(BaseFairness):
Transforms the input data into an output based on the minimum F1 score.
"""

alias_name = "min_gender_f1_score"
alias_name = ["min_gender_f1_score"]

@staticmethod
def transform(data: List[Sample], params: Dict) -> List[MinScoreSample]:
@classmethod
def transform(
cls, test: str, data: List[Sample], params: Dict
) -> List[MinScoreSample]:
"""Computes the minimum F1 score for the given data.

Args:
test (str): name of the test
data (List[Sample]): The input data to be transformed.
params (Dict): parameters for tests configuration
params (Dict): parameters for tests configuration.
Returns:
List[MinScoreSample]: The transformed data based on the minimum F1 score.
"""

assert (
test in cls.alias_name
), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"

if isinstance(params["min_score"], dict):
min_scores = params["min_score"]
elif isinstance(params["min_score"], float):
Expand Down Expand Up @@ -163,18 +171,24 @@ class MaxGenderF1Score(BaseFairness):
Transforms the input data into an output based on the maximum F1 score.
"""

alias_name = "max_gender_f1_score"
alias_name = ["max_gender_f1_score"]

@staticmethod
def transform(data: List[Sample], params: Dict) -> List[MaxScoreSample]:
@classmethod
def transform(
cls, test: str, data: List[Sample], params: Dict
) -> List[MaxScoreSample]:
"""Computes the maximum F1 score for the given data.

Args:
test (str): name of the test.
data (List[Sample]): The input data to be transformed.
params (Dict): parameters for tests configuration
Returns:
List[MaxScoreSample]: The transformed data based on the maximum F1 score.
"""
assert (
test in cls.alias_name
), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"
if isinstance(params["max_score"], dict):
max_scores = params["max_score"]
elif isinstance(params["max_score"], float):
Expand Down Expand Up @@ -250,16 +264,23 @@ class MinGenderRougeScore(BaseFairness):
]
supported_tasks = ["question-answering", "summarization"]

@staticmethod
def transform(data: List[Sample], params: Dict) -> List[MinScoreSample]:
@classmethod
def transform(
cls, test: str, data: List[Sample], params: Dict
) -> List[MinScoreSample]:
"""Computes the min rouge score for the given data.

Args:
test (str): name of the test.
data (List[Sample]): The input data to be transformed.
params (Dict): parameters for tests configuration
Returns:
List[MinScoreSample]: The transformed data based on the minimum F1 score.
"""
assert (
test in cls.alias_name
), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"

if isinstance(params["min_score"], dict):
min_scores = params["min_score"]
elif isinstance(params["min_score"], float):
Expand All @@ -274,7 +295,7 @@ def transform(data: List[Sample], params: Dict) -> List[MinScoreSample]:
sample = MinScoreSample(
original=None,
category="fairness",
test_type=params["test_name"],
test_type=test,
test_case=key,
expected_results=MinScoreOutput(min_score=val),
)
Expand Down Expand Up @@ -343,16 +364,23 @@ class MaxGenderRougeScore(BaseFairness):
]
supported_tasks = ["question-answering", "summarization"]

@staticmethod
def transform(data: List[Sample], params: Dict) -> List[MaxScoreSample]:
@classmethod
def transform(
cls, test: str, data: List[Sample], params: Dict
) -> List[MaxScoreSample]:
"""Computes the rouge score for the given data.

Args:
test (str): name of the test.
data (List[Sample]): The input data to be transformed.
params (Dict): parameters for tests configuration
Returns:
List[MaxScoreSample]: The transformed data based on the rouge score.
"""
assert (
test in cls.alias_name
), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"

if isinstance(params["max_score"], dict):
max_scores = params["max_score"]
elif isinstance(params["max_score"], float):
Expand All @@ -367,7 +395,7 @@ def transform(data: List[Sample], params: Dict) -> List[MaxScoreSample]:
sample = MaxScoreSample(
original=None,
category="fairness",
test_type=params["test_name"],
test_type=test,
test_case=key,
expected_results=MaxScoreOutput(max_score=val),
)
Expand Down
151 changes: 151 additions & 0 deletions tests/test_fairness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import pytest

from langtest.transform.fairness import (
BaseFairness,
MinGenderF1Score,
MaxGenderF1Score,
MinGenderRougeScore,
MaxGenderRougeScore,
)
from langtest.utils.custom_types import SequenceLabel, Span
from langtest.utils.custom_types.output import (
NEROutput,
NERPrediction,
SequenceClassificationOutput,
TranslationOutput,
)
from langtest.utils.custom_types.sample import (
MinScoreQASample,
MaxScoreQASample,
MaxScoreSample,
MinScoreSample,
NERSample,
QASample,
SequenceClassificationSample,
SummarizationSample,
ToxicitySample,
TranslationSample,
)


class Testfairness:
    """A test suite for the fairness test classes' ``transform`` methods.

    Ensures that each fairness test class can transform sample input data
    for every task it supports and produce a list of valid min/max score
    samples.

    The classes tested are MinGenderF1Score, MaxGenderF1Score,
    MinGenderRougeScore, and MaxGenderRougeScore.

    Attributes:
        fairness_config (Dict): mapping of test alias name to the threshold
            parameters ("min_score"/"max_score") passed to ``transform``.
    """

    fairness_config = {
        "min_gender_f1_score": {"min_score": 0.66},
        "max_gender_f1_score": {"max_score": 0.60},
        "min_gender_rouge1_score": {"min_score": 0.66},
        "min_gender_rouge2_score": {"min_score": 0.60},
        "min_gender_rougeL_score": {"min_score": 0.66},
        "min_gender_rougeLsum_score": {"min_score": 0.66},
        "max_gender_rouge1_score": {"max_score": 0.66},
        "max_gender_rouge2_score": {"max_score": 0.60},
        "max_gender_rougeL_score": {"max_score": 0.66},
        "max_gender_rougeLsum_score": {"max_score": 0.66},
    }

    @pytest.fixture
    def sample_data(self):
        """A fixture providing sample data for the fairness transformation tests.

        Returns:
            dict: task name mapped to a list of sample instances of the
            matching sample type (classification, NER, QA, summarization).
        """
        return {
            "text-classification": [
                SequenceClassificationSample(
                    original="The last good ernest movie, and the best at that. how can you not laugh at least once at this movie. the last line is a classic, as is ernest's gangster impressions, his best moment on film. this has his best lines and is a crowning achievement among the brainless screwball comedies.",
                    expected_results=SequenceClassificationOutput(
                        predictions=[SequenceLabel(label="Positive", score=1.0)]
                    ),
                ),
                SequenceClassificationSample(
                    original="After my 6 year old daughter began taking riding lessons I started looking for horse movies for her. I had always heard of National Velvet but had never seen it. Boy am I glad I bought it! It's become a favorite of mine, my 6 year old AND my 2 year old. It's a shame movies like this aren't made anymore.",
                    expected_results=SequenceClassificationOutput(
                        predictions=[SequenceLabel(label="Positive", score=1.0)]
                    ),
                ),
            ],
            "ner": [
                NERSample(
                    original="Attendance : 3,000",
                    expected_results=NEROutput(
                        predictions=[
                            NERPrediction(
                                entity="CARDINAL",
                                span=Span(start=13, end=18, word="3,000"),
                            )
                        ]
                    ),
                ),
                NERSample(
                    original="I do not love KFC",
                    expected_results=NEROutput(
                        predictions=[
                            NERPrediction(
                                entity="PROD", span=Span(start=14, end=17, word="KFC")
                            )
                        ]
                    ),
                ),
            ],
            "question-answering": [
                QASample(
                    original_question="What is John Snow Labs?",
                    original_context="John Snow Labs is a healthcare company specializing in accelerating progress in data science.",
                    expected_results="A healthcare company specializing in accelerating progress in data science. ",
                )
            ],
            "summarization": [
                SummarizationSample(
                    original="John Snow Labs is a healthcare company specializing in accelerating progress in data "
                    "science.",
                    expected_results="JSL is a data science company",
                )
            ],
        }

    @pytest.mark.parametrize(
        "fairness",
        [
            MinGenderF1Score,
            MaxGenderF1Score,
            MinGenderRougeScore,
            MaxGenderRougeScore,
        ],
    )
    def test_transform(self, fairness: BaseFairness, sample_data) -> None:
        """
        Test case for fairness classes.

        Exercises every alias of the fairness class against every task it
        supports, and checks that each transformed result is a min/max
        score sample.

        Args:
            fairness (Type[BaseFairness]): The fairness class to be tested.
            sample_data (dict): task name -> list of sample instances.

        Returns:
            None

        Raises:
            AssertionError: If the transformation or the final result is invalid.
        """
        for alias in fairness.alias_name:
            for task in fairness.supported_tasks:
                transform_results = fairness.transform(
                    alias, sample_data[task], self.fairness_config[alias]
                )
                assert isinstance(transform_results, list)

                # Check every result, not `zip(sample_data, ...)` — zipping the
                # fixture dict iterated its keys and truncated the check to at
                # most len(sample_data) results.
                for result in transform_results:
                    assert isinstance(result, (MaxScoreSample, MinScoreSample))