## Spark

In [16]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

spark = sparknlp.start()
spark



In [17]:
documentAssembler = DocumentAssembler()\
		.setInputCol("text")\
		.setOutputCol("document")

tokenizer = Tokenizer()\
		.setInputCols(["document"])\
		.setOutputCol("token")
	
embeddings = WordEmbeddingsModel.pretrained('glove_100d') \
		.setInputCols(["document", 'token']) \
		.setOutputCol("embeddings")

ner = NerDLModel.pretrained("ner_dl", 'en') \
		.setInputCols(["document", "token", "embeddings"]) \
		.setOutputCol("ner")

ner_pipeline = Pipeline().setStages([
				documentAssembler,
				tokenizer,
				embeddings,
				ner
    ])
ner_model = ner_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [18]:
from nlptest import Harness

h3 = Harness(task="ner", model=ner_model, hub="johnsnowlabs", data="demo/data/test.conll")
h3.configure({
    'tasks': ['ner'],
    'tests_types': ['uppercase', 'lowercase'],
    'min_pass_rate':{'default':0.5}
})

{'tasks': ['ner'],
 'tests_types': ['uppercase', 'lowercase'],
 'min_pass_rate': {'default': 0.5}}

In [19]:
h3.generate().run().report()

uppercase
lowercase
25 predictions have different lenghts than dataset and will be ignored.
Please make sure dataset and model uses same tokenizer.


Unnamed: 0,test_type,fail_count,pass_count,pass_rate,minimum_pass_rate,pass
0,uppercase,16.0,75.0,82%,50%,True
1,lowercase,77.0,14.0,15%,50%,False
0,micro-f1,-,-,99%,50%,True
1,macro-f1,-,-,88%,50%,True


## Gender f1 score

In [20]:
import pandas as pd

In [21]:
class RuleBasedGenderClassifier():
    """Rule based gender classifier for bias tests.
    """
    def predict(self, text: str):
        """Predict the gender of given input sentence
        """
        text = text.lower().split()
        female_entities = ['she', 'her', 'hers', 'herself', 'girl', 'girls', 'woman', 'women',
                        'madam', 'madame', 'lady', 'miss', 'mrs', 'female', 'breast', 
                        'ovary', 'ovarian', 'vagina']
        male_entities = ['he', 'his', 'him', 'himself', 'boy', 'man', 'men', 'sir', 'gentleman', 'mr',
                        'male', 'prostate', 'testicle', 'testicular', 'penis']
        female_count = sum([text.count(ent) for ent in female_entities])
        male_count = sum([text.count(ent) for ent in male_entities])

        if female_count > male_count:
            return "F"
        elif male_count > female_count:
            return "M"
        else:
            return "N"

In [14]:
def get_gendered_data(data):
    data = pd.Series(data)
    sentences = pd.Series([x.original for x in data])
    genders = sentences.apply(RuleBasedGenderClassifier().predict)
    gendered_data = {
        "male": data[genders == "M"].tolist(),
        "female": data[genders == "F"].tolist(),
        "neutral": data[genders == "N"].tolist(),
    }
    return gendered_data

In [25]:
from nlptest.modelhandler.modelhandler import ModelFactory
from sklearn.metrics import f1_score

gendered_data = get_gendered_data(h3.data)
m = ModelFactory(ner_model,"ner")

for key in gendered_data:
    y_true = pd.Series(gendered_data[key]).apply(lambda x: [y.entity for y in x.expected_results.predictions])
    X_test = pd.Series(gendered_data[key]).apply(lambda x: x.original)
    y_pred = X_test.apply(m.predict_raw)
    
    valid_indices = y_true.apply(len) == y_pred.apply(len)
    length_mismatch = valid_indices.count()-valid_indices.sum()
    # if length_mismatch > 0:
        # print(f"{length_mismatch} predictions have different lenghts than dataset and will be ignored.\nPlease make sure dataset and model uses same tokenizer.")
    y_true = y_true[valid_indices]
    y_pred = y_pred[valid_indices]

    y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
    y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])

    micro_f1_score = f1_score(y_true, y_pred, average="micro")
    macro_f1_score = f1_score(y_true, y_pred, average="macro")

    gendered_data[key] = pd.DataFrame({
        "test_case": [key, key],
        "test_type": ["micro-f1", "macro-f1"],
        "actual_result": [micro_f1_score, macro_f1_score]
    })

pd.concat(gendered_data.values(), ignore_index=True)

9    30
dtype: int64
9    31
dtype: int64
Series([], dtype: object)
Series([], dtype: object)
12    31
31    31
33    22
35    25
42     8
dtype: int64
12    32
31    32
33    23
35    26
42     9
dtype: int64


Unnamed: 0,test_case,test_type,actual_result
0,male,micro-f1,0.99505
1,male,macro-f1,0.971522
2,female,micro-f1,1.0
3,female,macro-f1,1.0
4,neutral,micro-f1,0.989831
5,neutral,macro-f1,0.883046
