## AdvGlue Evaluation Suite

This Hugging Face `evaluate.EvaluationSuite` compares the GLUE results with Adversarial GLUE (AdvGLUE), a multi-task benchmark that measures the vulnerabilities of modern large-scale
language models under various types of adversarial attacks.

```
@inproceedings{wang2021adversarial,
title={Adversarial GLUE: A Multi-Task Benchmark for Robustness Evaluation of Language Models},
author={Wang, Boxin and Xu, Chejian and Wang, Shuohang and Gan, Zhe and Cheng, Yu and Gao, Jianfeng and Awadallah, Ahmed Hassan and Li, Bo},
booktitle={Advances in Neural Information Processing Systems},
year={2021}
}
```

Let us begin by writing our custom `EvaluationSuite` to evaluate Hugging Face's `glue` metric against both the `glue` and `adv_glue` datasets.

In [None]:
%%writefile glue_with_class.py

import evaluate
from evaluate import EvaluationSuite, push_to_hub
from evaluate.evaluation_suite import SubTask
from evaluate.visualization import radar_plot
from intel_evaluate_extension.model_card_suite import ModelCardSuiteResults

# Heading text for this suite's results; assigned to `Suite.header`.
_HEADER = "GLUE/AdvGlue Evaluation Results"

# Human-readable summary of the suite; assigned to `Suite.summary`.
_DESCRIPTION = """
The suite compares the GLUE results with Adversarial GLUE (AdvGLUE), a multi-task benchmark
to measure the vulnerabilities of modern large-scale language models under various types of adversarial attacks."""
        
class Suite(ModelCardSuiteResults):
    """Evaluation suite pairing each GLUE task with its AdvGLUE counterpart.

    Sub-tasks are registered in strictly alternating order (GLUE, AdvGLUE,
    GLUE, AdvGLUE, ...). The plotting helpers rely on that ordering: results
    at even positions are treated as GLUE and results at odd positions as
    AdvGLUE.
    """

    # One row per task:
    #   (glue metric config / `glue` subset name,
    #    split of the `glue` dataset,
    #    first input column,
    #    second input column or None for single-sentence tasks,
    #    mapping from pipeline label strings to dataset label ids)
    _TASKS = (
        ("sst2", "validation[:5]", "sentence", None,
         {"LABEL_0": 0.0, "LABEL_1": 1.0}),
        ("qqp", "validation[:5]", "question1", "question2",
         {"LABEL_0": 0, "LABEL_1": 1}),
        ("qnli", "validation[:5]", "question", "sentence",
         {"LABEL_0": 0, "LABEL_1": 1}),
        ("rte", "validation[:5]", "sentence1", "sentence2",
         {"LABEL_0": 0, "LABEL_1": 1}),
        ("mnli", "validation_mismatched[:5]", "premise", "hypothesis",
         {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}),
    )

    # Split used for every `adv_glue` subset.
    _ADV_SPLIT = "validation[:5]"

    def __init__(self, name):
        """Register the GLUE/AdvGLUE sub-task pairs.

        Args:
            name: Suite name, forwarded unchanged to the base class.
        """
        super().__init__(name)
        # NOTE(review): none of the sub-tasks below reference this
        # preprocessor, and it reads a `text` field while the sub-tasks use
        # columns such as `sentence` -- confirm whether it is still needed.
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        # Presumably the metric keys the base class reports -- TODO confirm.
        self.result_keys = ["accuracy", "f1"]
        self.summary = _DESCRIPTION
        self.header = _HEADER

        self.suite = []
        for config, glue_split, first_col, second_col, labels in self._TASKS:
            # Keep the GLUE task immediately before its AdvGLUE twin; the
            # even/odd slicing in `_radar_data` depends on it.
            self.suite.append(self._subtask(
                "glue", config, glue_split,
                config, first_col, second_col, labels))
            self.suite.append(self._subtask(
                "adv_glue", "adv_" + config, self._ADV_SPLIT,
                config, first_col, second_col, labels))

    @staticmethod
    def _subtask(data, subset, split, config_name,
                 input_column, second_input_column, label_mapping):
        """Build one text-classification `SubTask` scored with the `glue` metric."""
        args_for_task = {
            "metric": "glue",
            "input_column": input_column,
            # Passed explicitly for every task, including MNLI.
            # Assumes "label" is also the evaluator's default -- TODO confirm.
            "label_column": "label",
            "config_name": config_name,
            # Copy so that no two sub-tasks share one mutable mapping.
            "label_mapping": dict(label_mapping),
        }
        if second_input_column is not None:
            args_for_task["second_input_column"] = second_input_column
        return SubTask(
            task_type="text-classification",
            data=data,
            subset=subset,
            split=split,
            args_for_task=args_for_task,
        )

    @staticmethod
    def _radar_data(results):
        """Return ``[glue_scores, adv_glue_scores]`` for `radar_plot`.

        Each element maps ``"accuracy <task>"`` to that result's accuracy.
        The ``adv_`` prefix is stripped from AdvGLUE task names so both
        dicts share the same keys.
        """
        glue_scores = {
            "accuracy " + result["task_name"].split("/")[-1]: result["accuracy"]
            for result in results[::2]
        }
        adv_scores = {
            "accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]:
                result["accuracy"]
            for result in results[1::2]
        }
        return [glue_scores, adv_scores]

    def process_results(self, results):
        """Return a radar plot of GLUE vs. AdvGLUE accuracy.

        Args:
            results: Sequence of per-sub-task result dicts, in suite order,
                each with ``"task_name"`` and ``"accuracy"`` entries.

        Returns:
            Whatever `radar_plot` returns for the two series labelled
            ``'GLUE'`` and ``'AdvGLUE'``.
        """
        return radar_plot(self._radar_data(results), ['GLUE', 'AdvGLUE'])

    def plot_results(self, results, model_or_pipeline):
        """Return a radar plot whose series labels include the model name.

        Args:
            results: Sequence of per-sub-task result dicts, in suite order.
            model_or_pipeline: Model identifier appended to the series
                labels; formatted with ``str()`` so non-string values do
                not raise.

        Returns:
            Whatever `radar_plot` returns for the two labelled series.
        """
        # Build the data directly: `process_results` returns an already
        # rendered plot, which is not valid input for `radar_plot`.
        radar_data = self._radar_data(results)
        labels = [f"GLUE {model_or_pipeline}", f"AdvGLUE {model_or_pipeline}"]
        return radar_plot(radar_data, labels)


## Run EvaluationSuite to Compare GLUE/AdvGlue Results

Now we run our `EvaluationSuite` on the `gpt2` model.

In [None]:
%%time
from evaluate import EvaluationSuite

# Load the suite from the `glue_with_class.py` file written by the earlier %%writefile cell.
suite = EvaluationSuite.load('glue_with_class.py')
# Run the suite against the `gpt2` model; `mc_results` is reused later when
# rendering the model card template.
mc_results = suite.run("gpt2")

## Create a Custom Model Card Markdown Template for Results

In [None]:
%%writefile custom_template.md
---
{{ card_data }}
---

# Model Card for MyCoolModel

This model does this and that.

## Results

{% for result in results.collection %}
{{result.header}}

{{result.table}}

{{result.graphic}}

{% endfor %}

## Model Card Contact

This model was created by [@{{ author }}](https://hf.co/{{author}}).

## Push Evaluation Results to Model Card on Hugging Face Hub

In [None]:
from huggingface_hub import ModelCard, ModelCardData, EvalResult

# Location of the model card to update; "..." is a placeholder to fill in.
hub_path = "..."

# Download the existing model card
card = ModelCard.load(hub_path)
# Generate a new model card from the custom template, reusing the existing
# card's metadata and exposing the evaluation results as `results`.
# NOTE(review): custom_template.md also references `{{ author }}`, which is
# not passed here -- confirm whether an `author=` keyword should be supplied.
card = ModelCard.from_template(card.data, 'custom_template.md', results=mc_results)
# Push the regenerated model card to the hub
card.push_to_hub(hub_path)