IBM · shachardon · Sep 26, 2023 · Oct 30, 2023
diff --git a/prepare/cards/natural_instructions.py b/prepare/cards/natural_instructions.py
@@ -0,0 +1,71 @@
+import os.path
+
+import numpy as np
+from datasets import load_dataset
+from src.unitxt.blocks import InputOutputTemplate, LoadHF, SplitRandomMix, TemplatesList
+from src.unitxt.card import TaskCard
+from src.unitxt.catalog import add_to_catalog
+from src.unitxt.instructions import InstructionsList, TextualInstruction
+from src.unitxt.operators import CopyFields, FilterByValues
+from src.unitxt.prepare_utils.card_types import addClassificationChoices
+from src.unitxt.task import FormTask
+from src.unitxt.test_utils.card import test_card
+
+# Generate one unitxt card, plus its instruction, for every task found in the
+# "Muennighoff/natural-instructions" dataset and register both in the catalog.
+
+# Directory holding the generated card files, located relative to this script
+# (prepare/cards/ -> repository root -> src/unitxt/catalog/...) so that the
+# "already exists" check below does not depend on one developer's checkout path.
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+CARDS_DIR = os.path.join(REPO_ROOT, "src", "unitxt", "catalog", "cards", "natural_instructions")
+
+hf_df = load_dataset("Muennighoff/natural-instructions")
+tasks_names = []
+# Only the "train" split is processed by this script.
+for split in ["train"]:
+    names = np.unique(hf_df[split]["task_name"])
+    tasks_names.append(names)
+
+    pandas_df_split = hf_df[split].to_pandas()
+
+    for task in names:
+        print("task name:", task)
+        # The source URL is built from the task name exactly as the dataset spells it.
+        json_url = f"https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/{task}.json"
+        definitions = pandas_df_split[pandas_df_split["task_name"] == task]["definition"].unique().tolist()
+        # Sanity check: each task must have exactly one distinct definition text.
+        assert len(definitions) == 1
+        print(definitions)
+        task_instruction = definitions[0]
+        # Catalog entry names use underscores in place of hyphens.
+        task = task.replace("-", "_")
+
+        if os.path.isfile(os.path.join(CARDS_DIR, split, f"{task}.json")):
+            print("already exists. skipping")
+            continue
+
+        instruction = TextualInstruction(task_instruction)
+        add_to_catalog(instruction, f"instructions.natural_instructions.{split}.{task}", overwrite=True)
+
+        card = TaskCard(
+            loader=LoadHF("json", data_files=json_url, field="Instances"),
+            preprocess_steps=[
+                SplitRandomMix({"train": "train[90%]", "validation": "train[5%]", "test": "train[5%]"}),
+                # Copies "output/0" (presumably the first listed output) into "target".
+                CopyFields(field_to_field=[["output/0", "target"]], use_query=True),
+            ],
+            task=FormTask(inputs=["input"], outputs=["target"], metrics=["metrics.rouge"]),
+            instructions=InstructionsList([f"instructions.natural_instructions.{split}.{task}"]),
+            templates=TemplatesList(["templates.input_output"]),
+        )
+
+        # Best effort: a failing card test is reported but does not stop the run,
+        # and the card is still written to the catalog afterwards.
+        try:
+            test_card(card)
+        except Exception as e:
+            print("error while generating task", task)
+            print(e)
+
+        add_to_catalog(card, f"cards.natural_instructions.{split}.{task}", overwrite=True)
diff --git a/prepare/formats/input_output_prefix.py b/prepare/formats/input_output_prefix.py
@@ -0,0 +1,9 @@
+from src.unitxt.catalog import add_to_catalog
+from src.unitxt.formats import ICLFormat
+
+# ICL format whose input and output prefixes are the literal labels
+# "input: " and "output: ".
+# NOTE(review): `format` shadows the Python builtin; the name is kept as-is here.
+format = ICLFormat(input_prefix="input: ", output_prefix="output: ")
+
+add_to_catalog(format, "formats.input_output_prefix", overwrite=True)
diff --git a/prepare/templates/input_output.py b/prepare/templates/input_output.py
@@ -0,0 +1,8 @@
+from src.unitxt.blocks import InputOutputTemplate
+from src.unitxt.catalog import add_to_catalog
+
+# Template whose input and output formats are just the bare "{input}" and
+# "{target}" placeholders, with no additional wording around them.
+template = InputOutputTemplate(input_format="{input}", output_format="{target}")
+# The catalog name is a fixed identifier, so a plain string literal is used.
+add_to_catalog(template, "templates.input_output", overwrite=True)
diff --git a/src/unitxt/catalog/cards/natural_instructions/train/task001_quoref_question_generation.json b/src/unitxt/catalog/cards/natural_instructions/train/task001_quoref_question_generation.json
@@ -0,0 +1,53 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "data_files": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task001_quoref_question_generation.json",
+        "field": "Instances",
+        "path": "json"
+    },
+    "preprocess_steps": [
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "train[90%]",
+                "validation": "train[5%]",
+                "test": "train[5%]"
+            }
+        },
+        {
+            "type": "copy_fields",
+            "field_to_field": [
+                [
+                    "output/0",
+                    "target"
+                ]
+            ],
+            "use_query": true
+        }
+    ],
+    "task": {
+        "type": "form_task",
+        "inputs": [
+            "input"
+        ],
+        "outputs": [
+            "target"
+        ],
+        "metrics": [
+            "metrics.rouge"
+        ]
+    },
+    "instructions": {
+        "type": "instructions_list",
+        "items": [
+            "instructions.natural_instructions.train.task001_quoref_question_generation"
+        ]
+    },
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            "templates.input_output"
+        ]
+    }
+}
diff --git a/src/unitxt/catalog/cards/natural_instructions/train/task002_quoref_answer_generation.json b/src/unitxt/catalog/cards/natural_instructions/train/task002_quoref_answer_generation.json
@@ -0,0 +1,53 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "data_files": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task002_quoref_answer_generation.json",
+        "field": "Instances",
+        "path": "json"
+    },
+    "preprocess_steps": [
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "train[90%]",
+                "validation": "train[5%]",
+                "test": "train[5%]"
+            }
+        },
+        {
+            "type": "copy_fields",
+            "field_to_field": [
+                [
+                    "output/0",
+                    "target"
+                ]
+            ],
+            "use_query": true
+        }
+    ],
+    "task": {
+        "type": "form_task",
+        "inputs": [
+            "input"
+        ],
+        "outputs": [
+            "target"
+        ],
+        "metrics": [
+            "metrics.rouge"
+        ]
+    },
+    "instructions": {
+        "type": "instructions_list",
+        "items": [
+            "instructions.natural_instructions.train.task002_quoref_answer_generation"
+        ]
+    },
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            "templates.input_output"
+        ]
+    }
+}
diff --git a/...talog/cards/natural_instructions/train/task022_cosmosqa_passage_inappropriate_binary.json b/...talog/cards/natural_instructions/train/task022_cosmosqa_passage_inappropriate_binary.json
@@ -0,0 +1,53 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "data_files": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task022_cosmosqa_passage_inappropriate_binary.json",
+        "field": "Instances",
+        "path": "json"
+    },
+    "preprocess_steps": [
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "train[90%]",
+                "validation": "train[5%]",
+                "test": "train[5%]"
+            }
+        },
+        {
+            "type": "copy_fields",
+            "field_to_field": [
+                [
+                    "output/0",
+                    "target"
+                ]
+            ],
+            "use_query": true
+        }
+    ],
+    "task": {
+        "type": "form_task",
+        "inputs": [
+            "input"
+        ],
+        "outputs": [
+            "target"
+        ],
+        "metrics": [
+            "metrics.rouge"
+        ]
+    },
+    "instructions": {
+        "type": "instructions_list",
+        "items": [
+            "instructions.natural_instructions.train.task022_cosmosqa_passage_inappropriate_binary"
+        ]
+    },
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            "templates.input_output"
+        ]
+    }
+}
diff --git a/...unitxt/catalog/cards/natural_instructions/train/task023_cosmosqa_question_generation.json b/...unitxt/catalog/cards/natural_instructions/train/task023_cosmosqa_question_generation.json
@@ -0,0 +1,53 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "data_files": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task023_cosmosqa_question_generation.json",
+        "field": "Instances",
+        "path": "json"
+    },
+    "preprocess_steps": [
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "train[90%]",
+                "validation": "train[5%]",
+                "test": "train[5%]"
+            }
+        },
+        {
+            "type": "copy_fields",
+            "field_to_field": [
+                [
+                    "output/0",
+                    "target"
+                ]
+            ],
+            "use_query": true
+        }
+    ],
+    "task": {
+        "type": "form_task",
+        "inputs": [
+            "input"
+        ],
+        "outputs": [
+            "target"
+        ],
+        "metrics": [
+            "metrics.rouge"
+        ]
+    },
+    "instructions": {
+        "type": "instructions_list",
+        "items": [
+            "instructions.natural_instructions.train.task023_cosmosqa_question_generation"
+        ]
+    },
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            "templates.input_output"
+        ]
+    }
+}
diff --git a/src/unitxt/catalog/cards/natural_instructions/train/task024_cosmosqa_answer_generation.json b/src/unitxt/catalog/cards/natural_instructions/train/task024_cosmosqa_answer_generation.json
@@ -0,0 +1,53 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "data_files": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task024_cosmosqa_answer_generation.json",
+        "field": "Instances",
+        "path": "json"
+    },
+    "preprocess_steps": [
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "train[90%]",
+                "validation": "train[5%]",
+                "test": "train[5%]"
+            }
+        },
+        {
+            "type": "copy_fields",
+            "field_to_field": [
+                [
+                    "output/0",
+                    "target"
+                ]
+            ],
+            "use_query": true
+        }
+    ],
+    "task": {
+        "type": "form_task",
+        "inputs": [
+            "input"
+        ],
+        "outputs": [
+            "target"
+        ],
+        "metrics": [
+            "metrics.rouge"
+        ]
+    },
+    "instructions": {
+        "type": "instructions_list",
+        "items": [
+            "instructions.natural_instructions.train.task024_cosmosqa_answer_generation"
+        ]
+    },
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            "templates.input_output"
+        ]
+    }
+}