<a href="https://colab.research.google.com/github/MoritzLaurer/zeroshot-classifier/blob/main/3_data_formatting_universal_nli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Universal task formatting
This notebook formats all non-NLI datasets into the universal NLI format and adds standard NLI datasets. The result is the final train and test data.

### Install and setup

In [None]:
!pip install datasets~=2.14.0 -qq

In [None]:
## load packages
import torch
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
import os
import re
import time
import random
import tqdm
from collections import Counter

from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from datasets import ClassLabel

from google.colab.data_table import DataTable
from google.colab import data_table
from IPython.display import display
data_table.enable_dataframe_formatter() # https://colab.research.google.com/notebooks/data_table.ipynb#scrollTo=JgBtx0xFFv_i

## set global seed for reproducibility and against seed hacking
# seed every RNG imported above (python `random`, torch, numpy), not only numpy,
# so that unseeded calls to any of them are reproducible as well.
# np.random.seed comes last because it returns None and therefore produces no cell output.
SEED_GLOBAL = 42
random.seed(SEED_GLOBAL)
torch.manual_seed(SEED_GLOBAL)
np.random.seed(SEED_GLOBAL)

In [None]:
## connect to google drive
# Mounts Google Drive into the Colab VM at /content/drive.
# With force_remount=False an already mounted drive is left as it is.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

# set wd
# Switch the working directory to the project folder on Drive and print the
# directory before and after the change.
# NOTE(review): the path is hardcoded to one specific Drive layout - adjust it when running elsewhere.
print(os.getcwd())
os.chdir("/content/drive/My Drive/PhD/zero-shot-models")
print(os.getcwd())

# local config.py file with tokens
# NOTE(review): imported only after os.chdir - presumably config.py sits in the project folder
# and is resolved via the working directory; confirm before moving this import.
import config

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/My Drive/PhD/zero-shot-models


### Overarching functions

In [None]:
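# functions for formatting the data into the universal NLI format
# note that train and test data need to be handled differently: the train set gets a
# random sample of not_entail examples, while each test text is paired with every hypothesis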

def format_nli_trainset(df_train=None, hypo_label_dic=None, random_seed=42):
  """Reformat a single-label classification train set into binary NLI pairs.

  For every (label_text, hypothesis) pair in `hypo_label_dic`, all texts with
  that label are paired with the hypothesis as entailment (labels == 0) and a
  random sample of texts with any other label is paired with the same
  hypothesis as not_entailment (labels == 1). Per class, at most as many
  not_entailment rows as entailment rows are drawn.

  Args:
    df_train: DataFrame with a `label_text` column. It is not modified.
    hypo_label_dic: dict mapping label_text values to their hypothesis.
      Hypotheses are used as given; nothing is filtered out here.
    random_seed: seed for sampling the not_entail rows and for the final shuffle.

  Returns:
    A shuffled copy with the columns of `df_train` plus `hypothesis` and
    `labels` (int, 0 = entailment, 1 = not_entailment). The index is not
    reset, so index labels repeat when a text is used for several hypotheses.
  """
  n_texts = len(df_train)
  print("\nFor NLI: Augmenting data by adding random not_entail examples to the train set from other classes within the train set.")
  print(f"Length of df_train before this step is: {n_texts}.\n")
  print(f"Max augmentation can be: len(df_train) * 2 = {n_texts*2}. Can also be lower, if there are more entail examples than not-entail for a majority class")

  df_train_lst = []
  n_not_entail_added = 0  # reported in the final log message
  for label_text, hypothesis in hypo_label_dic.items():
    ## entailment: every text of this class paired with the class hypothesis
    df_train_step = df_train[df_train.label_text == label_text].copy(deep=True)
    df_train_step["hypothesis"] = [hypothesis] * len(df_train_step)
    df_train_step["labels"] = [0] * len(df_train_step)
    ## not_entailment: texts from all other classes paired with the same hypothesis
    df_train_step_not_entail = df_train[df_train.label_text != label_text].copy(deep=True)
    # could try weighing the sample texts for not_entail here. e.g. to get same n texts for each label
    # never draw more not_entail rows than there are entail rows (or than are available)
    n_sample = min(len(df_train_step), len(df_train_step_not_entail))
    df_train_step_not_entail = df_train_step_not_entail.sample(n=n_sample, random_state=random_seed)  # can try sampling more not_entail here
    df_train_step_not_entail["hypothesis"] = [hypothesis] * len(df_train_step_not_entail)
    df_train_step_not_entail["labels"] = [1] * len(df_train_step_not_entail)
    n_not_entail_added += len(df_train_step_not_entail)
    # append
    df_train_lst.append(pd.concat([df_train_step, df_train_step_not_entail]))
  df_train = pd.concat(df_train_lst)

  # shuffle
  df_train = df_train.sample(frac=1, random_state=random_seed)
  df_train["labels"] = df_train.labels.apply(int)
  print(f"For NLI: {n_not_entail_added} not_entail training examples were added, which leads to an augmented training dataset of length {len(df_train)}.")

  return df_train.copy(deep=True)


def format_nli_testset(df_test=None, hypo_label_dic=None):
  """Expand a classification test set so that each text is paired with every hypothesis.

  Args:
    df_test: DataFrame with a `label_text` column. It is not modified.
    hypo_label_dic: dict mapping label_text values to their hypothesis.

  Returns:
    A copy of `df_test` with one row per (text, hypothesis) combination and
    the added columns `hypothesis` and `labels` (0 where the hypothesis equals
    the hypothesis of the row's label_text, 1 otherwise). Rows of one text stay
    together, in the order of `hypo_label_dic`, and share their original index.
  """
  hypothesis_lst = list(hypo_label_dic.values())
  print("Number of hypotheses/classes: ", len(hypothesis_lst), "\n")

  # per class: one 0/1 entry for each hypothesis, in the order of hypothesis_lst
  # (i.e. dict order); 0 marks the class's own hypothesis, 1 all other hypotheses
  labels_by_class = {
    label_text: [0 if own_hypo == hypo else 1 for hypo in hypothesis_lst]
    for label_text, own_hypo in hypo_label_dic.items()
  }

  # every row first receives the full list of labels and the full list of hypotheses
  n_texts = len(df_test)
  df_expanded = df_test.copy(deep=True)
  df_expanded["labels"] = df_expanded.label_text.map(labels_by_class)
  df_expanded["hypothesis"] = [hypothesis_lst] * n_texts
  print(f"For normal test, N classifications necessary: {n_texts}")

  # turn the list cells into one row per hypothesis, i.e. K rows per text
  # ! the exploded rows must not be sampled/shuffled afterwards: the evaluation relies on this row order
  print("Reminder: do not sample these test-sets anymore after formatting. Row order needs to be preserved for testing.")
  df_expanded = df_expanded.explode(["hypothesis", "labels"])  # multi-column explode requires pd.__version__ >= '1.3.0'
  print(f"For NLI test, N classifications necessary: {len(df_expanded)}\n")

  return df_expanded


### Formulate hypotheses for each task/class

In [None]:
# Hugging Face 0-shot pipeline template is f"This example is {}"
# https://huggingface.co/docs/transformers/v4.35.0/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline.__call__.hypothesis_template

task_hypotheses = {
    "wellformedquery":
        {
        "well_formed": "This example is a well formed Google query.",
        "not_well_formed": "This example is not a well formed Google query"
        },
    "biasframes_sex":
        {
        "sex": "This example contains allusions to sexual content.",
        "not_sex": "This example does not contain allusions to sexual content."
        },
    "biasframes_intent":
        {
        "intent": "The intent of this example is to be offensive/disrespectful.",
        "not_intent": "The intent of this example is not to be offensive/disrespectful."
        },
    "biasframes_offensive":
        {
        "offensive": "This example could be considered offensive, disrespectful, or toxic.",
        "not_offensive": "This example could not be considered offensive, disrespectful, or toxic."
        },
    "financialphrasebank":
        {
        "negative": "The sentiment in this example is negative from an investor's perspective.",
        "neutral": "The sentiment in this example is neutral from an investor's perspective.",
        "positive": "The sentiment in this example is positive from an investor's perspective."
        },
    "rottentomatoes":
        {
        "negative": "The sentiment in this example rotten tomatoes movie review is negative",
        "positive": "The sentiment in this example rotten tomatoes movie review is positive"
        },
    "amazonpolarity":
        {
        "negative": "The sentiment in this example amazon product review is negative",
        "positive": "The sentiment in this example amazon product review is positive"
        },
    "imdb":
        {
        "negative": "The sentiment in this example imdb movie review is negative",
        "positive": "The sentiment in this example imdb movie review is positive"
        },
    "appreviews":
        {
        "positive": "The sentiment in this example app review is positive.",
        "negative": "The sentiment in this example app review is negative."
        },
    "yelpreviews":
        {
        "positive": "The sentiment in this example yelp review is positive.",
        "negative": "The sentiment in this example yelp review is negative."
        },
    'wikitoxic_toxicaggregated':
        {
        'toxicaggregated': 'This example wikipedia comment contains toxic language.',
        'not_toxicaggregated': 'This example wikipedia comment does not contain toxic language.'
        },
    'wikitoxic_obscene':
        {
        'obscene': 'This example wikipedia comment contains obscene language.',
        'not_obscene': 'This example wikipedia comment does not contain obscene language.'
        },
    'wikitoxic_threat':
        {
        'threat': 'This example wikipedia comment contains a threat.',
        'not_threat': 'This example wikipedia comment does not contain a threat.'
        },
    'wikitoxic_insult':
        {
        'insult': 'This example wikipedia comment contains an insult.',
        'not_insult': 'This example wikipedia comment does not contain an insult.'
        },
    'wikitoxic_identityhate':
        {
        'identityhate': 'This example wikipedia comment contains identity hate.',
        'not_identityhate': 'This example wikipedia comment does not contain identity hate.'
        },
    "hateoffensive":
        {
        "hate_speech": "This example tweet contains hate speech.",
        "offensive": "This example tweet contains offensive language without hate speech.",
        "neither": "This example tweet contains neither offensive language nor hate speech.",
        },
    "hatexplain":
        {
        "hate_speech": "This example text from twitter or gab contains hate speech.",
        "offensive": "This example text from twitter or gab contains offensive language without hate speech.",
        "neither": "This example text from twitter or gab contains neither offensive language nor hate speech.",
        },
    "spam":
         {
         "spam": "This example sms is spam.",
         "not_spam": "This example sms is not spam.",
         },
    "emotiondair":
        {
        'anger': "This example tweet expresses the emotion: anger",
        'fear': "This example tweet expresses the emotion: fear",
        'joy': "This example tweet expresses the emotion: joy",
        'love': "This example tweet expresses the emotion: love",
        'sadness': "This example tweet expresses the emotion: sadness",
        'surprise': "This example tweet expresses the emotion: surprise",
        },
    "emocontext":
        {
        'angry': "This example tweet expresses the emotion: anger",
        'sad': "This example tweet expresses the emotion: sadness",
        'happy': "This example tweet expresses the emotion: happiness",
        'others': "This example tweet does not express any of the emotions: anger, sadness, or happiness",
        },
    "empathetic":
        {
        'afraid': 'The main emotion of this example dialogue is: afraid',
        'angry': 'The main emotion of this example dialogue is: angry',
        'annoyed': 'The main emotion of this example dialogue is: annoyed',
        'anticipating': 'The main emotion of this example dialogue is: anticipating',
        'anxious': 'The main emotion of this example dialogue is: anxious',
        'apprehensive': 'The main emotion of this example dialogue is: apprehensive',
        'ashamed': 'The main emotion of this example dialogue is: ashamed',
        'caring': 'The main emotion of this example dialogue is: caring',
        'confident': 'The main emotion of this example dialogue is: confident',
        'content': 'The main emotion of this example dialogue is: content',
        'devastated': 'The main emotion of this example dialogue is: devastated',
        'disappointed': 'The main emotion of this example dialogue is: disappointed',
        'disgusted': 'The main emotion of this example dialogue is: disgusted',
        'embarrassed': 'The main emotion of this example dialogue is: embarrassed',
        'excited': 'The main emotion of this example dialogue is: excited',
        'faithful': 'The main emotion of this example dialogue is: faithful',
        'furious': 'The main emotion of this example dialogue is: furious',
        'grateful': 'The main emotion of this example dialogue is: grateful',
        'guilty': 'The main emotion of this example dialogue is: guilty',
        'hopeful': 'The main emotion of this example dialogue is: hopeful',
        'impressed': 'The main emotion of this example dialogue is: impressed',
        'jealous': 'The main emotion of this example dialogue is: jealous',
        'joyful': 'The main emotion of this example dialogue is: joyful',
        'lonely': 'The main emotion of this example dialogue is: lonely',
        'nostalgic': 'The main emotion of this example dialogue is: nostalgic',
        'prepared': 'The main emotion of this example dialogue is: prepared',
        'proud': 'The main emotion of this example dialogue is: proud',
        'sad': 'The main emotion of this example dialogue is: sad',
        'sentimental': 'The main emotion of this example dialogue is: sentimental',
        'surprised': 'The main emotion of this example dialogue is: surprised',
        'terrified': 'The main emotion of this example dialogue is: terrified',
        'trusting': 'The main emotion of this example dialogue is: trusting'
        },
    "agnews":
        {
        'Business': "This example news text is about business news",
        'Sci/Tech': "This example news text is about science and technology",
        'Sports': "This example news text is about sports",
        'World': "This example news text is about world news"
        },
    "yahootopics":
        {
        'Business & Finance': 'This example question from the Yahoo Q&A forum is categorized in the topic: Business & Finance',
        'Computers & Internet': 'This example question from the Yahoo Q&A forum is categorized in the topic: Computers & Internet',
        'Education & Reference': 'This example question from the Yahoo Q&A forum is categorized in the topic: Education & Reference',
        'Entertainment & Music': 'This example question from the Yahoo Q&A forum is categorized in the topic: Entertainment & Music',
        'Family & Relationships': 'This example question from the Yahoo Q&A forum is categorized in the topic: Family & Relationships',
        'Health': 'This example question from the Yahoo Q&A forum is categorized in the topic: Health',
        'Politics & Government': 'This example question from the Yahoo Q&A forum is categorized in the topic: Politics & Government',
        'Science & Mathematics': 'This example question from the Yahoo Q&A forum is categorized in the topic: Science & Mathematics',
        'Society & Culture': 'This example question from the Yahoo Q&A forum is categorized in the topic: Society & Culture',
        'Sports': 'This example question from the Yahoo Q&A forum is categorized in the topic: Sports'
        },
    "massive":
        {
        'datetime_query': "The intent of this example utterance is a datetime query.",
        'iot_hue_lightchange': "The intent of this example utterance is changing the light.",
        'transport_ticket': "This example utterance is about transport tickets.",
        'takeaway_query': "This example utterance is about takeaway food.",
        'qa_stock': "This example utterance is about stocks.",
        'general_greet': "This example utterance is a general greet.",
        'recommendation_events': "This example utterance is about event recommendations.",
        'music_dislikeness': "The intent of this example utterance is signalling music dislike.",
        'iot_wemo_off': "The intent of this example utterance is turning an IoT device off.",
        'cooking_recipe': "This example utterance is about cooking recipies.",
        'qa_currency': "This example utteranceis about currencies.",
        'transport_traffic': "This example utterance is about transport or traffic.",
        'general_quirky': np.nan,  # unclear category, better to exclude
        'weather_query': "This example utterance is a query about the wheather.",
        'audio_volume_up': "The intent of this example utterance is turning the audio volume up.",
        'email_addcontact': "The intent of this example utterance is adding an email address to contacts.",
        'takeaway_order': "The intent of this example utterance is to order takeaway food.",
        'email_querycontact': "The intent of this example utterance is to query contact details.",
        'iot_hue_lightup': "The intent of this example utterance is to brighten lights.",
        'recommendation_locations': "The intent of this example utterance is receiving recommendations for good locations.",
        'play_audiobook': "The example utterance is related to playing audiobooks.",
        'lists_createoradd': "The example utterance is related to creating or adding to lists.",
        'news_query': "The example utterance is a query about the news.",
        'alarm_query': "The example utterance is a query about alarms.",
        'iot_wemo_on': "The intent of the example utterance is to turn an IoT device on.",
        'general_joke': "The intent of the example utterance is to hear a joke.",
        'qa_definition': "The example utterance is a query about a definition.",
        'social_query': "The example utterance is a query about a social network.",
        'music_settings': "The intent of the example utterance is to change music settings.",
        'audio_volume_other': "The example utterance is related to audio volume.",
        'calendar_remove': "The intent of the example utterance is to remove something from a calendar.",
        'iot_hue_lightdim': "The intent of the example utterance is to dim the lights.",
        'calendar_query': "The example utterance is a query about a calendar.",
        'email_sendemail': "The intent of the example utterance is to send an email.",
        'iot_cleaning': "The intent of the example utterance is for an IoT device to start cleaning.",
        'audio_volume_down': "The intent of the example utterance is to lower the volume.",
        'play_radio': "The intent of the example utterance is to play something on the radio.",
        'cooking_query': "The example utterance is a query about cooking.",
        'datetime_convert': "The example utterance is related to date time changes or conversion.",
        'qa_maths': "The example utterance is a question about maths.",
        'iot_hue_lightoff': "The example utterance is related to turning the lights off.",
        'iot_hue_lighton': "The example utterance is related to turning the lights on.",
        'transport_query': "The example utterance is a query about transport or travels.",
        'music_likeness': "The example utterance is related to liking music.",
        'email_query': "The example utterance is a query about emails.",
        'play_music': "The intent of this example utterance is for an IoT device to play music.",
        'audio_volume_mute': "The intent of this example utterance is to mute the volume.",
        'social_post': "The example utterance is about social media posts.",
        'alarm_set': "The intent of the example utterance is to set an alarm.",
        'qa_factoid': "The example utterance is a factoid question.",
        'calendar_set': "The intent of this example utterance is to set something in a calendar.",
        'play_game': "The intent of this example utterance is to start playing a game.",
        'alarm_remove': "The intent of this example utterance is to remove an alarm.",
        'lists_remove': "The intent of this example utterance is to remove a list or remove something from a list.",
        'transport_taxi': "The intent of this example utterance is to get a taxi.",
        'recommendation_movies': "This example utterance is about movie recommendations.",
        'iot_coffee': "The intent of this example utterance is for an IoT device to make coffee.",
        'music_query': "The example utterance is a query about music.",
        'play_podcasts': "The example utterance is related to playing podcasts.",
        'lists_query': "The example utterance is a query about a list."
        },
    "banking77":
        {
        'activate_my_card': "This banking customer example message is about activating a card.",
        'age_limit': "This banking customer example message is related to age limits.",
        'apple_pay_or_google_pay': "This banking customer example message is about apple pay or google pay",
        'atm_support': "This banking customer example message requests ATM support.",
        'automatic_top_up': "This banking customer example message is about automatic top up.",
        'balance_not_updated_after_bank_transfer': "This banking customer example message is about a balance not updated after a transfer.",
        'balance_not_updated_after_cheque_or_cash_deposit': "This banking customer example message is about a balance not updated after a cheque or cash deposit.",
        'beneficiary_not_allowed': "This banking customer example message is related to a beneficiary not being allowed or a failed transfer.",
        'cancel_transfer': "This banking customer example message is related to the cancellation of a transfer.",
        'card_about_to_expire': "This banking customer example message is related to the expiration of a card.",
        'card_acceptance': "This banking customer example message is related to the scope of acceptance of a card.",
        'card_arrival': "This banking customer example message is about the arrival of a card.",
        'card_delivery_estimate': "This banking customer example message is about a card delivery estimate or timing.",
        'card_linking': np.nan,  # category does not seem coherent.
        'card_not_working': "This banking customer example message is about a card not working.",
        'card_payment_fee_charged': "This banking customer example message is about a card payment fee.",
        'card_payment_not_recognised': "This banking customer example message is about a payment the customer does not recognise.",
        'card_payment_wrong_exchange_rate': "This banking customer example message is about a wrong exchange rate.",
        'card_swallowed': "This banking customer example message is about a card swallowed by a machine.",
        'cash_withdrawal_charge': "This banking customer example message is about a cash withdrawal charge.",
        'cash_withdrawal_not_recognised': "This banking customer example message is about an unrecognised cash withdrawal.",
        'change_pin': "This banking customer example message is about changing a pin code.",
        'compromised_card': "This banking customer example message is about a compromised card.",
        'contactless_not_working': "This banking customer example message is about contactless not working",
        'country_support': "This banking customer example message is about country-specific support.",
        'declined_card_payment': "This banking customer example message is about a declined card payment.",
        'declined_cash_withdrawal': "This banking customer example message is about a declined cash withdrawal.",
        'declined_transfer': "This banking customer example message is about a declined transfer.",
        'direct_debit_payment_not_recognised': "This banking customer example message is about an unrecognised direct debit payment.",
        'disposable_card_limits': "This banking customer example message is about the limits of disposable cards.",
        'edit_personal_details': "This banking customer example message is about editing personal details.",
        'exchange_charge': "This banking customer example message is about exchange rate charges.",
        'exchange_rate': "This banking customer example message is about exchange rates.",
        'exchange_via_app': np.nan, # noisy category
        'extra_charge_on_statement': "This banking customer example message is about an extra charge.",
        'failed_transfer': "This banking customer example message is about a failed transfer.",
        'fiat_currency_support': "This banking customer example message is about fiat currency support",
        'get_disposable_virtual_card': "This banking customer example message is about getting a disposable virtual card.",
        'get_physical_card': np.nan,  # noisy category
        'getting_spare_card': "This banking customer example message is about getting a spare card.",
        'getting_virtual_card': "This banking customer example message is about getting a virtual card.",
        'lost_or_stolen_card': "This banking customer example message is about a lost or stolen card.",
        'lost_or_stolen_phone': "This banking customer example message is about a lost or stolen phone.",
        'order_physical_card': "This banking customer example message is about ordering a card.",
        'passcode_forgotten': "This banking customer example message is about a forgotten passcode.",
        'pending_card_payment': "This banking customer example message is about a pending card payment.",
        'pending_cash_withdrawal': "This banking customer example message is about a pending cash withdrawal.",
        'pending_top_up': "This banking customer example message is about a pending top up.",
        'pending_transfer': "This banking customer example message is about a pending transfer.",
        'pin_blocked': "This banking customer example message is about a blocked pin.",
        'receiving_money': "This banking customer example message is about receiving money.",
        'Refund_not_showing_up': "This customer example message is about a refund not showing up.",
        'request_refund': "This banking customer example message is about a refund request.",
        'reverted_card_payment?': "This banking customer example message is about reverting a card payment.",
        'supported_cards_and_currencies': np.nan,  # don't understand category.
        'terminate_account': "This banking customer example message is about terminating an account.",
        'top_up_by_bank_transfer_charge': np.nan,  # noisy
        'top_up_by_card_charge': "This banking customer example message is about the charge for topping up by card.",
        'top_up_by_cash_or_cheque': "This banking customer example message is about topping up by cash or cheque.",
        'top_up_failed': "This banking customer example message is about top up issues or failures.",
        'top_up_limits': "This banking customer example message is about top up limitations.",
        'top_up_reverted': "This banking customer example message is about issues with topping up.",
        'topping_up_by_card': "This banking customer example message is about topping up by card.",
        'transaction_charged_twice': "This banking customer example message is about a transaction charged twice.",
        'transfer_fee_charged': "This banking customer example message is about an issue with a transfer fee charge.",
        'transfer_into_account': "This banking customer example message is about transfers into the customer's own account.",
        'transfer_not_received_by_recipient': "This banking customer example message is about a transfer that has not arrived yet.",
        'transfer_timing': "This banking customer example message is about transfer timing.",
        'unable_to_verify_identity': "This banking customer example message is about an issue with identity verification.",
        'verify_my_identity': "This banking customer example message is about identity verification.",
        'verify_source_of_funds': "This banking customer example message is about the source of funds.",
        'verify_top_up': "This banking customer example message is about verification and top ups",
        'virtual_card_not_working': "This banking customer example message is about a virtual card not working",
        'visa_or_mastercard': "This banking customer example message is about types of bank cards.",
        'why_verify_identity': "This banking customer example message questions why identity verification is necessary.",
        'wrong_amount_of_cash_received': "This banking customer example message is about a wrong amount of cash received.",
        'wrong_exchange_rate_for_cash_withdrawal': "This banking customer example message is about a wrong exchange rate for a cash withdrawal."
        },
    "trueteacher":
        {
        "factually_inconsistent": "The example summary is factually inconsistent with the full article.",
        "factually_consistent": "The example summary is factually consistent with the full article.",
        },
    "capsotu":
        {
        'Agriculture': "This example text from a US presidential speech is about agriculture",
        'Culture': "This example text from a US presidential speech is about cultural policy",
        'Civil Rights': "This example text from a US presidential speech is about civil rights or minorities or civil liberties",
        'Defense': "This example text from a US presidential speech is about defense or military",
        'Domestic Commerce': "This example text from a US presidential speech is about banking or finance or commerce",
        'Education': "This example text from a US presidential speech is about education",
        'Energy': "This example text from a US presidential speech is about energy or electricity or fossil fuels",
        'Environment': "This example text from a US presidential speech is about the environment or water or waste or pollution",
        'Foreign Trade': "This example text from a US presidential speech is about foreign trade",
        'Government Operations': "This example text from a US presidential speech is about government operations or administration",
        'Health': "This example text from a US presidential speech is about health",
        'Housing': "This example text from a US presidential speech is about community development or housing issues",
        'Immigration': "This example text from a US presidential speech is about migration",
        'International Affairs': "This example text from a US presidential speech is about international affairs or foreign aid",
        'Labor': "This example text from a US presidential speech is about employment or labour",
        'Law and Crime': "This example text from a US presidential speech is about law, crime or family issues",
        'Macroeconomics': "This example text from a US presidential speech is about macroeconomics",
        #'Other': "This example text from a US presidential speech is about other, miscellaneous",
        'Public Lands': "This example text from a US presidential speech is about public lands or water management",
        'Social Welfare': "This example text from a US presidential speech is about social welfare",
        'Technology': "This example text from a US presidential speech is about space or science or technology or communications",
        'Transportation': "This example text from a US presidential speech is about transportation",
        },
    "manifesto":
        {
        "Welfare State Expansion": "This example text from a political party manifesto is positive towards the welfare state, e.g. health care, pensions or social housing",
        "Technology and Infrastructure: Positive": "This example text from a political party manifesto is about technology and infrastructure, e.g. the importance of modernisation of industry, or supportive of public spending on infrastructure/tech",
        "Education Expansion": "This example text from a political party manifesto is about the need to expand/improve policy on education",
        "Environmental Protection": "This example text from a political party manifesto is in favour of environmental protection, e.g. fighting climate change or 'green' policies or preservation of natural resources or animal rights",
        "Law and Order: Positive": "This example text from a political party manifesto is positive towards law and order and strict law enforcement",
        "Equality: Positive": "This example text from a political party manifesto is positive towards equality or social justice, e.g. protection of underprivileged groups or fair distribution of resources",
        "Agriculture and Farmers: Positive": "This example text from a political party manifesto is positive towards policies for agriculture and farmers",
        "Market Regulation": "This example text from a political party manifesto is supports market regulation for a fair and open market, for example for consumer protection or for increased competition or for social market economy",
        "Incentives: Positive": "This example text from a political party manifesto is favourable towards supply side economic policies supporting businesses, for example for incentives like subsidies or tax breaks",
        "Governmental and Administrative Efficiency": "This example text from a political party manifesto is in favour of efficiency in government/administration, for example by restructuring civil service or improving bureaucracy",
        "Labour Groups: Positive": "This example text from a political party manifesto is positive towards labour groups, for example for good working conditions, fair wages or unions",
        "Decentralization": "This example text from a political party manifesto is for decentralisation or federalism",
        "Anti-Growth Economy: Positive": "This example text from a political party manifesto is in favour of anti-growth politics",
        "Culture: Positive": "This example text from a political party manifesto is in favour of cultural policies or leisure facilities, for example museus, libraries or public sport clubs",
        "Non-economic Demographic Groups": "This example text from a political party manifesto favourably mentions non-economic demographic groups like women, students or specific age groups",
        "Military: Positive": "This example text from a political party manifesto is positive towards the military, for example for military spending or rearmament or military treaty obligations",
        "Political Authority": "This example text from a political party manifesto mentions the speaker's competence to govern or other party's lack of such competence, or favourably mentions a strong/stable government",
        "Internationalism: Positive": "This example text from a political party manifesto is in favour of international cooperation with other countries, for example mentions the need for aid to developing countries, or global governance",
        "Democracy": "This example text from a political party manifesto favourably mentions democracy or democratic procedures or institutions",
        "Multiculturalism: Positive": "This example text from a political party manifesto favourably mentions cultural diversity, for example for freedom of religion or linguistic heritages",
        "Freedom and Human Rights": "This example text from a political party manifesto is in favour of freedom and human rights, for example freedom of speech, assembly or against state coercion or for individualism",
        "Economic Growth: Positive": "This example text from a political party manifesto is supportive of economic growth, for example facilitation of more production or government aid for growth",
        "National Way of Life: Positive": "This example text from a political party manifesto is positive towards the national way of life and history, for example pride of citizenship or appeals to patriotism",
        "Underprivileged Minority Groups": "This example text from a political party manifesto favourably mentions underprivileged minorities, for example handicapped, homosexuals or immigrants",
        "Economic Orthodoxy": "This example text from a political party manifesto is for economic orthodoxy, for example reduction of budget deficits, thrift or a strong currency",
        "Traditional Morality: Positive": "This example text from a political party manifesto is favourable towards traditional or religious values, for example for censorship of immoral behavour, for traditional family values or religious institutions",
        "Free Market Economy": "This example text from a political party manifesto is in favour of a free market economy and capitalism",
        "European Community/Union: Positive": "This example text from a political party manifesto is positive towards the EU or European Community, for example EU expansion and integration",
        "Civic Mindedness: Positive": "This example text from a political party manifesto is positive towards national solidarity, civil society or appeals for public spiritedness or against anti-social attitudes",
        "Welfare State Limitation": "This example text from a political party manifesto is for limiting the welfare state, for example public funding for social services or social security, e.g. private care before state care",
        "National Way of Life: Negative": "This example text from a political party manifesto unfavourably mentions a country's nation and history, for example sceptical towards patriotism or national pride",
        "Foreign Special Relationships: Positive": "This example text from a political party manifesto is positive towards particular countries",
        "Protectionism: Negative": "This example text from a political party manifesto is negative towards protectionism, in favour of free trade",
        "European Community/Union: Negative": "This example text from a political party manifesto negatively mentions the EU or European Community",
        "Nationalisation": "This example text from a political party manifesto is positive towards government ownership of industries or land or for economic nationalisation",
        "Political Corruption": "This example text from a political party manifesto is negative towards political corruption or abuse of political/bureaucratic power",
        "Protectionism: Positive": "This example text from a political party manifesto is in favour of protectionism, for example tariffs, export subsidies",
        "Military: Negative": "This example text from a political party manifesto is negative towards the military, for example for decreasing military spending or disarmament",
        "Economic Planning": "This example text from a political party manifesto is positive towards government economic planning, e.g. policy plans or strategies ",
        "Constitutionalism: Negative": "This example text from a political party manifesto is positive towards constitutionalism",
        "Economic Goals": "This example text from a political party manifesto is a broad/general statement on economic goals without specifics",
        "Middle Class and Professional Groups": "This example text from a political party manifesto favourably references the middle class, e.g. white colar groups or the service sector",
        "Controlled Economy": "This example text from a political party manifesto is supportive of direct government control of the economy, e.g. price control or minimum wages",
        "Traditional Morality: Negative": "This example text from a political party manifesto is negative towards traditional morality, for example against religious moral values, for divorce or abortion, for modern families or separation of church and state",
        "Labour Groups: Negative": "This example text from a political party manifesto is negative towards labour groups and unions",
        "Peace": "This example text from a political party manifesto is positive towards peace and peaceful means of solving crises, for example in favour of negotiations and ending wars",
        "Constitutionalism: Positive": "This example text from a political party manifesto is positive towards constitutionalism and the status quo of the constitution",
        "Internationalism: Negative": "This example text from a political party manifesto is sceptical of internationalism, for example negative towards international cooperation, in favour of national sovereignty and unilaterialism",
        "Corporatism/Mixed Economy": "This example text from a political party manifesto is positive towards cooperation of government, employers, and trade unions simultaneously",
        "Education Limitation": "This example text from a political party manifesto is sceptical towards state expenditure on education, for example in favour of study fees or private schools",
        "Marxist Analysis": "This example text from a political party manifesto is positive towards Marxist-Leninist ideas or uses specific Marxist terminology",
        "Multiculturalism: Negative": "This example text from a political party manifesto is sceptical towards multiculturalism, or for cultural integration or appeals to cultural homogeneity in society",
        "Keynesian Demand Management": "This example text from a political party manifesto is for keynesian demand management and demand side economic policies",
        "Foreign Special Relationships: Negative": "This example text from a political party manifesto is negative towards particular countries",
        "Centralisation": "This example text from a political party manifesto is in favour of political centralisation",
        "Anti-Imperialism": "This example text from a political party manifesto is anti-imperialistic, for example against controlling other countries and for greater self-government of colonies",
        }
}


# Rebuild each task's {label_text: hypothesis} mapping with its keys in
# alphabetical order. A fixed, sorted order avoids potential mismatches
# between label_num, label_text and hypothesis.
task_hypotheses = {
    key_task_name: {
        label_text: value_task_hypo_dict[label_text]
        for label_text in sorted(value_task_hypo_dict)
    }
    for key_task_name, value_task_hypo_dict in task_hypotheses.items()
}


### Load non-NLI datasets

In [None]:
# Hand-written reference list of task names. The loading cells compare the
# task names parsed from the dataset file names against this list, so that a
# missing or unexpected dataset file is noticed.
task_names_manual = [
    "wellformedquery",
    "financialphrasebank",
    "rottentomatoes",
    "amazonpolarity",
    "imdb",
    "appreviews",
    "yelpreviews",
    "wikitoxic_toxicaggregated",
    "wikitoxic_obscene",
    "wikitoxic_threat",
    "wikitoxic_insult",
    "wikitoxic_identityhate",
    "hateoffensive",
    "hatexplain",
    "trueteacher",
    "spam",
    "massive",
    "banking77",
    "emotiondair",
    "emocontext",
    "empathetic",
    "agnews",
    "yahootopics",
    "biasframes_offensive",
    "biasframes_sex",
    "biasframes_intent",
    # added for v1.1
    "manifesto",
    "capsotu",
    # to be included in v2:
    # "anthropic_harmless", "anthropic_helpful",
]

# Tasks whose files are deliberately left out for now.
tasks_to_exclude = [
    "anthropic_harmless",
    "anthropic_helpful",
]


#### Load df_train

In [None]:
## load (cleaned) train files

def find_files(directory, filter_string=None):
    """Return the paths of the regular files located directly in ``directory``.

    Args:
        directory: Folder to scan. The scan is not recursive and
            sub-directories themselves are never returned.
        filter_string: Substring that must occur in a file's base name for the
            file to be kept (e.g. "train" or "test"). ``None`` (the default)
            disables the filter and keeps every file.

    Returns:
        A list of ``os.path.join(directory, file_name)`` paths, sorted so that
        the result does not depend on the arbitrary order of ``os.listdir``.
    """
    candidate_paths = sorted(
        os.path.join(directory, file_name) for file_name in os.listdir(directory)
    )
    return [
        path for path in candidate_paths
        if os.path.isfile(path)
        # the filter only looks at the file name, never at the directory part
        and (filter_string is None or filter_string in os.path.basename(path))
    ]

# Cleaned train files: every file in the cleaned directory whose name contains "train".
directory_path_cl = './datasets_clean'
train_files_lst_cl = find_files(directory_path_cl, filter_string="train")

# Some datasets were not processed by the automatic cleaning. Their train
# files are taken from the standardized directory instead.
directory_path_uncleaned = './datasets_standardized'
datasets_no_automatic_clean = [
    "trueteacher", #"anthropic_harmless", "anthropic_helpful",
    #"manifesto", "capsotu"
]
train_files_lst_uncleaned = [
    file_path
    for file_path in find_files(directory_path_uncleaned, filter_string="train")
    if any(dataset_name in file_path for dataset_name in datasets_no_automatic_clean)
]

train_files_lst = [*train_files_lst_cl, *train_files_lst_uncleaned]

print("All identified files: ", train_files_lst)
print("N files: ", len(train_files_lst))

# Task name = file path without the leading "./<directory>/ds_" part and
# without everything from "_train" onwards.
pattern = re.compile(r'^\.(?:/datasets_clean|/datasets_standardized)/ds_|_train.*$')
task_names_train = [pattern.sub('', file_path) for file_path in train_files_lst]

print("task names: ", task_names_train)

# Size of the manually written reference list, for comparison with "N files".
print("N tasks: ", len(task_names_manual))

# Show which task names occur in only one of the two lists, then fail if the
# lists differ in any way (including duplicates).
unique_to_list1 = set(task_names_manual).difference(task_names_train)
unique_to_list2 = set(task_names_train).difference(task_names_manual)
print("task_name only in task_names_manual:", unique_to_list1)
print("task_name only in task_names_train:", unique_to_list2)
assert Counter(task_names_manual) == Counter(task_names_train)


All identified files:  ['./datasets_clean/ds_wellformedquery_train_cl.gzip', './datasets_clean/ds_rottentomatoes_train_cl.gzip', './datasets_clean/ds_amazonpolarity_train_cl.gzip', './datasets_clean/ds_imdb_train_cl.gzip', './datasets_clean/ds_hatexplain_train_cl.gzip', './datasets_clean/ds_massive_train_cl.gzip', './datasets_clean/ds_yelpreviews_train_cl.gzip', './datasets_clean/ds_banking77_train_cl.gzip', './datasets_clean/ds_emotiondair_train_cl.gzip', './datasets_clean/ds_emocontext_train_cl.gzip', './datasets_clean/ds_empathetic_train_cl.gzip', './datasets_clean/ds_agnews_train_cl.gzip', './datasets_clean/ds_biasframes_offensive_train_cl.gzip', './datasets_clean/ds_yahootopics_train_cl.gzip', './datasets_clean/ds_biasframes_sex_train_cl.gzip', './datasets_clean/ds_biasframes_intent_train_cl.gzip', './datasets_clean/ds_financialphrasebank_train_cl.gzip', './datasets_clean/ds_appreviews_train_cl.gzip', './datasets_clean/ds_hateoffensive_train_cl.gzip', './datasets_clean/ds_spam_tra

In [None]:
# load train files
# The task name is what remains of a file path after removing the leading
# "./<directory>/ds_" part and everything from "_train" onwards.
pattern = re.compile(r'^\.(?:/datasets_clean|/datasets_standardized)/ds_|_train.*$')

df_train_lst = []
for train_file_path in train_files_lst:
    task_name = pattern.sub('', train_file_path)

    # Datasets that already have hypotheses and are already formatted are not
    # supported yet (later: include anthropic datasets in v2).
    if any(excluded_task in train_file_path for excluded_task in tasks_to_exclude):
        raise NotImplementedError

    # Datasets that do not have hypotheses yet: load them and tag every row
    # with the task it belongs to.
    print("Loading unformatted file: ", train_file_path)
    df_train = (
        pd.read_parquet(train_file_path)
        .reset_index(drop=True)
        .assign(task_name=task_name)
    )
    df_train_lst.append(df_train)


# One frame with all tasks; each task keeps its own 0..n-1 row index.
df_train_concat = pd.concat(df_train_lst)

display(df_train_concat)

Loading unformatted file:  ./datasets_clean/ds_wellformedquery_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_rottentomatoes_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_amazonpolarity_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_imdb_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_hatexplain_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_massive_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_yelpreviews_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_banking77_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_emotiondair_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_emocontext_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_empathetic_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_agnews_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_biasframes_offensive_train_cl.gzip
Loading unformatted file:  ./datasets_clean/ds_yahoot

Unnamed: 0,text,label_text,label_standard,label_quality,task_name
0,The meaning of VJ day ?,not_well_formed,0,0.834150,wellformedquery
1,Who is gerard ways Gf ?,not_well_formed,0,0.625953,wellformedquery
2,Praying mantis are eatan by who ?,not_well_formed,0,0.933964,wellformedquery
3,What kind of animals live ia africa ?,not_well_formed,0,0.516223,wellformedquery
4,What it Critical angle ?,not_well_formed,0,0.753376,wellformedquery
...,...,...,...,...,...
35815,Summary:\nA flight from Los Angeles to London ...,factually_inconsistent,1,,trueteacher
35816,Summary:\nThe body of a man found in his home ...,factually_consistent,0,,trueteacher
35817,Summary:\nA video has been released of a robbe...,factually_inconsistent,1,,trueteacher
35818,"Summary:\n""Therein lies the beauty of skiing,""...",factually_consistent,0,,trueteacher


In [None]:
def _sample_up_to(df_group, n_max):
    """Return at most ``n_max`` randomly drawn rows of ``df_group``, seeded with SEED_GLOBAL."""
    n_rows = min(len(df_group), n_max)
    return df_group.sample(n=n_rows, random_state=SEED_GLOBAL)


# Caps for the two downsampling steps: per class within a task, then per task.
n_max_sample_per_class = 500
n_max_sample_per_task = 5_000

# Step 1: cap every (task, class) combination.
df_train_concat_samp = (
    df_train_concat
    .groupby(["task_name", "label_text"], group_keys=False, as_index=False)
    .apply(lambda df_group: _sample_up_to(df_group, n_max_sample_per_class))
)
# Step 2: cap every task as a whole (relevant for tasks with many classes).
df_train_concat_samp = (
    df_train_concat_samp
    .groupby(["task_name"], group_keys=False, as_index=False)
    .apply(lambda df_group: _sample_up_to(df_group, n_max_sample_per_task))
)

# Compare the number of texts per task before and after downsampling.
data_distribution = df_train_concat["task_name"].value_counts(ascending=False)
data_distribution_downsampled = df_train_concat_samp["task_name"].value_counts(ascending=False)
print("n_texts before downsampling: ", data_distribution.sum())
print("n_texts after downsampling: ", data_distribution_downsampled.sum())

df_data_distribution = pd.DataFrame({
    "n_texts_all": data_distribution,
    "n_texts_downsample": data_distribution_downsampled,
})
df_data_distribution = df_data_distribution.sort_values("n_texts_all", ascending=False)

display(df_data_distribution)


n_texts before downsampling:  408807
n_texts after downsampling:  50647


Unnamed: 0,n_texts_all,n_texts_downsample
yahootopics,65456,5000
agnews,35967,2000
trueteacher,35820,1000
yelpreviews,19334,1000
emocontext,19240,2000
amazonpolarity,19099,1000
imdb,18450,1000
wikitoxic_toxicaggregated,18218,1000
wikitoxic_obscene,17120,1000
biasframes_intent,16892,1000


#### Format df_train to NLI format

In [None]:

# Convert the downsampled train data to the NLI format, one task at a time,
# because every task has its own label -> hypothesis mapping.
df_train_format_lst = []
for key_task_name, value_df_task_train in df_train_concat_samp.groupby("task_name"):

    # hypotheses written for this task
    hypo_label_dic = task_hypotheses[key_task_name]

    df_train_format = format_nli_trainset(
        df_train=value_df_task_train,
        hypo_label_dic=hypo_label_dic,
        random_seed=SEED_GLOBAL,
    )

    # keep only rows that received a hypothesis
    # (none was formulated for some rows due to noise)
    has_hypothesis = df_train_format["hypothesis"].notna()
    df_train_format_lst.append(df_train_format[has_hypothesis])
    df_train_format = df_train_format_lst[-1]


df_train_format_concat = pd.concat(df_train_format_lst)

#df_train_format_concat[df_train_format_concat["task_name"] == "manifesto"].label_text.value_counts()


For NLI: Augmenting data by adding random not_entail examples to the train set from other classes within the train set.
Length of df_train before this step is: 2000.

Max augmentation can be: len(df_train) * 2 = 4000. Can also be lower, if there are more entail examples than not-entail for a majority class
For NLI:  not_entail training examples were added, which leads to an augmented training dataset of length 4000.

For NLI: Augmenting data by adding random not_entail examples to the train set from other classes within the train set.
Length of df_train before this step is: 1000.

Max augmentation can be: len(df_train) * 2 = 2000. Can also be lower, if there are more entail examples than not-entail for a majority class
For NLI:  not_entail training examples were added, which leads to an augmented training dataset of length 2000.

For NLI: Augmenting data by adding random not_entail examples to the train set from other classes within the train set.
Length of df_train before this step i

In [None]:
# Keep only the columns of the final train format, in a fixed order.
df_train_format_concat = df_train_format_concat.loc[
    :, ["text", "label_text", "labels", "hypothesis", "task_name", "label_quality"]
]
print(df_train_format_concat["task_name"].value_counts())

display(df_train_format_concat)

# Check that the formatted train data contains exactly the tasks listed in
# the manually written reference list.
task_names_train_df = df_train_format_concat["task_name"].unique().tolist()
unique_to_list1 = set(task_names_manual).difference(task_names_train_df)
unique_to_list2 = set(task_names_train_df).difference(task_names_manual)
print("Strings unique to list1:", unique_to_list1)
print("Strings unique to list2:", unique_to_list2)
assert Counter(task_names_manual) == Counter(task_names_train_df)

yahootopics                  10000
manifesto                    10000
massive                       9794
banking77                     9508
emotiondair                   5036
capsotu                       4648
empathetic                    4226
emocontext                    4000
agnews                        4000
hatexplain                    2958
financialphrasebank           2524
hateoffensive                 2152
trueteacher                   2000
wikitoxic_insult              2000
wikitoxic_identityhate        2000
wikitoxic_obscene             2000
wikitoxic_toxicaggregated     2000
wellformedquery               2000
imdb                          2000
rottentomatoes                2000
amazonpolarity                2000
biasframes_sex                2000
biasframes_offensive          2000
biasframes_intent             2000
appreviews                    2000
yelpreviews                   2000
spam                          1865
wikitoxic_threat              1760
Name: task_name, dty

Unnamed: 0,text,label_text,labels,hypothesis,task_name,label_quality
24719,Rooney #39;blackmail #39; trio are cleared LO...,Sports,1,This example news text is about business news,agnews,0.900255
29067,No Sorry from the Queen Queen Elizabeth II sto...,World,0,This example news text is about world news,agnews,0.963161
13750,Visionaries outline web's future The web will ...,Sci/Tech,1,This example news text is about business news,agnews,0.998342
10002,Airbus' Magic In-flight Cell Phone Technology ...,Sci/Tech,1,This example news text is about world news,agnews,0.938365
8933,FCC Rules in Favor of Net Phone Industry (AP) ...,Sci/Tech,1,This example news text is about sports,agnews,0.935555
...,...,...,...,...,...,...
17215,This place is definitely worth checking out. T...,positive,0,The sentiment in this example yelp review is p...,yelpreviews,0.996761
12677,I love this place! I even moved about 15 minut...,positive,0,The sentiment in this example yelp review is p...,yelpreviews,0.983336
14507,Can we say handmade? People that care to crea...,positive,1,The sentiment in this example yelp review is n...,yelpreviews,0.954570
10305,Food was fantastic had the paneer tikka masala...,positive,0,The sentiment in this example yelp review is p...,yelpreviews,0.994607


Strings unique to list1: set()
Strings unique to list2: set()


#### Load df_test

In [None]:
## load test files

# Test files come from the standardized directory; files of excluded tasks
# are dropped right away.
directory_path_test = './datasets_standardized'
test_files_lst = [
    file_path
    for file_path in find_files(directory_path_test, filter_string="test")
    if all(task not in file_path for task in tasks_to_exclude)
]

print("All identified files: ", test_files_lst)
print("N files: ", len(test_files_lst))

# Task name = file path without the leading "./<directory>/ds_" part and
# without everything from "_test" onwards.
pattern = re.compile(r'^\.(?:/datasets_clean|/datasets_standardized)/ds_|_test.*$')
task_names_test = [pattern.sub('', file_path) for file_path in test_files_lst]

print("task names: ", task_names_test)

# Size of the manually written reference list, for comparison with "N files".
print("N tasks: ", len(task_names_manual))

# Show which task names occur in only one of the two lists, then fail if the
# lists differ in any way (including duplicates).
unique_to_list1 = set(task_names_manual).difference(task_names_test)
unique_to_list2 = set(task_names_test).difference(task_names_manual)
print("task_name only in task_names_manual:", unique_to_list1)
print("task_name only in task_names_test:", unique_to_list2)
assert Counter(task_names_manual) == Counter(task_names_test)


All identified files:  ['./datasets_standardized/ds_wellformedquery_test.gzip', './datasets_standardized/ds_rottentomatoes_test.gzip', './datasets_standardized/ds_amazonpolarity_test.gzip', './datasets_standardized/ds_imdb_test.gzip', './datasets_standardized/ds_yelpreviews_test.gzip', './datasets_standardized/ds_hatexplain_test.gzip', './datasets_standardized/ds_massive_test.gzip', './datasets_standardized/ds_banking77_test.gzip', './datasets_standardized/ds_emotiondair_test.gzip', './datasets_standardized/ds_emocontext_test.gzip', './datasets_standardized/ds_empathetic_test.gzip', './datasets_standardized/ds_agnews_test.gzip', './datasets_standardized/ds_yahootopics_test.gzip', './datasets_standardized/ds_biasframes_sex_test.gzip', './datasets_standardized/ds_biasframes_offensive_test.gzip', './datasets_standardized/ds_biasframes_intent_test.gzip', './datasets_standardized/ds_financialphrasebank_test.gzip', './datasets_standardized/ds_appreviews_test.gzip', './datasets_standardized/d

In [None]:
# load and downsample test data
# 5000 test data per class should be enough for good metrics and speeds things up
n_max_sample_per_class = 5_000

# Task name = file path without the leading "./datasets_standardized/ds_"
# part and without everything from "_test" onwards.
pattern = re.compile(r'^\.(?:/datasets_standardized)/ds_|_test.*$')

# task name -> downsampled (still unformatted) test frame
df_test_dic = {}
for test_file_path in test_files_lst:
    task_name = pattern.sub('', test_file_path)

    # Datasets that already have hypotheses and are already formatted are not
    # supported yet (later: include anthropic datasets in v2).
    if any(excluded_task in test_file_path for excluded_task in tasks_to_exclude):
        raise NotImplementedError

    # Datasets that do not have hypotheses yet.
    print("Loading unformatted file: ", test_file_path)
    df_test = pd.read_parquet(test_file_path).reset_index(drop=True)

    # downsample per class for faster testing
    df_test_samp = (
        df_test
        .groupby("label_standard", group_keys=False, as_index=False)
        .apply(
            lambda df_class: df_class.sample(
                n=min(len(df_class), n_max_sample_per_class), random_state=SEED_GLOBAL
            )
        )
        .reset_index(drop=True)
    )

    df_test_dic[task_name] = df_test_samp.reset_index(drop=True)


df_test_dic.keys()

#df_test_dic["manifesto"].label_text.value_counts()
#df_test_dic["manifesto"]

Loading unformatted file:  ./datasets_standardized/ds_wellformedquery_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_rottentomatoes_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_amazonpolarity_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_imdb_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_yelpreviews_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_hatexplain_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_massive_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_banking77_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_emotiondair_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_emocontext_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_empathetic_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_agnews_test.gzip
Loading unformatted file:  ./datasets_standardized/ds_yahootopics_test.gzip
Loading unformatted fil

dict_keys(['wellformedquery', 'rottentomatoes', 'amazonpolarity', 'imdb', 'yelpreviews', 'hatexplain', 'massive', 'banking77', 'emotiondair', 'emocontext', 'empathetic', 'agnews', 'yahootopics', 'biasframes_sex', 'biasframes_offensive', 'biasframes_intent', 'financialphrasebank', 'appreviews', 'hateoffensive', 'trueteacher', 'spam', 'wikitoxic_toxicaggregated', 'wikitoxic_obscene', 'wikitoxic_identityhate', 'wikitoxic_threat', 'wikitoxic_insult', 'manifesto', 'capsotu'])

#### Format df_test to NLI format

In [None]:

# Convert every test set to the NLI format, using the hypotheses of its task.
# task name -> formatted test frame
df_test_format_dic = {}
for key_task_name, value_df_test in df_test_dic.items():

    # hypotheses written for this task
    hypo_label_dic = task_hypotheses[key_task_name]

    df_test_format = format_nli_testset(df_test=value_df_test, hypo_label_dic=hypo_label_dic)

    df_test_format["task_name"] = key_task_name

    # keep only rows that received a hypothesis
    # (none was formulated for some rows due to noise)
    df_test_format = df_test_format[df_test_format["hypothesis"].notna()]

    # store the final columns with a fresh 0..n-1 index
    final_columns = ["text", "label_text", "labels", "hypothesis", "task_name"]
    df_test_format_dic[key_task_name] = df_test_format[final_columns].reset_index(drop=True)
    print("\n")


df_test_format_dic.keys()

#df_test_format_dic["manifesto"]

Number of hypotheses/classes:  2 

For normal test, N classifications necessary: 2967
Reminder: do not sample these test-sets anymore after formatting. Row order needs to be preserved for testing.
For NLI test, N classifications necessary: 5934



Number of hypotheses/classes:  2 

For normal test, N classifications necessary: 1066
Reminder: do not sample these test-sets anymore after formatting. Row order needs to be preserved for testing.
For NLI test, N classifications necessary: 2132



Number of hypotheses/classes:  2 

For normal test, N classifications necessary: 10000
Reminder: do not sample these test-sets anymore after formatting. Row order needs to be preserved for testing.
For NLI test, N classifications necessary: 20000



Number of hypotheses/classes:  2 

For normal test, N classifications necessary: 10000
Reminder: do not sample these test-sets anymore after formatting. Row order needs to be preserved for testing.
For NLI test, N classifications necessary: 20000



Numb

dict_keys(['wellformedquery', 'rottentomatoes', 'amazonpolarity', 'imdb', 'yelpreviews', 'hatexplain', 'massive', 'banking77', 'emotiondair', 'emocontext', 'empathetic', 'agnews', 'yahootopics', 'biasframes_sex', 'biasframes_offensive', 'biasframes_intent', 'financialphrasebank', 'appreviews', 'hateoffensive', 'trueteacher', 'spam', 'wikitoxic_toxicaggregated', 'wikitoxic_obscene', 'wikitoxic_identityhate', 'wikitoxic_threat', 'wikitoxic_insult', 'manifesto', 'capsotu'])

In [None]:
# Inspect how many rows each formatted test set has (after downsampling).

# task name -> number of rows in the formatted test frame
n_texts_per_task_dic = {
    task_name_key: df_formatted.shape[0]
    for task_name_key, df_formatted in df_test_format_dic.items()
}

print("n_texts after downsampling: ", sum(n_texts_per_task_dic.values()), "\n")
print(
    "n_texts per class downsampling:\n",
    pd.Series(n_texts_per_task_dic).sort_values(ascending=False),
    "\n",
)
# every task of the manual reference list must have a formatted test set
assert len(n_texts_per_task_dic) == len(task_names_manual)


n_texts after downsampling:  2269955 

n_texts per class downsampling:
 manifesto                    953008
yahootopics                  500000
banking77                    221760
massive                      175466
empathetic                    81344
capsotu                       70455
agnews                        30400
emocontext                    22036
yelpreviews                   20000
imdb                          20000
wikitoxic_toxicaggregated     20000
amazonpolarity                20000
trueteacher                   17910
wikitoxic_obscene             17382
wikitoxic_insult              16854
emotiondair                   12000
wikitoxic_identityhate        11424
wikitoxic_threat              10422
biasframes_sex                 8808
appreviews                     8000
biasframes_offensive           7676
biasframes_intent              7296
wellformedquery                5934
hatexplain                     2922
hateoffensive                  2586
rottentomatoes              

### Load and wrangle NLI datasets

In [None]:
# Load the NLI collections; each call returns a dict-like object with one
# entry per NLI dataset.
dataset_nli_train_dict = load_dataset(
    path='MoritzLaurer/mnli_anli_fevernli_wanli_lingnli_xnli_train'
)
dataset_nli_test_dict = load_dataset(
    path='MoritzLaurer/mnli_anli_fevernli_wanli_lingnli_xnli_test'
)


In [None]:
NLI_DATASETS_TO_USE = ["mnli", "anli", "fevernli", "wanli", "lingnli"]  # "mnli", "anli", "fever", "wanli", "ling", "xnli"

# Remove every NLI dataset that was not selected, e.g. XNLI.
# A key is kept when it CONTAINS one of the selected names, so "mnli" also
# keeps a key such as "mnli_m".
# NOTE(review): the substring match means "anli" alone would also keep "wanli".
keys_to_remove_train = [key for key in dataset_nli_train_dict if not any(nli_dataset in key for nli_dataset in NLI_DATASETS_TO_USE)]
keys_to_remove_test = [key for key in dataset_nli_test_dict if not any(nli_dataset in key for nli_dataset in NLI_DATASETS_TO_USE)]

for key in keys_to_remove_train:
    del dataset_nli_train_dict[key]
for key in keys_to_remove_test:
    del dataset_nli_test_dict[key]

# Remove the unnecessary language column from the train sets.
# The column check keeps this cell re-runnable: remove_columns raises when the
# column is already gone, which happens on every run after the first one.
for dataset_name in dataset_nli_train_dict:
    if 'language' in dataset_nli_train_dict[dataset_name].column_names:
        dataset_nli_train_dict[dataset_name] = dataset_nli_train_dict[dataset_name].remove_columns('language')

print(dataset_nli_train_dict)
print(dataset_nli_test_dict)

DatasetDict({
    mnli: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 392702
    })
    fevernli: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 196805
    })
    anli: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 162865
    })
    wanli: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 102885
    })
    lingnli: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 29985
    })
})
DatasetDict({
    mnli_m: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9815
    })
    mnli_mm: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9832
    })
    fevernli: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 19652
    })
    anli_r1: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
   

#### NLI datasets concatenation and formatting

In [None]:
## add task_name column to NLI datasets for downstream inspection possibilities
# every split gets a constant column holding its own key in the DatasetDict
for nli_dataset_dict in (dataset_nli_train_dict, dataset_nli_test_dict):
    for split_name in list(nli_dataset_dict):
        split_dataset = nli_dataset_dict[split_name]
        task_name_values = [split_name] * len(split_dataset)
        nli_dataset_dict[split_name] = split_dataset.add_column(name="task_name", column=task_name_values)


In [None]:
## NLI datasets concatenation

# weighting of individual test sets inside the aggregated NLI test set
ANLI_TEST_UPSAMPLING_FACTOR = 3   # each ANLI round is repeated this many times
FEVER_TEST_SAMPLE_SIZE = 10_000   # FEVER-NLI is downsampled to at most this many rows

## train set
# print train sets
# (iterate over the dict itself: entries of NLI_DATASETS_TO_USE are not guaranteed to be exact keys)
print("TRAIN SETS: ")
for key_dataset_name, value_dataset in dataset_nli_train_dict.items():
    print(key_dataset_name, " number of train examples: ", len(value_dataset))
# concatenate all selected trainsets
dataset_train_nli = concatenate_datasets(list(dataset_nli_train_dict.values()))
dataset_train_nli = dataset_train_nli.shuffle(seed=SEED_GLOBAL)

## test set
# disaggregated test set
dataset_test_disaggregated_nli = DatasetDict(**dataset_nli_test_dict)

# aggregated NLI test set for single metrics for choosing best model over all NLI datasets
# increase weight of selected test-sets
# dedicated, seeded RNG so that the FEVER subsample does not depend on the global `random` state
rng_test_sampling = random.Random(SEED_GLOBAL)
dataset_test_concat_nli = []
for key_dataset_name, value_dataset in dataset_nli_test_dict.items():
    # startswith, not `in`: "anli" is a substring of "wanli", and WANLI must not be upsampled
    if key_dataset_name.startswith("anli"):
        # increase weight of anli in overall testset, because small compared to others
        value_dataset = concatenate_datasets([value_dataset] * ANLI_TEST_UPSAMPLING_FACTOR)
    elif "fevernli" in key_dataset_name:
        # downsample fever in aggregated testset (never ask for more rows than exist)
        n_fever_rows = min(FEVER_TEST_SAMPLE_SIZE, len(value_dataset))
        value_dataset = value_dataset.select(rng_test_sampling.sample(range(len(value_dataset)), n_fever_rows))
    elif "xnli" in key_dataset_name:
        raise NotImplementedError
    dataset_test_concat_nli.append(value_dataset)

dataset_test_concat_nli = concatenate_datasets(dataset_test_concat_nli)


# harmonise label column name
dataset_train_nli = dataset_train_nli.rename_columns({"label": "labels", "premise": "text"})
dataset_test_concat_nli = dataset_test_concat_nli.rename_columns({"label": "labels", "premise": "text"})
dataset_test_disaggregated_nli = dataset_test_disaggregated_nli.rename_columns({"label": "labels", "premise": "text"})
print("Full train set: ", len(dataset_train_nli))
print("\nAll available test sets for disaggregated testing: ", dataset_test_disaggregated_nli)
print("\nAll available test data for aggregated testing: ", dataset_test_concat_nli)


TRAIN SETS: 
mnli  number of train examples:  392702
anli  number of train examples:  162865
fevernli  number of train examples:  196805
wanli  number of train examples:  102885
lingnli  number of train examples:  29985
Full train set:  885242

All available test sets for disaggregated testing:  DatasetDict({
    mnli_m: Dataset({
        features: ['text', 'hypothesis', 'labels', 'task_name'],
        num_rows: 9815
    })
    mnli_mm: Dataset({
        features: ['text', 'hypothesis', 'labels', 'task_name'],
        num_rows: 9832
    })
    fevernli: Dataset({
        features: ['text', 'hypothesis', 'labels', 'task_name'],
        num_rows: 19652
    })
    anli_r1: Dataset({
        features: ['text', 'hypothesis', 'labels', 'task_name'],
        num_rows: 1000
    })
    anli_r2: Dataset({
        features: ['text', 'hypothesis', 'labels', 'task_name'],
        num_rows: 1000
    })
    anli_r3: Dataset({
        features: ['text', 'hypothesis', 'labels', 'task_name'],
        nu

In [None]:
## make nli datasets binary

def binarize_labels(example):
    """Collapse the label of one example into two classes.

    Args:
        example: mapping with an integer-like ``"labels"`` entry.

    Returns:
        ``{"labels": 1}`` when the label is >= 1, otherwise ``{"labels": 0}``
        (any value below 1, including negative ones, maps to 0).
    """
    return {"labels": 1 if example["labels"] >= 1 else 0}

# apply the binarization to the train set, the aggregated test set and the per-task test sets
dataset_train_nli, dataset_test_concat_nli, dataset_test_disaggregated_nli = (
    nli_data.map(binarize_labels)
    for nli_data in (dataset_train_nli, dataset_test_concat_nli, dataset_test_disaggregated_nli)
)

# add new binarized feature names
# (the train set's schema, with "labels" replaced by a two-class ClassLabel, is applied to all three)
new_features = dataset_train_nli.features.copy()
new_features['labels'] = ClassLabel(names=["entailment", "not_entailment"])

dataset_train_nli, dataset_test_concat_nli, dataset_test_disaggregated_nli = (
    nli_data.cast(new_features)
    for nli_data in (dataset_train_nli, dataset_test_concat_nli, dataset_test_disaggregated_nli)
)

# add label_text column to enable upload of DatasetDict with both NLI and not-NLI data (need same columns)
label_text_map = {0: "entailment", 1: "not_entailment"}


def _add_label_text(example):
    """Return the text name of the example's binary label as a new ``label_text`` field."""
    return {"label_text": label_text_map[example["labels"]]}


# DatasetDict.map applies the function to every test split
dataset_test_disaggregated_nli = dataset_test_disaggregated_nli.map(_add_label_text)

print(dataset_train_nli.features["labels"].names)

Map:   0%|          | 0/59140 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/59140 [00:00<?, ? examples/s]

['entailment', 'not_entailment']


### Combine NLI datasets with non-NLI datasets

In [None]:
# harmonise non-nli datasets in HF dataset format
# The pandas index is reset before every conversion: a non-default index would be
# stored by Dataset.from_pandas as an extra "__index_level_0__" column, and the
# cast below requires the columns to match the target features.
dataset_train_not_nli = Dataset.from_pandas(df_train_format_concat.reset_index(drop=True))
dataset_train_not_nli = dataset_train_not_nli.remove_columns(["label_quality"])

# one test dataset per task, keyed by task name
dataset_test_not_nli = DatasetDict({
    key_task: Dataset.from_pandas(value_df.reset_index(drop=True))
    for key_task, value_df in df_test_format_dic.items()
})

# use the same two-class ClassLabel for "labels" as the NLI datasets
new_features = dataset_train_not_nli.features.copy()
new_features['labels'] = ClassLabel(names=["entailment", "not_entailment"])
dataset_train_not_nli = dataset_train_not_nli.cast(new_features)
dataset_test_not_nli = dataset_test_not_nli.cast(new_features)

print(dataset_train_not_nli)
print(dataset_test_not_nli)

Casting the dataset:   0%|          | 0/100471 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5934 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2132 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2922 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/175466 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/221760 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/22036 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/81344 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/30400 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/500000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8808 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7676 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7296 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2070 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2586 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/17910 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2070 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/17382 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11424 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10422 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16854 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/953008 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/70455 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
    num_rows: 100471
})
DatasetDict({
    wellformedquery: Dataset({
        features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
        num_rows: 5934
    })
    rottentomatoes: Dataset({
        features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
        num_rows: 2132
    })
    amazonpolarity: Dataset({
        features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
        num_rows: 20000
    })
    imdb: Dataset({
        features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
        num_rows: 20000
    })
    yelpreviews: Dataset({
        features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
        num_rows: 20000
    })
    hatexplain: Dataset({
        features: ['text', 'label_text', 'labels', 'hypothesis', 'task_name'],
        num_rows: 2922
    })
    massive: Dataset({
        features: ['text', 'label

In [None]:
# final harmonized datasets

# train set: NLI data and reformatted non-NLI data, concatenated and shuffled together
dataset_train = concatenate_datasets([dataset_train_nli, dataset_train_not_nli])
dataset_train = dataset_train.shuffle(seed=SEED_GLOBAL)

# dataset_test_concat_nli (aggregated NLI test set) is used as-is and is not shuffled

# per-task test sets: NLI and non-NLI test sets in one DatasetDict.
# A plain dict merge silently lets the second dict overwrite equal keys, so fail loudly instead.
overlapping_test_keys = set(dataset_test_disaggregated_nli) & set(dataset_test_not_nli)
if overlapping_test_keys:
    raise ValueError(f"Test set names used by both NLI and non-NLI data: {sorted(overlapping_test_keys)}")

dataset_test_disaggregated = DatasetDict({**dataset_test_disaggregated_nli, **dataset_test_not_nli})


### Save final train and test sets to disk

In [None]:
# switch: set to False to run the notebook without writing to disk or to the hub
store_data = True

if store_data:
    # (dataset, local directory, hub repo id) for each final artefact
    final_datasets_to_store = [
        (dataset_train, "./datasets_final/dataset_train", "dataset_train_nli"),
        (dataset_test_concat_nli, "./datasets_final/dataset_test_concat_nli", "dataset_test_concat_nli"),
        (dataset_test_disaggregated, "./datasets_final/dataset_test_disaggregated", "dataset_test_disaggregated_nli"),
    ]

    # save to disk
    for dataset_final, path_on_disk, repo_id_on_hub in final_datasets_to_store:
        dataset_final.save_to_disk(path_on_disk)

    # push to hub (private repos, authenticated with the token from the local config module)
    for dataset_final, path_on_disk, repo_id_on_hub in final_datasets_to_store:
        dataset_final.push_to_hub(repo_id=repo_id_on_hub, private=True, token=config.HF_ACCESS_TOKEN)


Saving the dataset (0/1 shards):   0%|          | 0/985255 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/59140 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9815 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9832 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19652 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4893 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5934 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2132 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2922 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/175466 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/221760 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/22036 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/81344 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8808 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7296 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2070 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2586 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17910 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2070 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11424 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10422 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16854 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/953008 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70455 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/986 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/691 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/649 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/954 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/71 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/4.97k [00:00<?, ?B/s]