<a href="https://colab.research.google.com/github/MoritzLaurer/zeroshot-classifier/blob/main/1_data_harmonization_nli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download and harmonization of diverse NLI datasets
These datasets are already designed for Natural Language Inference (NLI)

### Install and setup

In [None]:
!pip install datasets~=2.14.0 -qq
!pip install huggingface_hub~=0.18.0 -qq


In [None]:
## load packages
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from datasets import ClassLabel

from google.colab.data_table import DataTable
from google.colab import data_table
from IPython.display import display
data_table.enable_dataframe_formatter() # https://colab.research.google.com/notebooks/data_table.ipynb#scrollTo=JgBtx0xFFv_i

## one fixed seed for the whole notebook: keeps runs reproducible and rules out seed hacking
SEED_GLOBAL = 42
np.random.seed(seed=SEED_GLOBAL)

In [None]:
## connect to google drive
# mount Drive into the Colab filesystem; with force_remount=False an existing mount is left as it is
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

# set the working directory to the project folder on Drive, printing it before and after the change
print(os.getcwd())
os.chdir("/content/drive/My Drive/PhD/zero-shot-models")
print(os.getcwd())

# local config.py file with tokens (config.HF_ACCESS_TOKEN is read when pushing to the hub)
# NOTE(review): this import presumably resolves against the working directory set above,
# so it has to stay after os.chdir - confirm
import config

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/My Drive/PhD/zero-shot-models


### Load NLI datasets

In [None]:
# registries that collect the harmonized splits of every selected dataset, keyed by dataset name
dic_datasets_train, dic_datasets_test = dict(), dict()


In [None]:
### SNLI
# too noisy
#dataset_train_snli = load_dataset('snli', split="train")
#dataset_test_snli = load_dataset('snli', split="test")

### other potential datasets:
# climatefever: 1.5k climate-related claims. questionable quality: low Krippendorff's alpha (~0.3)  https://arxiv.org/abs/2012.00614
# feverours: includes info from tables


#### MNLI

In [None]:
# load all MNLI splits and keep only the premise / hypothesis / label columns
dataset_mnli = load_dataset('multi_nli').remove_columns([
    'promptID', 'pairID',
    'premise_binary_parse', 'premise_parse',
    'hypothesis_binary_parse', 'hypothesis_parse',
    'genre',
])
print(dataset_mnli)
print(dataset_mnli["train"].features)

# register the splits: train goes to the train container,
# the matched / mismatched validation sets serve as two separate test sets
dic_datasets_train["mnli"] = dataset_mnli["train"]
dic_datasets_test["mnli_m"] = dataset_mnli["validation_matched"]
dic_datasets_test["mnli_mm"] = dataset_mnli["validation_mismatched"]

del dataset_mnli

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9832
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}


#### FEVER-NLI

In [None]:
# test set does not have labels, so the "dev" split is used as the final test set
dataset_fever = load_dataset("pietrolesci/nli_fever")
fever_cols = ['premise', 'hypothesis', 'label']

# The data has duplicates for some reason. Only rows identical in premise, hypothesis AND label
# are dropped (around 12k train texts); texts repeated with a different label are kept on
# purpose to reflect the ambiguity. Afterwards only the three NLI columns are retained.
df_fever_train, df_fever_dev = (
    dataset_fever[split].to_pandas().drop_duplicates(subset=fever_cols, keep='first')[fever_cols].reset_index(drop=True)
    for split in ("train", "dev")
)

# create final dataset
dataset_fever = DatasetDict({
    "train": Dataset.from_pandas(df_fever_train),
    "test": Dataset.from_pandas(df_fever_dev),
})
print(dataset_fever)
print(dataset_fever["train"].features)

# turn the plain integer label column into a ClassLabel carrying the NLI class names
new_features = dataset_fever["train"].features.copy()
new_features['label'] = ClassLabel(names=["entailment", "neutral", "contradiction"])
dataset_fever = dataset_fever.cast(new_features)
print(dataset_fever)
print(dataset_fever["train"].features)

# register in the general containers
dic_datasets_train["fevernli"] = dataset_fever["train"]
dic_datasets_test["fevernli"] = dataset_fever["test"]

del df_fever_train, df_fever_dev, dataset_fever, fever_cols


Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 196805
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 19652
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


Casting the dataset:   0%|          | 0/196805 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19652 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 196805
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 19652
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}


#### ANLI

In [None]:
anli_train_splits = ["train_r1", "train_r2", "train_r3"]
anli_test_splits = ["test_r1", "test_r2", "test_r3"]
anli_unused_cols = ["uid", "reason"]

# train: the three rounds are merged into one training set
dataset_train_anli = concatenate_datasets(
    load_dataset('anli', split=anli_train_splits)
).remove_columns(anli_unused_cols)

# test: every round stays a separate split, keyed "test_r1" ... "test_r3"
dataset_test_anli = DatasetDict(
    dict(zip(anli_test_splits, load_dataset('anli', split=anli_test_splits)))
).remove_columns(anli_unused_cols)

# create final dataset
dataset_anli = DatasetDict({"train": dataset_train_anli, **dataset_test_anli})
print(dataset_anli)
print(dataset_anli["train"].features)

# register in the general containers (each test round is registered once, without up-weighting)
dic_datasets_train["anli"] = dataset_anli["train"]
dic_datasets_test["anli_r1"] = dataset_anli["test_r1"]
dic_datasets_test["anli_r2"] = dataset_anli["test_r2"]
dic_datasets_test["anli_r3"] = dataset_anli["test_r3"]

del dataset_train_anli, dataset_test_anli, dataset_anli, anli_train_splits, anli_test_splits, anli_unused_cols


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 162865
    })
    test_r1: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test_r3: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1200
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}


#### WANLI

In [None]:
# load WANLI, drop the columns that are not needed and expose the "gold" column under the shared name "label"
dataset_wanli = (
    load_dataset("alisawuffles/WANLI")
    .remove_columns(['pairID', 'genre', 'id'])
    .rename_column("gold", "label")
)

# unelegant conversion from text to int necessary to make casting below work
def test(label):
    if label == "entailment":
        return 0
    elif label == "neutral":
        return 1
    elif label == "contradiction":
        return 2
# swap the text label column for its integer encoding, one split at a time
for wanli_split in ("train", "test"):
    label_num = list(map(test, dataset_wanli[wanli_split]["label"]))
    dataset_wanli[wanli_split] = (
        dataset_wanli[wanli_split]
        .remove_columns(["label"])
        .add_column("label", label_num)
    )

# declare the integer column as a ClassLabel; ids follow the (non-alphabetical) order of the names list
new_features = dataset_wanli["train"].features.copy()
new_features['label'] = ClassLabel(names=["entailment", "neutral", "contradiction"])
dataset_wanli = dataset_wanli.cast(new_features)

# show the result
print(dataset_wanli)
print(dataset_wanli["train"].features)

# register in the general containers
dic_datasets_train["wanli"] = dataset_wanli["train"]
dic_datasets_test["wanli"] = dataset_wanli["test"]

del dataset_wanli, wanli_split


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 102885
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 5000
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}


#### LING-NLI

In [None]:
# Linguist in the loop NLI data
# Only the "LitL" and "LotS" protocol files are downloaded; the baseline files stay commented out.
links_dict_all = {
    #"train_round5_baseline_combined": "https://raw.githubusercontent.com/Alicia-Parrish/ling_in_loop/master/NLI_data/1_Baseline_protocol/train_round5_baseline_combined.jsonl",
    #"val_round5_baseline_combined": "https://raw.githubusercontent.com/Alicia-Parrish/ling_in_loop/master/NLI_data/1_Baseline_protocol/val_round5_base_combined.jsonl",
    "train_round5_litl_combined": "https://raw.githubusercontent.com/Alicia-Parrish/ling_in_loop/master/NLI_data/3_Ling_in_loop_protocol/train_round5_LitL_combined.jsonl",
    "val_round5_litl_combined": "https://raw.githubusercontent.com/Alicia-Parrish/ling_in_loop/master/NLI_data/3_Ling_in_loop_protocol/val_round5_LitL_combined.jsonl",
    "train_round5_lots_combined": "https://raw.githubusercontent.com/Alicia-Parrish/ling_in_loop/master/NLI_data/2_Ling_on_side_protocol/train_round5_LotS_combined.jsonl",
    "val_round5_lots_combined": "https://raw.githubusercontent.com/Alicia-Parrish/ling_in_loop/master/NLI_data/2_Ling_on_side_protocol/val_round5_LotS_combined.jsonl",
}

# download each file once and sort it into the train or validation list based on its key;
# keys containing neither "train" nor "val" are skipped without being downloaded
df_train_ling_lst = []
df_val_ling_lst = []
for dataset_name, link in links_dict_all.items():
    is_train = "train" in dataset_name
    if not (is_train or "val" in dataset_name):
        continue
    df_step = pd.read_json(link, lines=True)
    df_step["dataset_type"] = dataset_name
    (df_train_ling_lst if is_train else df_val_ling_lst).append(df_step)

df_ling_train = pd.concat(df_train_ling_lst)
df_ling_val = pd.concat(df_val_ling_lst)
# .copy() gives independent frames, so the label mapping below no longer writes into a
# slice of the concatenated frames (this is what raised SettingWithCopyWarning before)
df_ling_train_cl = df_ling_train[["premise", "hypothesis", "label"]].copy()
df_ling_val_cl = df_ling_val[["premise", "hypothesis", "label"]].copy()

# text labels -> integer ids, in the same order as the ClassLabel names used below
label_map_ling = {"entailment": 0, "neutral": 1, "contradiction": 2}
df_ling_train_cl["label"] = df_ling_train_cl["label"].map(label_map_ling)
df_ling_val_cl["label"] = df_ling_val_cl["label"].map(label_map_ling)

# preserve_index=False keeps the pandas index out of the dataset, so no '__index_level_0__'
# column is created that would have to be removed afterwards; the validation files become the test split
dataset_ling = DatasetDict({
    'train': Dataset.from_pandas(df_ling_train_cl, preserve_index=False),
    'test': Dataset.from_pandas(df_ling_val_cl, preserve_index=False),
})

new_features = dataset_ling["train"].features.copy()
new_features['label'] = ClassLabel(names=["entailment", "neutral", "contradiction"])  # the label ids are attributed in the (non-alphabetical) order to the list
dataset_ling = dataset_ling.cast(new_features)

# print
print(dataset_ling)
print(dataset_ling["train"].features)

# write to general dic container
dic_datasets_train.update({"lingnli": dataset_ling["train"]})
dic_datasets_test.update({"lingnli": dataset_ling["test"]})

del (dataset_ling, df_train_ling_lst, df_val_ling_lst, df_ling_train, df_ling_val, df_ling_train_cl, df_ling_val_cl)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ling_train_cl.label = df_ling_train_cl.label.map(label_map_ling)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ling_val_cl.label = df_ling_val_cl.label.map(label_map_ling)


Casting the dataset:   0%|          | 0/29985 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4893 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 29985
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 4893
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}


#### XNLI

In [None]:
# Multilingual XNLI: only the validation and test splits are requested, the train split is never loaded
dataset_xnli_raw = load_dataset("xnli", "all_languages", split=["validation", "test"])

# a list of splits comes back as a list; put it in a proper dataset dict
dataset_xnli_raw = DatasetDict({"validation": dataset_xnli_raw[0], "test": dataset_xnli_raw[1]})
print("\nRaw xnli: ", dataset_xnli_raw)

## clean & flatten the nested data structure of XNLI:
## every raw row bundles all language versions of one example; one flat row per language is emitted
## (both splits are processed identically)
xnli_dict = {}
for dataset_name in dataset_xnli_raw:
    premise_lst = []
    language_lst = []
    hypothesis_lst = []
    label_lst = []
    for data_point in dataset_xnli_raw[dataset_name]:
        languages = data_point["hypothesis"]["language"]
        translations = data_point["hypothesis"]["translation"]
        # Pair each hypothesis with the premise of the SAME language by looking the premise up
        # via the language code, instead of relying on the premise dict and the hypothesis
        # list happening to share one ordering.
        # NOTE(review): assumes the premise dict is keyed by the same language codes that
        # appear in hypothesis["language"]; a mismatch now fails loudly with a KeyError.
        premise_lst.extend(data_point["premise"][language] for language in languages)
        language_lst.extend(languages)
        hypothesis_lst.extend(translations)
        # the label is shared by all language versions of the example
        label_lst.extend([data_point["label"]] * len(translations))
    # update master dict
    xnli_dict.update({dataset_name: Dataset.from_dict({"premise": premise_lst, "hypothesis": hypothesis_lst, "label": label_lst, "language": language_lst})})

dataset_xnli = DatasetDict(xnli_dict)
print(dataset_xnli)

# harmonise label column type with MNLI etc. (ClassLabel is imported at the top of the notebook)
new_features = dataset_xnli["validation"].features.copy()
new_features['label'] = ClassLabel(names=["entailment", "neutral", "contradiction"])
dataset_xnli = dataset_xnli.cast(new_features)

# separate test datasets by language
# preserve_index=False keeps the group's pandas index out of the dataset, so there is no
# '__index_level_0__' column to remove afterwards
xnli_dict_test_separated = {}
for group_name, group_df in dataset_xnli["test"].to_pandas().groupby(by="language"):
    xnli_dict_test_separated.update({"xnli_" + group_name: Dataset.from_pandas(group_df, preserve_index=False).cast(new_features)})

dataset_xnli["test_separated"] = DatasetDict(xnli_dict_test_separated)

# print
print(dataset_xnli)
print(dataset_xnli["validation"].features)

# write to general dic container
# the XNLI validation split is what goes into the train container;
# the per-language test sets are registered without their 'language' column
dic_datasets_train.update({"xnli": dataset_xnli["validation"]})
dic_datasets_test.update({**dataset_xnli["test_separated"].remove_columns(["language"])})
del(xnli_dict, dataset_xnli_raw, xnli_dict_test_separated)



Raw xnli:  DatasetDict({
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 2490
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 5010
    })
})
DatasetDict({
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'language'],
        num_rows: 37350
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'language'],
        num_rows: 75150
    })
})


Casting the dataset:   0%|          | 0/37350 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/75150 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5010 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'language'],
        num_rows: 37350
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'language'],
        num_rows: 75150
    })
    test_separated: DatasetDict({
        xnli_ar: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 5010
        })
        xnli_bg: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 5010
        })
        xnli_de: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 5010
        })
        xnli_el: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 5010
        })
        xnli_en: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 5010
        })
        xnli_es: Dataset({
          

## Format final dataset dictionary

In [None]:
dataset_test = DatasetDict(dic_datasets_test)

# add a language column to every TRAIN dataset that lacks one, to enable upload:
# datasets that already carry a 'language' column are taken over unchanged,
# all others get the default value 'en' for every row
dataset_train = DatasetDict({
    name: (
        train_split
        if 'language' in train_split.column_names
        else train_split.add_column('language', ['en'] * train_split.num_rows)
    )
    for name, train_split in dic_datasets_train.items()
})


In [None]:
# bundle the train and test dictionaries under the keys "train" and "test";
# the bare expression on the last line displays the combined structure as the cell output
dataset_nli = DatasetDict({"train": dataset_train, "test": dataset_test})
dataset_nli

DatasetDict({
    train: DatasetDict({
        mnli: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 392702
        })
        fevernli: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 196805
        })
        anli: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 162865
        })
        wanli: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 102885
        })
        lingnli: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 29985
        })
        xnli: Dataset({
            features: ['premise', 'hypothesis', 'label', 'language'],
            num_rows: 37350
        })
    })
    test: DatasetDict({
        mnli_m: Dataset({
            features: ['premise', 'hypothesis', 'label'],
            num_rows: 9815
        }

#### Upload data to HF hub

In [None]:
# a nested dictionary does not seem to upload in one go, so the train and test
# dictionaries are pushed one after the other, each to its own public hub repository
hub_repos = {
    "train": "MoritzLaurer/mnli_anli_fevernli_wanli_lingnli_xnli_train",
    "test": "MoritzLaurer/mnli_anli_fevernli_wanli_lingnli_xnli_test",
}
for split_name, repo_id in hub_repos.items():
    dataset_nli[split_name].push_to_hub(repo_id, private=False, token=config.HF_ACCESS_TOKEN)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]